Example #1
    def watch_twitter(self, account: dict) -> None:
        current_data = Profile(account["username"]).to_dict()
        del current_data["username"]

        # strip the '_count' suffix, e.g. 'likes_count' -> 'likes'
        for key, value in current_data.copy().items():
            if key.endswith("_count"):
                current_data[key.split("_")[0]] = value
                del current_data[key]

        last_data = (models.Twitter.select().where(
            models.Twitter.username == account["username"]).order_by(
                -models.Twitter.timestamp).limit(1))

        if last_data:
            _changed_datas = self.__compare_datas(
                current_data, last_data[0], account.get("ignored_metrics", []))

            if _changed_datas:
                table = self.__create_table("twitter", account["username"],
                                            _changed_datas)
                self.send_message(f"```\n {table} \n```")

            elif not account.get("only_if_changed", True):
                self.send_message(
                    f"Nothing changed for twitter/**{account['username']}**")

        with self.lock:
            models.Twitter.create(username=account["username"],
                                  data=current_data)

        time.sleep(account.get("interval", self.default_interval))
        # Schedule the next check. Note: this tail call recurses indefinitely
        # and will eventually hit Python's recursion limit; a loop would be safer.
        self.watch_twitter(account)
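A minimal sketch of the account dict this watcher appears to consume; the key names are inferred from the account[...] and account.get(...) calls above, and the values here are invented:

# Hypothetical watcher config; keys inferred from the method above.
account = {
    "username": "jack",             # Twitter handle to watch (required)
    "ignored_metrics": ["likes"],   # metrics to skip when diffing
    "only_if_changed": False,       # False: also report "nothing changed"
    "interval": 600,                # seconds between checks
}
# watcher.watch_twitter(account)   # `watcher` is an instance of the class above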
Example #2
def crawler_item(browser, user_name: str, media_id: int, media_name: str, mt,
                 xpath):
    """ Scrape profile information for a single account

    :param browser: <WebDriver> Selenium browser used to fix up the counts
    :param user_name: <str> account name
    :param media_id: <int> media ID
    :param media_name: <str> media name
    :param mt: table handle passed through to mysql.insert_pure
    :param xpath: <dict> primary and fallback XPath expressions for the counts
    :return: <None> the result is written to the database
    """
    # Scrape the account info with the twitter-scraper package
    # (the followers/following counts may be wrong)
    try:
        profile = Profile(user_name)
    except Exception:
        print("Account does not exist!")
        return
    writing_item = profile.to_dict()
    writing_item["media_id"] = media_id
    writing_item["media_name"] = media_name

    # Scrape the account's followers and following counts (Selenium crawler)
    browser.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    following_count = None
    followers_count = None
    try:
        following_count = browser.find_element_by_xpath(
            xpath["following_count"][0]).get_attribute("title")
        followers_count = browser.find_element_by_xpath(
            xpath["followers_count"][0]).get_attribute("title")
    except Exception:
        try:
            following_count = browser.find_element_by_xpath(
                xpath["following_count"][1]).get_attribute("title")
            followers_count = browser.find_element_by_xpath(
                xpath["followers_count"][1]).get_attribute("title")
        except Exception:
            print("Selenium failed to scrape the followers/following counts!")

    # Correct the scraped values using the Selenium results
    if following_count is not None:
        following_count = following_count.replace(",", "")
        print("修正正在关注数量:", writing_item["following_count"], "→",
              following_count)
        writing_item["following_count"] = following_count
    if followers_count is not None:
        followers_count = followers_count.replace(",", "")
        print("修正关注者数量:", writing_item["followers_count"], "→",
              followers_count)
        writing_item["followers_count"] = followers_count

    # Write the data to the database
    writing_list = [writing_item]
    write_num = mysql.insert_pure(mt, writing_list)
    print("Records stored:", write_num)
    print(writing_list)
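The xpath argument's shape can be read off the [0]/[1] indexing above: each key maps to a primary XPath and a fallback for an alternate page layout. A hypothetical example (the concrete expressions are placeholders, not Twitter's real DOM paths):

xpath = {
    "following_count": [
        "//a[contains(@href, '/following')]/span",   # primary attempt
        "//a[contains(@href, '/following')]//span",  # fallback layout
    ],
    "followers_count": [
        "//a[contains(@href, '/followers')]/span",
        "//a[contains(@href, '/followers')]//span",
    ],
}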
Example #3
def get_user_data(username: str) -> dict:
    """Gets user data.

    Args:
        username (str): username of the Twitter account.

    Returns:
        dict: dictionary containing user info.
    """
    profile = Profile(username=username)
    return profile.to_dict()
Example #4
def twitter_scraper(username):
    try:
        profile = Profile(username)
        data = profile.to_dict()
        data['pfp_url'] = data.pop('profile_photo')
        data['website'] = f"https://{data['website']}" if data['website'] else ''
        data['url'] = f'https://twitter.com/{username}'
        data.pop('username')
        return data
    except Exception:
        return "User not found"
Example #5
def crawler(driver, user_name: str, template):
    """
    抓取Twitter用户信息
    填写数据模板中的name、username、birthday、biography、website、profile_photo、likes_count、tweets_count、followers_count、following_count属性

    :param driver: <selenium.webdriver.chrome.webdriver.WebDriver> Chrome浏览器对象
    :param user_name: <str> Twitter用户名
    :param template: <dict> 返回值数据模板
    :return: <dict> 填写抓取数据的数据模板
    """
    # 使用twitter-scraper包抓取账户信息(关注数+正在关注数可能错误)
    try:
        profile = Profile(user_name).to_dict()
    except:
        print("账号不存在!")
        return

    print(profile)

    for key, value in profile.items():
        template[key] = value

    # Scrape the account's followers and following counts (Selenium crawler)
    driver.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    try:
        following_count = tool.fetch.number(
            driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[0]).text)
        followers_count = tool.fetch.number(
            driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[0]).text)
    except Exception:
        try:
            following_count = tool.fetch.number(
                driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[1]).text)
            followers_count = tool.fetch.number(
                driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[1]).text)
        except Exception:
            print("Selenium failed to scrape the followers/following counts!")
            return template

    # Correct the scraped values using the Selenium results
    if abs(template["following_count"] - following_count) > 1000:
        print("Corrected following count:", template["following_count"], "→",
              following_count)
        template["following_count"] = following_count
    if abs(template["followers_count"] - followers_count) > 1000:
        print("Corrected followers count:", template["followers_count"], "→",
              followers_count)
        template["followers_count"] = followers_count

    return template
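A template that matches the fields listed in the docstring might look like this (all values are placeholders; crawler overwrites them):

template = {
    "name": None, "username": None, "birthday": None, "biography": None,
    "website": None, "profile_photo": None, "likes_count": 0,
    "tweets_count": 0, "followers_count": 0, "following_count": 0,
}
# result = crawler(driver, "jack", template)  # driver: a Chrome WebDriver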
Example #6
def searcher(hashes, pgcount):
    # Each page yields 20 tweets
    global df
    rows = []
    for tweet in get_tweets(hashes, pages=pgcount):
        dic = {}
        dic['Link'] = "twitter.com/anyuser/status/" + tweet['tweetId']
        dic['Retweet'] = tweet['isRetweet']
        dic['Text'] = tweet['text']
        dic['Time'] = str(tweet['time'])
        dic['Replies'] = tweet['replies']
        dic['Retweets'] = tweet['retweets']
        dic['Likes'] = tweet['likes']
        dic['Hashtags'] = ''.join(tweet['entries']['hashtags'])
        dic['Photos'] = ''.join(tweet['entries']['photos'])
        dic['Urls'] = ''.join(tweet['entries']['urls'])
        dic['Videos'] = str(tweet['entries']['videos'])
        page = urlopen("https://twitter.com/anyuser/status/" +
                       tweet['tweetId'])
        uname = page.geturl()[20:].split("/", 1)[0]
        dic['Username'] = uname
        profile = Profile(uname)
        dic['P_Location'] = profile.location
        dic['P_Name'] = profile.name
        dic['P_Followers'] = profile.followers_count
        dic['P_Photo'] = profile.profile_photo

        rows.append(dic)

    # DataFrame.append was removed in pandas 2.0; concatenate the rows instead
    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
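Because searcher mutates a module-level DataFrame, the caller is expected to create it first. A minimal, assumed setup (the hashtag and the printed columns are arbitrary choices; column names come from the dict keys above):

import pandas as pd

df = pd.DataFrame()      # module-level frame the function appends to
searcher('#python', 2)   # two pages, roughly 40 tweets
print(df[['Username', 'Likes', 'Retweets']].head())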
Example #7
def getTopTweets(username, startDate, endDate, noTopTweets=5):
    # Scrape tweets
    tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(
        startDate).setUntil(endDate)
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)

    df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
    originalDF = df

    df = df.drop(['author_id', 'date', 'geo', 'username'], axis=1)

    # Get keyword count
    df['keywordCount'] = df.apply(lambda x: getkeywordCount(x["text"]), axis=1)

    # Normalize counts
    for feature in ["retweets", "favorites", "replies", 'keywordCount']:
        df[feature + "NormalisedCount"] = df[feature] / (
            df[feature].max())  #-df[feature].min())

    # Find out if the tweet is a retweet of some other person's tweet.
    # We do this because retweeted tweets get a far greater reach and thus skew the scores.
    df["ifTo"] = df.apply(lambda x: 0.6 if x["to"] else 1, axis=1)

    # Get the time elapsed since every tweet.
    # We hypothesise that older tweets get more exposure.
    df["formatted_date"] = df.apply(
        lambda x: x["formatted_date"][:-10] + x["formatted_date"][-4:], axis=1)
    df["timeElapsedHours"] = df.apply(
        lambda x: (pd.Timestamp(endDate) - pd.to_datetime(x["formatted_date"])
                   ).total_seconds() // 3600,
        axis=1)
    df["timeElapsedScore"] = 1 + 0.04 * (7 * 12 -
                                         df["timeElapsedHours"]) / 7 / 24

    # Assign a score to each tweet from the normalised counts of retweets, favorites, replies and the keyword count
    df["score"] = (1.5 * df["retweetsNormalisedCount"] +
                   1.2 * df["repliesNormalisedCount"] +
                   1 * df["favoritesNormalisedCount"] + 2 *
                   df["keywordCount"]) * df["ifTo"] * df["timeElapsedScore"]

    df = df.drop([
        'id', 'keywordCount', 'retweetsNormalisedCount',
        'favoritesNormalisedCount', 'repliesNormalisedCount',
        'keywordCountNormalisedCount', 'ifTo', 'timeElapsedHours',
        'timeElapsedScore'
    ],
                 axis=1)

    sortedDF = df.sort_values(by=['score'], inplace=False, ascending=False)

    # Get profile user name and image link
    profile = Profile(username).to_dict()

    profileInformation = {}
    profileInformation["username"] = profile["username"]
    profileInformation["name"] = profile["name"]
    profileInformation["profile_photo"] = profile["profile_photo"]

    return sortedDF.head(noTopTweets), profileInformation
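As a rough worked illustration of the scoring line above (weights copied from the function; the counts are invented — note the formula multiplies the raw keywordCount, not its normalised variant):

# Invented normalised counts for one tweet.
retweets_n, replies_n, favorites_n = 0.8, 0.5, 1.0
keyword_count = 2        # raw count, as in the formula above
if_to, time_score = 1, 1.02

score = (1.5 * retweets_n + 1.2 * replies_n + 1 * favorites_n
         + 2 * keyword_count) * if_to * time_score
print(round(score, 2))   # 6.94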
Example #8
    def refresh(
        self, info, username: str = None, user_id: strawberry.ID = None,
    ) -> User:
        if (not username and not user_id) or (username and user_id):
            raise ValueError("You must specify either a username or user id!")

        kwargs = {}
        if username:
            kwargs["username"] = username
        elif user_id:
            kwargs["user_id"] = user_id

        user = models.User.objects(**kwargs).first()

        if not user:
            if not username:
                # Nothing stored for this user_id, and Profile needs a
                # username to scrape, so we cannot proceed.
                raise Exception("User does not exist!")
            try:
                twitter_user = Profile(username)
            except IndexError:
                raise Exception("User does not exist!")
            else:
                user = models.User(
                    username=username,
                    user_id=twitter_user.user_id,
                    bio=twitter_user.biography,
                    profile_photo=twitter_user.profile_photo,
                )
        else:
            # Use the stored username so lookups by user_id also work.
            twitter_user = Profile(user.username)

        if user.threads:
            queue_type = "low"
        else:
            queue_type = "high"

        q = Queue(queue_type, connection=conn)

        user.status = "Pending"
        user.bio = twitter_user.biography
        user.profile_photo = twitter_user.profile_photo
        user.save()

        q.enqueue(jobs.refresh_user_threads, user.username, job_timeout="3h")

        return user
Example #9
def get_potential_followers(tag, pages=2):
    pfollowers = []
    for tweets in get_tweets(tag, pages):
        user = Profile(tweets['username'])
        try:
            # following >= followers suggests the account may follow back
            if user.following_count >= user.followers_count:
                print(tweets['username'] + " followers_count: " +
                      str(user.followers_count) + " following_count: " +
                      str(user.following_count))
                pfollowers.append(tweets['username'])
        except Exception:
            pass
    return pfollowers
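Usage is a single call (the hashtag is chosen arbitrarily); failed profile lookups are silently skipped by the except above:

candidates = get_potential_followers('#python', pages=2)
print(candidates[:10])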
Example #10
def fetch(username):
    values = (('Name', 'name', 0),
              ('Website', 'website', 0),
              ('Birthday', 'birthday', 0),
              ('Username', 'username', 0),
              ('Biography', 'biography', 0),
              ('Private', 'is_private', 0),
              ('Profile Photo', 'profile_photo', 0),
              ('Banner Photo', 'banner_photo', 0),
              ('Location', 'location', 0),
              ('Likes', 'likes_count', 1),
              ('Tweets', 'tweets_count', 1),
              ('Following', 'following_count', 1),
              ('Followers', 'followers_count', 1))

    profile = Profile(username)
    db = json_rw(1)

    str_list = ['birthday', 'is_private', 'website']

    if not db:
        for _, name, __ in values:
            value = getattr(profile, name)
            if name in str_list:
                value = str(value)

            db[name] = value
    else:
        msg = ''
        for template, name, is_diff in values:
            value = getattr(profile, name)

            if name in str_list:
                value = str(value)

            if value == db[name]:
                continue

            if is_diff:
                diff = str(value - db[name])

                if '-' not in diff:
                    diff = f'+{diff}'

                diff = f'{value} ({diff})'

            db[name] = value

            msg += f'`{template}:` {diff}\n' if is_diff else f'`{template}:` {value}\n'

        if msg:
            telegram(msg)

    json_rw(0, db)
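json_rw is the author's helper and its implementation isn't shown; a minimal sketch consistent with the calls above (json_rw(1) reads the stored state, json_rw(0, db) writes it back; the file name is an assumption):

import json
import os

def json_rw(read, data=None, path="twitter_state.json"):  # assumed helper
    if read:
        if not os.path.exists(path):
            return {}                 # first run: empty state
        with open(path) as f:
            return json.load(f)
    with open(path, "w") as f:
        json.dump(data, f)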
Example #11
def handle_search(username_):
    # Check whether the username is available
    a = ""
    background = ""
    padding = 50
    try:
        Profile(str(username_))  # succeeds only when the account exists
        a = "Not Available"
        background = "red"
    except Exception:
        a = "Available"
        background = "green"
        padding = 65

    display = Label(root, text=a, bg=background, padx=padding)
    display.grid(row=4, column=1)
Example #12
def add_user_twitter_scraper(username):
    """Add a user and their tweets to database."""
    try:
        # Get user profile
        user_profile = Profile(username)

        # Add to User table (or check if existing)
        db_user = (User.query.get(user_profile.user_id)
                   or User(id=user_profile.user_id, username=username))
        db.session.add(db_user)

        # Get tweets ignoring re-tweets and replies
        tweets = list(get_tweets(username, pages=25))
        original_tweets = [
            tweet for tweet in tweets if tweet['username'] == username
        ]

        # # Add newest_tweet_id to the User table
        # if tweets:
        #     db_user.newest_tweet_id = tweets[0].id

        # Loop over the original tweets, get embedding and add to Tweet table
        for tweet in original_tweets:

            # Get an example Basilica embedding for the tweet
            embedding = BASILICA.embed_sentence(tweet['text'], model='twitter')

            # Add tweet info to Tweet table
            db_tweet = Tweet(id=tweet['tweetId'],
                             text=tweet['text'][:300],
                             embedding=embedding)
            db_user.tweet.append(db_tweet)
            db.session.add(db_tweet)

    except Exception as e:
        print('Error processing {}: {}'.format(username, e))
        raise e

    else:
        db.session.commit()
Example #13
import sys
from twitter_scraper import Profile, get_tweets
from twitter_functions import process_tweets

try:
    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
    profile_name = args[0]
    tweets = {}

    if "--profilename" or "-p" in opts:
        profile = Profile(profile_name)
        profile.to_dict()

        for tweet in get_tweets(profile_name, pages=1):
            if tweet['entries']['photos']:
                sep = ' pic'
                tweet_text = tweet['text'].split(sep, 1)[0]
                tweets.update([(tweet_text, tweet['entries']['photos'][0])])
        process_tweets(tweets)
except Exception:
    raise SystemExit(f"Usage: {sys.argv[0]} -p twitter_handle")
Example #14
def scrap_bio(username):
    profile = Profile(username)
    return profile.to_dict()['biography']
Example #15
def profile():
    user = request.args.get('user', default='crypto_rand', type=str)
    return json.dumps(Profile(user).to_dict(), default=default)
Example #16
def scrape_user_id(username: str) -> Optional[int]:
    try:
        return int(Profile(username).user_id)
    except IndexError:  # TODO: Rewrite Profile for better error handling.
        return None
Example #17
from twitter_scraper import get_tweets, Profile
import twint
import sys
import io

if __name__ == '__main__':
    #for tweets in get_tweets('basadaplanta', pages=1):
    #    print(tweets['text'])
    #user = Profile('basadaplanta')
    #print(user.to_dict())


    #user = twint.Config()
    #user.Username = "******"

    #twint.run.Following(user)
    for tweets in get_tweets('#vegano'):
        user = Profile(tweets['username'])
        try:
            if user.following_count >= user.followers_count:
                print(tweets['username'] + " followers_count: " +
                      str(user.followers_count) + " following_count: " +
                      str(user.following_count))
        except Exception:
            pass
Example #18
def twitterToRSS(user):
    items = []  # Create place for items to go.
    profile = Profile(user)  # Get profile info for feed.

    for tweet in get_tweets(user, pages=1):
        # Title creation
        if len(tweet['text']) < 50:
            title = tweet['text']
        else:
            title = f"{tweet['text'][:50]}..."

        itemExtensions = []  # Create place for extensions to go

        if tweet['entries']['photos']:
            itemExtensions.append(
                MediaItem(
                    tweet['entries']['photos'][0],
                    'image/png',
                    'image',
                    True,
                ))
            description = f"<img class='webfeedsFeaturedVisual' src='{tweet['entries']['photos'][0]}'><p>{tweet['text']}</p>"
        else:
            itemExtensions.append(
                MediaItem(
                    profile.profile_photo,
                    'image/png',
                    'image',
                    True,
                ))
            description = tweet['text']

        if tweet['isRetweet']:
            author = f"@{tweet['username']}"
            title = f"(Retweet) {title}"
        else:
            author = f'{profile.name} (@{profile.username})'
        # An article for every tweet
        items.append(
            Item(title=title,
                 link=f"https://twitter.com{tweet['tweetUrl']}",
                 description=description,
                 author=author,
                 guid=Guid(f"https://twitter.com{tweet['tweetUrl']}"),
                 pubDate=tweet['time'],
                 extensions=itemExtensions))

    # Create feed
    feed = Feed(
        title=f"Tweets from {profile.name} (@{profile.username}) - Twitter RSS by Kyle Williams",
        link=f"https://twitter.com/{profile.username}",
        description=f"{profile.biography} [Tweets Gathered by Twitter Scraper [twitter-scraper], Feed Generated by rfeed, Hosted By Repl.it]",
        items=items,
        image=Image(profile.profile_photo,
                    f"Profile picture of {profile.name}",
                    f"https://twitter.com/{profile.username}"),
        extensions=[
            MediaContent(),
            Webfeeds(),
            WebfeedsIcon(profile.profile_photo),
            WebfeedsCover(get_banner(profile.username))
        ])

    # Tell Flask that it's getting XML
    response = make_response(feed.rss())
    response.headers.set('Content-Type', 'application/rss+xml')

    return response  # Return RSS feed
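Since twitterToRSS builds a Flask response via make_response, it presumably sits behind a route; hypothetical wiring (the app name, route path and port are assumptions, not part of the original):

from flask import Flask

app = Flask(__name__)

@app.route('/rss/<user>')
def rss(user):
    # Delegate to the feed builder above.
    return twitterToRSS(user)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)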