コード例 #1
0
def get_engagement(path_to_fav, path_to_timeline):
    """Collect the de-duplicated ids of tweets counted as engagement.

    Three sources are merged:

    * the ``tweet`` column of the favorites loaded from ``path_to_fav``;
    * the ``id`` of timeline rows whose ``quoted_status`` equals ``"N/A"``;
    * the ``id`` of timeline rows whose ``retweeted_status`` equals ``"N/A"``.

    Args:
        path_to_fav: Forwarded to ``load_favorites``.
        path_to_timeline: Forwarded to ``load_tweets`` (with ``days=0``).

    Returns:
        list: Unique ids; the order is whatever the intermediate set yields.
    """
    liked_ids = load_favorites(path_to_fav)["tweet"].tolist()
    timeline = load_tweets(path_to_timeline, days=0)

    # NOTE(review): rows are selected where the status column *equals* "N/A".
    # Confirm this polarity is intended -- it reads as "rows without a
    # quoted/retweeted status", which is surprising for an engagement list.
    quoted_mask = timeline["quoted_status"] == "N/A"
    quoted_ids = timeline.loc[quoted_mask, "id"].tolist()

    retweeted_mask = timeline["retweeted_status"] == "N/A"
    retweeted_ids = timeline.loc[retweeted_mask, "id"].tolist()

    # Same insertion order as the original list concatenation.
    return list({*quoted_ids, *retweeted_ids, *liked_ids})
コード例 #2
0
def create_dataset_df(
    owner_id, auth, path_to_db, path_to_fav, path_to_timeline, muted_path
):
    """Build a labelled DataFrame of English tweets.

    Tweets are loaded from ``path_to_db``, restricted to ``lang == "en"`` and
    then to ids that appear in either the negative list (from
    ``create_neg_list_idx``) or the positive list (from ``get_engagement``).
    A ``labels`` column is set to ``0`` for negative ids and ``1`` for
    positive ids; an id present in both lists ends up labelled ``1`` because
    the positive assignment runs last.

    Args:
        owner_id: Forwarded to ``create_neg_list_idx``.
        auth: Forwarded to ``create_neg_list_idx`` as its ``auth_path``.
        path_to_db: Tweet database passed to ``load_tweets`` and
            ``create_neg_list_idx``.
        path_to_fav: Forwarded to ``get_engagement``.
        path_to_timeline: Forwarded to ``get_engagement``.
        muted_path: Forwarded to ``create_neg_list_idx``.

    Returns:
        The filtered tweets with an added ``labels`` column. The result is an
        independent copy, not a view on the loaded tweets.
    """
    # TODO add logging
    df_tweets = load_tweets(db_path=path_to_db, days=0)
    df_tweets = df_tweets[df_tweets["lang"] == "en"]
    neg_list_idx = create_neg_list_idx(path_to_db, owner_id, auth, muted_path)
    pos_list_idx = get_engagement(path_to_fav, path_to_timeline)

    selected = df_tweets["id"].isin(neg_list_idx + pos_list_idx)
    # Copy explicitly: assigning through .loc on a boolean-mask slice raises
    # pandas' SettingWithCopyWarning and leaves it ambiguous which object is
    # being modified.
    dataset_df = df_tweets[selected].copy()

    dataset_df.loc[dataset_df["id"].isin(neg_list_idx), "labels"] = 0
    # Positive labels are written last so they win over negative ones.
    dataset_df.loc[dataset_df["id"].isin(pos_list_idx), "labels"] = 1
    return dataset_df
コード例 #3
0
def create_neg_list_idx(path_to_db, owner_id, auth_path, muted_path):
    """Gather the de-duplicated ids of tweets used as negative examples.

    The result is the union of what four helpers return:
    ``idx_contain_muted_words``, ``from_muted_users_idx``, ``with_news_idx``
    and ``get_not_rel_idx``. Their results are joined with ``+``, so each is
    expected to return a list.

    Args:
        path_to_db: Tweet database passed to ``load_tweets`` (``days=0``).
        owner_id: Forwarded to ``get_muted_acc`` and ``get_not_rel_idx``.
        auth_path: Forwarded to ``get_muted_acc`` and ``get_not_rel_idx``.
        muted_path: Forwarded to ``idx_contain_muted_words`` and
            ``with_news_idx``.

    Returns:
        list: Unique ids; the order is whatever the intermediate set yields.
    """
    tweets = load_tweets(path_to_db, days=0)
    muted_accounts = get_muted_acc(
        owner_id, auth_path, muted_lists=["nytblock", "muted"]
    )

    # Helpers are invoked in the same order as before.
    muted_word_ids = idx_contain_muted_words(tweets, muted_path)
    muted_user_ids = from_muted_users_idx(tweets, muted_accounts)
    news_ids = with_news_idx(tweets, muted_path)
    not_relevant_ids = get_not_rel_idx(owner_id, auth_path)

    combined = muted_word_ids + muted_user_ids + news_ids + not_relevant_ids
    return list(set(combined))
コード例 #4
0
def test_load_tweets():
    """Check the frame shape ``utils.load_tweets`` returns for each ``latest`` mode."""
    # (latest flag, expected DataFrame shape) against the bundled test database.
    expected_shapes = (
        (True, (0, 10)),
        (False, (18, 10)),
    )
    for latest_flag, shape in expected_shapes:
        loaded = utils.load_tweets(
            "data/test_tweets.db", days=0, latest=latest_flag
        )
        assert loaded.shape == shape
コード例 #5
0
def test_df():
    """Return the tweets loaded from the bundled test database.

    NOTE(review): presumably meant to be a pytest fixture; no decorator is
    visible in this excerpt -- confirm.
    """
    return utils.load_tweets("data/test_tweets.db", days=0, latest=False)
コード例 #6
0
def to_collection(
    auth: str = "config/auth.json",
    owner_id: str = "143058191",
    age: int = typer.Option(21, "--age", "-a"),
    reverse_age: bool = typer.Option(False, "--reverse", "-r"),
    nr_tweets: int = typer.Option(30, "--tweets", "-t"),
    ignore_lists: bool = typer.Option(False, "--ignore_lists", "-il"),
    users_from_list: str = typer.Option(None, "--users_from_list", "-fl"),
    friends: bool = typer.Option(False, "--friends_only", "-fo"),
    notfriends: bool = typer.Option(False, "--not_friends_only", "-nfo"),
    # The long flag used to be a copy-pasted "--ignore_lists", which collided
    # with the `ignore_lists` option above. The short flag is unchanged.
    remove_liked: bool = typer.Option(False, "--remove_liked", "-rl"),
    # dont_rem_news: bool = typer.Option(False, "--dont_remove_news", "-n"),
    # TODO above
    min_likes: int = typer.Option(0, "--min_likes", "-l"),
):
    """Grabs tweets from database, applies filters and transformations,
    and uploads them to collection.

    Args:
        auth (str): Path to your twitter credential. Defaults to "config/auth.json".
        owner_id (str): Owner of list and collections. Defaults to "143058191".
        age (int, optional): How old (in days) should be the most recent tweet. Defaults to 21.
        reverse_age (bool, optional): If chosen, no tweets older than age (in days) will be shown.
        nr_tweets (int): How many tweets upload to collection.
        ignore_lists (bool, optional): If `True` it prevents from using
            Twitter API and list functionality.
            This functionality causes most "Too Many Requests" errors.
        users_from_list (str, optional): If given, keep only tweets by
            members of the Twitter list with this name.
        friends (bool, optional): If `True`, tweets by friends (following) will be added.
        notfriends (bool, optional): If `True`, tweets by
             non-friends (who user does not follow) will be added.
        remove_liked: If `True`, remove tweets from tweetfeed, that user already liked.
        min_likes (int, optional): Forwarded to `prep_batch` as `likes`.
            Defaults to 0.
    """
    # TODO
    # use str from above:
    # check if valid
    # if not, get_collection_id

    custom_newsfeed = get_collection_id(
        owner_id=owner_id,
        auth_path=auth,
        collection_name="custom_newsfeed",
    )

    # Clean up the collection; repeat until it reports zero tweets.
    while count_collection(custom_newsfeed, auth) > 0:
        typer.echo("removing old tweets from collection ...")
        rem_from_collection(custom_newsfeed, auth)

    # Load the JSON-encoded filter lists.
    with open("data/mute_list.txt", "r") as f:
        mute_list = json.load(f)
    with open("data/mute_list_cs.txt", "r") as f:
        mute_list_cs = json.load(f)
    with open("data/news_domains.txt", "r") as f:
        news_domains = json.load(f)

    if not ignore_lists:
        mutedacc_rich = get_users_from_list(owner_id, auth, list_name="muted")
        nytblock = get_users_from_list(owner_id, auth, list_name="nytblock")
        # TODO idea - scrape https://www.politwoops.com/ for politician accounts
        # drop accounts that follow more than 15k people
        mutedacc_rich = nytblock + mutedacc_rich
        # Persist the merged muted-account records to disk.
        with open("data/mutedacc_rich.txt", "w") as write_file:
            json.dump(mutedacc_rich, write_file)

    # Load tweets.
    if reverse_age:
        df = load_tweets("data/home.db", days=age, latest=True)
    else:
        df = load_tweets("data/home.db", days=age)
    if not ignore_lists:
        mutedacc = [user["id"] for user in mutedacc_rich]
        df = filter_users(df, mutedacc)

    if friends or notfriends:
        # Fetch the friend ids once, even when both flags are set.
        friends_idx = get_friends_ids(auth)
        if friends:
            df = filter_users(df, friends_idx, remove=False)
        if notfriends:
            df = filter_users(df, friends_idx, remove=True)
    if users_from_list:
        list_acc = get_users_from_list(
            owner_id, auth, list_name=users_from_list
        )
        list_acc = [acc["id"] for acc in list_acc]
        df = filter_users(df, list_acc, remove=False)
    if remove_liked:
        favorite_df = load_favorites("data/faves.db")
        favorite_idx = favorite_df["tweet"].tolist()
        df = df[~df["id"].isin(favorite_idx)]

    # remove news and RT
    tweets_df = prep_batch(
        df=df,
        news_domains=news_domains,
        mute_list=mute_list,
        mute_list_cs=mute_list_cs,
        data_path="data",
        remove_news=True,
        likes=min_likes,
    )

    tweet_list = tweets_df["id"].tolist()[:nr_tweets]
    df = add_tweets_to_collection(
        custom_newsfeed, tweet_list, auth
    )  # adds to collection

    # Back up the previous seen-log before appending to it.
    try:
        seen_tweets_old = pd.read_csv("data/seen.csv")
    except FileNotFoundError:
        seen_tweets_old = pd.DataFrame(columns=["tweet_id", "err_reason"])
    seen_tweets_old.to_csv("data/seen_old.csv", index=False)

    # Update seen.csv file.
    # NOTE(review): this appends without a header, so a freshly created
    # seen.csv has no header row and the next `pd.read_csv` will consume its
    # first data row as column names -- confirm whether that is acceptable.
    df.to_csv("data/seen.csv", mode="a", header=False, index=False)

    not_relevant_list = get_tweets_from_collection(
        get_collection_id(owner_id, auth, "not_relevant"), auth
    )
    if len(not_relevant_list) > 180:
        # TODO if criteria matched
        # dump this to txt with date, and ZEROs collection
        # some other "model" function later can sum-up from
        # file and collection
        typer.echo("collection 'not_relevant' will hit max limit soon!")
        with open(
            f"{datetime.now():%Y_%m_%d_%H%M}_not_relevant_list.txt", "w"
        ) as f:
            f.write(json.dumps(not_relevant_list))