Example #1
def twintSearchRetweet(tweet_text):
    twint.output.tweets_list = []
    c = twint.Config()
    c.Store_object = True
    c.Hide_output = True
    # native (official) retweets only
    c.Native_retweets = True
    c.Limit = 4000
    c.Search = tweet_text
    twint.run.Search(c)
    tweets = twint.output.tweets_list
    for tweet in tweets:
        # get user information for the retweeter
        twint.output.users_list = []
        c = twint.Config()
        c.Store_object = True
        c.Hide_output = True
        c.Username = tweet.user_rt
        twint.run.Lookup(c)
        user = None
        icon_url = None
        bio = None
        following = None
        followers = None
        display_name = None
        if len(twint.output.users_list) > 0:
            user = twint.output.users_list[0]
            icon_url = user.avatar
            bio = user.bio
            following = user.following
            followers = user.followers
            display_name = user.name
        params = {
            'tweet_id_str': tweet.id_str,
            'user_id_str': tweet.user_id_str,
            'screen_name': tweet.username,
            'display_name': tweet.name,
            'tweeted_at': datetime.datetime.fromtimestamp(tweet.datetime / 1000.0),
            'rt_id_str': tweet.retweet_id,
            'rt_user_id_str': tweet.user_rt_id,
            'rt_icon_url': icon_url,
            'rt_user_bio': bio,
            'rt_following': following,
            'rt_followers': followers,
            'rt_screen_name': tweet.user_rt,
            'rt_display_name': display_name,
            'retweeted_at': datetime.datetime.fromtimestamp(
                ((int(tweet.retweet_id) >> 22) + 1288834974657) / 1000.0
            ).strftime("%Y-%m-%d %H:%M:%S"),
        }
        setOrUpdateRetweets(params)
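
# Note on the 'retweeted_at' value above: a tweet or retweet ID is a Twitter
# "snowflake": bits 22 and up encode milliseconds since Twitter's custom epoch
# (1288834974657 ms, i.e. 2010-11-04 01:42:54 UTC). A minimal standalone sketch
# of the same conversion (not part of the original example):
def snowflake_to_datetime(snowflake_id):
    """Return the UTC creation time encoded in a Twitter snowflake ID."""
    ms_since_twitter_epoch = (int(snowflake_id) >> 22) + 1288834974657
    return datetime.datetime.utcfromtimestamp(ms_since_twitter_epoch / 1000.0)
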
def get_replies(tweet_id=1169681044573962240,option="twint",interval=None,sources=True,testmode=False):
    if option=="twint":
        #get tweet date and time
        start_status=api.get_status(tweet_id)
        start_time=start_status.created_at
        author=start_status.user.screen_name
    
        profile=[]
        
        b = twint.Config()
        b.Username = author
        b.Since=str(start_time.date())
        b.Store_object=True
        b.Store_object_tweets_list = profile
        b.Hide_output=True
        twint.run.Profile(b)
        
        con_id = None
        for entry in profile:
            if entry.id == tweet_id:
                print("found")
                print(vars(entry))
                con_id = entry.conversation_id
        
        if interval is None:
            # default to a one-week window for collecting replies
            T = datetime.timedelta(days=7)
        else:
            T = interval

        end_time = start_time + T
        
        #set up a search
        repl_all=[]
        query_replies="to:@"+author
        c = twint.Config()
        #c.Search = query_replies
        c.Replies=True
        c.To=author
        c.Since=str(start_time.date())
        c.Until=str(end_time.date())
        c.Store_object=True
        c.Store_object_tweets_list = repl_all
        if testmode:
            c.Limit=1
        c.User_full=True
        c.Hide_output=True

        twint.run.Search(c)
        
        A_trial=api.get_status(1170528218702471168)
        
        replies=[]
        for status in repl_all:
            if status.conversation_id==con_id:
                replies.append(status)

                
        status_list=replies
        if sources:
            status_list_s=sources_find(status_list,c,con_id)
        else:
            status_list_s=status_list
        
    return(status_list_s,c)            

def get_linktree_link(tweet):
    
    tweets = tweet.split(" ")
    for word in tweets:
        if word.startswith('https://linktr.ee/'):
            return word
    return ""


if __name__ == '__main__':
	
	until, since = get_time()

	client = twint.Config()
	client.Search = 'linktr.ee'
	client.Pandas = True
	client.Since = since
	client.Until = until
	client.Hide_output = True
	
	objects = twint.run.Search(client)
	Tweets_df = twint.storage.panda.Tweets_df

	Tweets_df = Tweets_df.drop(['cashtags','search','translate', 'trans_src', 'trans_dest'], axis = 1)
	Tweets_df['linktree_link'] = Tweets_df.apply(lambda x: get_linktree_link(x['tweet']), result_type = 'expand', axis = 1)
	Tweets_df = Tweets_df[Tweets_df['linktree_link'] != ""]
	Tweets_df = Tweets_df.drop_duplicates(subset = 'linktree_link')

	filename = get_file_name(until)
Example #4
def download_account_tweets(username=None,
                            limit=None,
                            include_replies=False,
                            include_links=False,
                            strip_usertags=False,
                            strip_hashtags=False,
                            sentiment=0,
                            text_format="simple",
                            api=None,
                            w=None):
    """
    Download public Tweets from one Twitter account into a format suitable 
    for training with AI text generation tools.
    :param username: Twitter @ username to gather tweets or .txt file name
        with multiple usernames
    :param limit: # of tweets to gather; None for all tweets.
    :param include_replies: Whether to include replies to other tweets.
    :param include_links: Whether to include tweets with links.
    :param strip_usertags: Whether to remove user tags from the tweets.
    :param strip_hashtags: Whether to remove hashtags from the tweets.
    :param sentiment: Number of sentiment categories to include in text.
    :param text_format: Type of output format for the tweet.
    :param api: Open Twitter API reference
    :param w: Open file reference to write output
    """

    print("Retrieving tweets for @{}...".format(username))

    # Validate that it is a multiple of 40; set total number of tweets
    if limit:
        assert limit % 40 == 0, "`limit` must be a multiple of 40."

        pbar = tqdm(range(limit), desc="Oldest Tweet")

    # If no limit is specified, don't set a total for the progress bar
    else:
        pbar = tqdm()

    # Create an empty file to store pagination id
    with open(".temp", "w", encoding="utf-8") as f:
        f.write(str(-1))
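    # twint's Resume option (set on each Config below) points at a file holding
    # the last pagination/scroll id, so every 40-tweet Search call in the loop
    # picks up where the previous one stopped; the -1 written here is just a
    # placeholder seed used by this script.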

    # Set the loop's iterator
    i = 0
    # Iterate forever, and break based on reaching limit or no more tweets
    while (True):

        # If a limit is specified, break once it's reached
        if limit:
            if i >= (limit // 40): break

        # Create an empty list to store retrieved tweet objects
        tweet_data = []

        # twint may fail; give it up to 5 tries to return tweets
        for _ in range(5):
            if len(tweet_data) == 0:
                c = twint.Config()
                c.Store_object = True
                c.Hide_output = True
                c.Username = username
                c.Limit = 40
                c.Resume = ".temp"

                c.Store_object_tweets_list = tweet_data

                twint.run.Search(c)

                # If it fails, sleep before retry.
                if len(tweet_data) == 0:
                    sleep(1.0)
            else:
                break

        # If still no tweets after 5 tries, stop downloading tweets
        if len(tweet_data) == 0:
            break

        # Do not filter out replies
        if include_replies:

            for tweet in tweet_data:
                tweet_text = format_text(tweet, strip_usertags, strip_hashtags,
                                         sentiment, text_format, api)

                # Do not write tweet to file if the tweet_text is empty
                if tweet_text != "":
                    # Write tweet text to file
                    w.writerow([tweet_text])
        # Filter out replies
        else:

            for tweet in tweet_data:
                if not is_reply(tweet):
                    tweet_text = format_text(tweet, strip_usertags,
                                             strip_hashtags, sentiment,
                                             text_format, api)

                    # Do not write tweet to file if the tweet_text is empty
                    if tweet_text != "":
                        # Write tweet text to file
                        w.writerow([tweet_text])

        pbar.update(40)

        oldest_tweet = datetime.utcfromtimestamp(
            tweet_data[-1].datetime / 1000.0).strftime("%Y-%m-%d %H:%M:%S")
        pbar.set_description("Oldest Tweet: " + oldest_tweet)

        # Increase the loop's iterator
        i = i + 1

    pbar.close()
    os.remove(".temp")

    # Return 0
    return 0
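
# A hedged usage sketch for download_account_tweets() above. The username and
# CSV filename are illustrative placeholders, and the helpers it relies on
# (format_text, is_reply) plus the optional `api` handle are assumed to be
# provided by the surrounding module, as in the original example.
if __name__ == "__main__":
    import csv
    with open("example_user_tweets.csv", "w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["tweets"])  # header row expected by gpt-2-simple
        download_account_tweets(username="example_user", limit=40, w=writer)
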
def download_tweets(
    username=None,
    limit=None,
    include_replies=False,
    include_links=False,
    strip_usertags=False,
    strip_hashtags=False,
):
    """Download public Tweets from a given Twitter account
    into a format suitable for training with AI text generation tools.
    :param username: Twitter @ username to gather tweets.
    :param limit: # of tweets to gather; None for all tweets.
    :param include_replies: Whether to include replies to other tweets.
    :param strip_usertags: Whether to remove user tags from the tweets.
    :param strip_hashtags: Whether to remove hashtags from the tweets.
    :param include_links: Whether to include tweets with links.
    :return tweets: List of tweets from the Twitter account
    """

    # If a limit is specified, validate that it is a multiple of 20
    if limit:
        assert limit % 20 == 0, "`limit` must be a multiple of 20."

    # If no limit is specified, estimate the total number of tweets from the profile.
    else:
        c_lookup = twint.Config()
        c_lookup.Username = username
        c_lookup.Store_object = True
        c_lookup.Hide_output = True
        if include_links is True:
            c_lookup.Links = "include"
        else:
            c_lookup.Links = "exclude"

        twint.run.Lookup(c_lookup)
        limit = twint.output.users_list[-1].tweets

    pattern = r"http\S+|pic\.\S+|\xa0|…"

    if strip_usertags:
        pattern += r"|@[a-zA-Z0-9_]+"

    if strip_hashtags:
        pattern += r"|#[a-zA-Z0-9_]+"

    # Create an empty file to store pagination id
    with open(".temp", "w", encoding="utf-8") as f:
        f.write(str(-1))

    print("Retrieving tweets for @{}...".format(username))

    with open("{}_tweets.csv".format(username), "w", encoding="utf8") as f:
        w = csv.writer(f)
        w.writerow(["tweets"])  # gpt-2-simple expects a CSV header by default

        pbar = tqdm(range(limit), desc="Oldest Tweet")
        for i in range((limit // 20) - 1):
            tweet_data = []

            # twint may fail; give it up to 5 tries to return tweets
            for _ in range(5):
                if len(tweet_data) == 0:
                    c = twint.Config()
                    c.Store_object = True
                    c.Hide_output = True
                    c.Username = username
                    c.Limit = 40
                    c.Resume = ".temp"

                    c.Store_object_tweets_list = tweet_data

                    twint.run.Search(c)

                    # If it fails, sleep before retry.
                    if len(tweet_data) == 0:
                        sleep(1.0)
                else:
                    break

            # If still no tweets after multiple tries, we're done
            if len(tweet_data) == 0:
                break

            if not include_replies:
                tweets = [
                    re.sub(pattern, "", tweet.tweet).strip()
                    for tweet in tweet_data if not is_reply(tweet)
                ]

                # On older tweets, if the cleaned tweet starts with an "@",
                # it is a de-facto reply.
                for tweet in tweets:
                    if tweet != "" and not tweet.startswith("@"):
                        w.writerow([tweet])
            else:
                tweets = [
                    re.sub(pattern, "", tweet.tweet).strip()
                    for tweet in tweet_data
                ]

                for tweet in tweets:
                    if tweet != "":
                        w.writerow([tweet])

            if i > 0:
                pbar.update(20)
            else:
                pbar.update(40)
            oldest_tweet = datetime.utcfromtimestamp(
                tweet_data[-1].datetime / 1000.0).strftime("%Y-%m-%d %H:%M:%S")
            pbar.set_description("Oldest Tweet: " + oldest_tweet)

    pbar.close()
    os.remove(".temp")
Example #6
def Search(
    query: Text,
    from_date: datetime.datetime = None,
    to_date: datetime.datetime = None,
    number_of_results: int = 100,
) -> pandas.DataFrame:
  """Search tweets.

  Args:
    query: the search query.
    from_date: search from this datetime.
    to_date: search till this datetime.
    number_of_results: number of results to return.

  Returns:
    A dataframe of tweets. For columns, reference:
      {
        'id': 1371248526085226496,
        'conversation_id': '1371248036563795969',
        'created_at': '2021-03-14 23:54:59 UTC',
        'date': '2021-03-14',
        'time': '23:54:59',
        'timezone': '+0000',
        'user_id': 1233956153656332291,
        'username': '******',
        'name': 'funy guy sbungbob',
        'place': '',
        'tweet': '@Zer0Priv And stock up on Bitcoin and GameStop stocks',
        'language': 'en',
        'mentions': [],
        'urls': [],
        'photos': [],
        'replies_count': 0,
        'retweets_count': 0,
        'likes_count': 2,
        'hashtags': [],
        'cashtags': [],
        'link': 'https://twitter.com/je4ia/status/1371248526085226496',
        'retweet': False,
        'quote_url': '',
        'video': 0,
        'thumbnail': '',
        'near': '',
        'geo': '',
        'source': '',
        'user_rt_id': '',
        'user_rt': '',
        'retweet_id': '',
        'reply_to': [{'screen_name': 'Zer0Priv',
          'name': 'Zer0',
          'id': '1256485417744031747'}],
        'retweet_date': '',
        'translate': '',
        'trans_src': '',
        'trans_dest': '',
      },
  """
  nest_asyncio.apply()

  c = twint.Config()
  c.Search = query
  if from_date:
    c.Since = from_date.strftime('%Y-%m-%d %H:%M:%S')
  if to_date:
    c.Until = to_date.strftime('%Y-%m-%d %H:%M:%S')
  c.Limit = number_of_results
  c.Pandas = True
  c.Hide_output = True
  twint.run.Search(c)
  
  return twint.storage.panda.Tweets_df
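
# A minimal usage sketch for the Search() wrapper above; the query and date range
# are illustrative only, and the module-level imports from this example
# (twint, pandas, nest_asyncio, datetime) are assumed.
if __name__ == '__main__':
  example_df = Search(
      'bitcoin',
      from_date=datetime.datetime(2021, 3, 1),
      to_date=datetime.datetime(2021, 3, 14),
      number_of_results=50,
  )
  print(example_df[['date', 'username', 'tweet']].head())
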
Example #7
def busqueda2(termino):
    clf = SentimentClassifier()  # added this <----
    #print('-------')
    #print(clf.predict('Los perros son bonitos'))
    #print('-------')

    limite = 850
    asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy())  # works around a threading issue with the event loop
    c = twint.Config()
    c.Search = termino
    c.Lang = 'es'
    c.Popular_tweets = True
    c.Limit = limite
    
    c.Store_csv = True
    c.Output = "tweets.csv"

    print("inicio Search ("+termino+")>")
    #Para que 'twint.run.Search(c)' no imprima los valores por consola
    sys.stdout = open(os.devnull, "w") 
    twint.run.Search(c)
    sys.stdout = sys.__stdout__
    print("termino Search >")
    

    # Read only the 'tweet' column from "tweets.csv"
    tweets = pd.read_csv("tweets.csv", sep=',', usecols=['tweet'], squeeze=True)




    data = pd.read_csv("tweets.csv", sep=',', usecols=['id','username', 'name', 'link', 'tweet', 'likes_count', 'date'], squeeze=True).values


    data_dicc = []
    data_dicc_neg = []
    data_dicc_pos = []
    data_dicc_neu = []


    text_ant = 'asd'
    i=0
    while i < len(data):
        dicc = {}
        if (text_ant != data[i][3]):
            text_ant = data[i][3]
            dicc['name'] = data[i][1]
            dicc['username'] = data[i][2]
            dicc['tweet'] = data[i][3]
            dicc['link'] = data[i][4]
            dicc['like'] = data[i][5]
            dicc['date'] = data[i][6]
            polaridad = clf.predict(tweets[i])
            dicc['polaridad'] = polaridad
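            # Assumption based on how it is used here: SentimentClassifier.predict
            # returns a polarity score in [0, 1]; <= 0.4 is bucketed as negative,
            # <= 0.6 as neutral, and anything higher as positive.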

            if polaridad <= 0.4:
                dicc['sentimiento'] = 'negativo'
                data_dicc_neg.append(dicc)
            elif polaridad <= 0.6:
                dicc['sentimiento'] = 'neutro'
                data_dicc_neu.append(dicc)
            else:
                dicc['sentimiento'] = 'positivo'
                data_dicc_pos.append(dicc)


            data_dicc.append(dicc)

        i = i + 1


    
    data_dicc = sorted(data_dicc, key = lambda i: i['like'], reverse=True)
    data = data_dicc
    data = data[0:50]

    data_dicc_neg = sorted(data_dicc_neg, key=lambda i: i['like'], reverse=True)
    data_neg = data_dicc_neg
    data_neg = data_neg[0:10]

    data_dicc_neu = sorted(data_dicc_neu, key=lambda i: i['like'], reverse=True)
    data_neu = data_dicc_neu
    data_neu = data_neu[0:10]

    data_dicc_pos = sorted(data_dicc_pos, key=lambda i: i['like'], reverse=True)
    data_pos = data_dicc_pos
    data_pos = data_pos[0:10]


    # List containing the text of each retrieved tweet
    listaT = tweets.values
    print(str(len(listaT)) + " tweets ")
    # delete the csv file
    os.remove('tweets.csv')
    return listaT, data , data_neg, data_neu, data_pos
Example #8
 def __init__(self):
     self.c = twint.Config()
def get_replies(conversation_id, screen_name, created_at):
    # replies, likes, retweets
    replies = twint.Config()

    logging.info("screen_name {}, Created at {}, Conversation ID {}".format(screen_name, created_at, conversation_id))
    print("screen_name {}, Created at {}, Conversation ID {}".format(screen_name, created_at, conversation_id))
    replies.Retries_count = 2
    replies.Store_object = True
    replies.Store_object_tweets_list = []
    replies.Search = "(to:{})".format(screen_name)
    replies.Limit = 1000
    replies.Hide_output = True

    max_try = 2
    try_times = 0
    time_delta = 1
    df_list = []
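    # Retry strategy: each attempt searches replies in the window
    # [created_at, created_at + time_delta days]; if it yields too few matching
    # replies, time_delta is doubled and the search is retried, up to max_try times.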
    while try_times < max_try:
        time.sleep(1)
        if created_at:
            search_end = created_at + timedelta(time_delta)
            search_end_str = search_end.strftime("%Y-%m-%d")
            created_at_str = created_at.strftime("%Y-%m-%d")
            replies.Until = search_end_str
            replies.Since = created_at_str

        twint.run.Search(replies)
        df = pd.DataFrame([vars(i) for i in replies.Store_object_tweets_list])
        replies.search_tweet_list = []
        df = df.rename(columns={"figure-conversation-id":"conversation_id","date":"created_at","figure-item-id":"id"})
        df.drop_duplicates(inplace=True, subset=['id_str'])
        if len(df) == 0:
            time_delta = 2 * time_delta
            try_times += 1
            continue
        df['username'] = df['username'].apply(lambda x:x.replace("@",""))
        df['nreplies'] = df['replies_count']
        df['nretweets'] = df['retweets_count']

        return_replies_df = []
        print(len(df))
        if len(df) > 0:
            df['id'] = df['id'].apply(lambda x: int(x))
            return_replies_df = df[df['conversation_id'].apply(lambda x:str(x)==str(conversation_id)) ]
            # return_replies_df = df

            logging.info("There are {} replies for {}, {}".format(len(return_replies_df), conversation_id, screen_name))
        df_list.append(df)
        if len(return_replies_df) < 10:
            time_delta = 2 * time_delta
            try_times += 1
        else:
            break



    if len(df_list) == 0:
        return_replies_list = []
        unrelated_replies = []
    else:
        df = pd.concat(df_list)
        df.drop_duplicates(inplace=True, subset=['id'])
        df = df.astype({"id":"int64"})
        return_replies_df = df[df['conversation_id'].apply(lambda x:str(x)==str(conversation_id))]
        return_replies_list = return_replies_df.to_dict(orient="records")
        unrelated_replies = df.to_dict(orient="records")
        print("There are {} related tweets".format(len(return_replies_list)))


    return return_replies_list, unrelated_replies
    def __init__(self, resume_file):

        self.config = twint.Config()
        self.resume = resume_file + ".txt"
Example #11
def get_user_followers(username, search):
    c = twint.Config()
    c.Username = username
    save_result(c, username + "user_followers")
    twint.run.Followers(c)
    get_user_following(username, search)
Example #12
def _get_twint_config(limit):
    c = twint.Config()
    c.Limit = limit
    c.Hide_output = True
    return c
Example #13
def busqueda_por_fecha(termino, desde, hasta, hastaFinal, solo_busqueda, limite = 0):
    clf = SentimentClassifier()
    if solo_busqueda == False:
        dAño, dMes, dDia = desde.split('/')
        hAño, hMes, hDia = hasta.split('/')

        inicio = date(int(dAño), int(dMes), int(dDia))
        final = date(int(hAño), int(hMes), int(hDia))
        delta = timedelta(days=1)

        total = 1000  # limit (a minimum must be set, otherwise the run would take far too long)

        diasTotales = final - inicio
        if diasTotales.days > 190:
            print("diasTotales(" + str(diasTotales.days) + ") > 190")
            return 0

        if diasTotales.days > 50:
            intervalo = (diasTotales.days-50)/44.33            
        else:
            intervalo = 0

        limite = (total*(intervalo+1)/diasTotales.days)*1000
        deltaIntervalo = timedelta(days=round(intervalo))
        actual = inicio
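        # Sampling heuristic as written: for ranges longer than 50 days only one
        # day out of every (round(intervalo) + 1) is scraped, and `limite` is
        # scaled up to compensate for the skipped days; 50 and 44.33 are tuning
        # constants chosen by the original author.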

        print("limite: "+str(limite)+"| deltaIntervalo: "+str(deltaIntervalo))

        while actual < final:
            busqueda_por_fecha(termino, actual, actual + delta, hastaFinal, True, limite)
            actual += deltaIntervalo + delta
        
        if actual == final:
            actual -= delta
            busqueda_por_fecha(termino, actual, actual + delta, hastaFinal, True, limite)
        
    elif solo_busqueda == True:

        asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy())
        c = twint.Config()
        c.Search = termino
        c.Lang = 'es'
        c.Popular_tweets = True
        c.Limit = limite

        c.Store_csv = True
        c.Output = "tweets_fecha.csv"
        
        c.Until = str(hasta)
        c.Since = str(desde)

        sys.stdout = open(os.devnull, "w") 
        twint.run.Search(c)
        sys.stdout = sys.__stdout__

        print(str(desde)+" > "+str(hasta))
        return 0
    
    tweetsFecha = pd.read_csv("tweets_fecha.csv", sep=',', usecols=['tweet'], squeeze=True)
    listaTFecha = tweetsFecha.values

    Fechas = pd.read_csv("tweets_fecha.csv", sep=',', usecols=['date'], squeeze=True)
    listaFechas = Fechas.values



    data = pd.read_csv("tweets_fecha.csv", sep=',',
                       usecols=['id', 'username', 'name', 'link', 'tweet', 'likes_count', 'date'], squeeze=True).values

    data_dicc = []
    data_dicc_neg = []
    data_dicc_pos = []
    data_dicc_neu = []

    text_ant = 'asd'
    i = 0
    while i < len(data):
        dicc = {}
        if (text_ant != data[i][3]):
            text_ant = data[i][3]
            dicc['name'] = data[i][1]
            dicc['username'] = data[i][2]
            dicc['tweet'] = data[i][3]
            dicc['link'] = data[i][4]
            dicc['like'] = data[i][5]
            dicc['date'] = data[i][6]
            polaridad = clf.predict(tweetsFecha[i])
            dicc['polaridad'] = polaridad

            if polaridad <= 0.4:
                dicc['sentimiento'] = 'negativo'
                data_dicc_neg.append(dicc)
            elif polaridad <= 0.6:
                dicc['sentimiento'] = 'neutro'
                data_dicc_neu.append(dicc)
            else:
                dicc['sentimiento'] = 'positivo'
                data_dicc_pos.append(dicc)

            data_dicc.append(dicc)

        i = i + 1

    data_dicc = sorted(data_dicc, key=lambda i: i['like'], reverse=True)
    data = data_dicc
    data = data[0:50]

    data_dicc_neg = sorted(data_dicc_neg, key=lambda i: i['like'], reverse=True)
    data_neg = data_dicc_neg
    data_neg = data_neg[0:10]

    data_dicc_neu = sorted(data_dicc_neu, key=lambda i: i['like'], reverse=True)
    data_neu = data_dicc_neu
    data_neu = data_neu[0:10]

    data_dicc_pos = sorted(data_dicc_pos, key=lambda i: i['like'], reverse=True)
    data_pos = data_dicc_pos
    data_pos = data_pos[0:10]



    


    os.remove('tweets_fecha.csv')
    print(str(len(listaTFecha))+" tweets ")
    return [listaTFecha, listaFechas, data , data_neg, data_neu, data_pos]
Example #14
def scraping():
    import twint

    c = twint.Config()

    c.Search = input("Enter sentence::")
    c.Limit = 100
    c.Email = True

    c.Store_csv = True
    c.Output = "none"
    twint.run.Search(c)

    import pandas as pd

    df = pd.read_csv("none/tweets.csv")

    print(df.columns)
    # question.isnull().any()
    # question.columns
    import datetime as dt
    import nltk
    start = dt.datetime.now()
    sno = nltk.stem.SnowballStemmer("english")
    i = 0
    str1 = ""
    final_string = []
    all_positive_words = []
    all_negative_words = []
    s = ""
    from nltk.corpus import stopwords
    stop = set(stopwords.words('english'))
    excluding = [
        'against', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn',
        "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
        'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn',
        "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shouldn',
        "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
        'wouldn', "wouldn't"
    ]
    stop = [words for words in stop if words not in excluding]
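    # Negation-style stopwords (not, don't, isn't, ...) are kept because they can
    # flip sentiment; the rest of NLTK's English stopword list is still removed.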

    import re

    def cleanhtml(sentence):
        # non-greedy so individual tags are stripped rather than everything
        # between the first '<' and the last '>'
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', sentence)
        return cleantext

    def cleanpunc(sentence):
        cleaned = re.sub(r'[?|!|\'|"|#]', r' ',
                         sentence)  #seee \' and combination
        cleaned = re.sub(r'[.|,|)|/|\|(]', r' ', cleaned)
        return cleaned

    print(sno.stem("tasty"))  #checking the rootword of tasty

    print(sno)

    def preprocess1(X):
        final_string = []

        for sent in X:
            filterd_sentence = []

            sent = str(sent)

            sent = cleanhtml(sent)
            #         print(sent)
            for w in sent.split():
                for cleaned_words in cleanpunc(w).split():
                    if ((cleaned_words.isalpha()) & (len(cleaned_words) > 2)):
                        if (cleaned_words.lower() not in stop):
                            s = (sno.stem(
                                cleaned_words.lower())).encode("utf8")
                            filterd_sentence.append(s)

                        else:
                            continue
                    else:
                        continue

            str1 = b" ".join(filterd_sentence)
            final_string.append(str1)

    #         i=i+1
        return final_string

    def preprocess(X):
        final_string = []
        X = [X]

        for sent in X:
            filterd_sentence = []

            sent = str(sent)

            sent = cleanhtml(sent)
            #         print(sent)
            for w in sent.split():
                for cleaned_words in cleanpunc(w).split():
                    if ((cleaned_words.isalpha()) & (len(cleaned_words) > 2)):
                        if (cleaned_words.lower() not in stop):
                            s = (sno.stem(
                                cleaned_words.lower())).encode("utf8")
                            filterd_sentence.append(s)

                        else:
                            continue
                    else:
                        continue

            str1 = b" ".join(filterd_sentence)
            final_string.append(str1)

    #         i=i+1
        return final_string

    import pickle
    from keras.preprocessing import sequence

    def find_word_index(row, word_index_dict):
        holder = []
        for word in row.split():
            if word in word_index_dict:
                holder.append(word_index_dict[word])
            else:
                holder.append(0)
        return holder

    def predict_lstm(x):
        x = preprocess1(x)
        #     print(x)
        #     with open('model_lstm.pkl', 'rb') as f:
        #         model = pickle.load(f)

        #     print(model.predict[x])
        with open('label_transform.pkl', 'rb') as f:
            encoder = pickle.load(f)

        from keras.models import load_model
        model = load_model('LSTM_1.ckpt')

        with open('word_index_dict.pkl', 'rb') as f:
            word_index_dict = pickle.load(f)

        text = []
        for sent in x:
            text.append(find_word_index(sent, word_index_dict))

        #print(len(x),len(text))
        x = sequence.pad_sequences(text, maxlen=500)

        pred_prob = model.predict(x)

        #     print(pred_prob)
        #     print(encoder.classes_)

        sentiment = []

        #     print(encoder.inverse_transform(pred_prob))

        preds = encoder.inverse_transform(pred_prob)

        for pre in preds:
            if pre == 1:
                sentiment.append("Strongly Negative")
            if pre == 2:
                sentiment.append("Weakly Negative")
            if pre == 3:
                sentiment.append("Neutral")
            if pre == 4:
                sentiment.append("Weakly Positive")
            if pre == 5:
                sentiment.append("Strongly Positive")

    #     print("output::",pred)

        return [sentiment, pred_prob]

    df['cleaned'] = df.tweet.apply(lambda x: preprocess(x))

    # df.head()
    q = predict_lstm(list(df.cleaned))
    df['sentiment'] = q[0]

    import numpy as np

    df['confidence'] = q[1].tolist()

    df.drop('cleaned', axis=1, inplace=True)
    df = df.astype(str)

    l = []
    data = df.copy()
    for i in data.index:
        if data['sentiment'][i] == 'Neutral':
            l.append(3)
        elif data['sentiment'][i] == 'Strongly Positive':
            l.append(5)
        elif data['sentiment'][i] == 'Weakly Positive':
            l.append(4)
        elif data['sentiment'][i] == 'Strongly Negative':
            l.append(1)
        elif data['sentiment'][i] == 'Weakly Negative':
            l.append(2)
    l = pd.DataFrame(l)
    l.columns = ['label']
    data = pd.concat([data, l], axis=1)

    data = data[['username', 'tweet', 'sentiment', 'label']]

    data.to_csv("dezzex1.csv")
Example #15
def get_top_mentions_hashtags_geo(lat_long, radius, limit):
    os.chdir("Python_Scripts")
    currentDir = os.getcwd() + "/result/twitter/"
    try:
        os.mkdir(currentDir)
    except:
        pass
    os.chdir(currentDir)
    twint.output.tweets_list = []
    c = twint.Config()
    c.Hide_output = True  # hides command line verbose output
    c.Limit = 500  # maximum number of tweets to pull
    c.Geo = f"{lat_long},{radius}"
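    # twint expects Geo as a "latitude,longitude,radius" string (radius given as
    # e.g. "1km" or "1mi"), so lat_long is assumed to already be a "lat,long" pair.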
    c.Store_object = True
    
    currentDir = os.getcwd()+"/Python_Scripts/result/twitter/"
    # sys.path.append("/app/Python_Scripts/Python_Scripts/result/twitter/")
    # os.chdir(currentDir)
    c.Store_csv = True
    c.Output = f"{lat_long}-tweets.csv"
    twint.run.Search(c)
    tweets = twint.output.tweets_list
    mentions_dict = {}
    hashtags_dict = {}
    for tweet in tweets:
        for mention in tweet.mentions:
            if mention in mentions_dict:
                mentions_dict[mention] += 1
            else:
                mentions_dict[mention] = 1
        for hashtag in tweet.hashtags:
            if hashtag in hashtags_dict:
                hashtags_dict[hashtag] += 1
            else:
                hashtags_dict[hashtag] = 1
    top_mentions = heapq.nlargest(10, mentions_dict, key=mentions_dict.get)  # gets highest mentions
    top_hashtags = heapq.nlargest(10, hashtags_dict, key=hashtags_dict.get)  # gets highest hashtags

    # makes dictionary of just highest ones
    mentions_ranked = {}
    hashtags_ranked = {}
    for mention in top_mentions:
        mentions_ranked[mention] = mentions_dict[mention]
    for hashtag in top_hashtags:
        hashtags_ranked[hashtag] = hashtags_dict[hashtag]
    plt.barh(range(len(mentions_ranked)), list(mentions_ranked.values()), align='center', color='maroon')
    plt.yticks(range(len(mentions_ranked)), list(mentions_ranked.keys()))
    plt.gca().invert_yaxis()  # just to have the highest bar at the top
    plt.title("Top 10 Trending Mentions from the Geo-location: " + lat_long)
    
    plt.savefig(os.getcwd()+"/Python_Scripts/result/twitter/"+ lat_long + '-mentions.png', bbox_inches='tight')  # saves the visualization as png
    # plt.savefig(seed_hashtag + '.pdf', bbox_inches='tight')
    plt.close()  # start a fresh figure so the hashtags chart is not drawn on top of the mentions chart
    plt.barh(range(len(hashtags_ranked)), list(hashtags_ranked.values()), align='center', color='maroon')
    plt.yticks(range(len(hashtags_ranked)), list(hashtags_ranked.keys()))
    plt.gca().invert_yaxis()  # just to have the highest bar at the top
    plt.title("Top 10 Trending Hashtags from the Geo-location:" + lat_long)
    # os.chdir(currentDir)
    plt.savefig("/app/Python_Scripts/result/twitter/"+  lat_long + '-hashtags.png', bbox_inches='tight')  # saves the visualization as png
    # plt.savefig(seed_hashtag + '.pdf', bbox_inches='tight')
    #print("List of Top 10 mentions " + lat_long + " :")
    #print(top_mentions)  # displays the top 10 hashtags as a list.
    #print("List of Top 10 hashtags " + lat_long + " :")
    #print(top_hashtags)  # displays the top 15 hashtags as a list.
    plt.close()
    exit()  
Example #16
def scrape(username, limit=None):

    if not os.path.exists(imagefolder):
        os.makedirs(imagefolder)
    if not os.path.exists(datafolder):
        os.makedirs(datafolder)

    # Configure
    c = twint.Config()
    c.Username = username
    c.Proxy_host = getProxy()
    c.Proxy_port = 6060
    c.Proxy_type = "http"
    c.Proxy_Username = proxyuser
    c.Proxy_Password = proxypass
    c.Media = True
    c.Pandas = True
    c.Hide_output = True
    c.Limit = limit

    # Run
    print("Scraping from " + username + "'s twitter...")

    twint.run.Search(c)

    print("Scraped!")

    df = twint.storage.panda.Tweets_df

    username = df["username"][0]
    # userid = df["user_id"][0]

    tweets = []

    for index, row in df.iterrows():
        # photos = []
        # for photo in row["photos"]:

        #     url = photo.split("/")[-1]

        #     if index <= 5:
        #         p = Process(target=downloadAndSavePhoto, args=(url,))
        #         p.start()

        #     photos.append(baseurl+imagefolder+url)

        # tweets.append({"id": str(row["id"]), "created_at": str(
        #     row["created_at"]), "tweet": row["tweet"], "photos": photos})

        if row["video"] == 0:
            embed = requests.get(
                f"https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{row['id']}"
            )
            embed = embed.json()["html"]
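            # publish.twitter.com/oembed returns ready-to-embed HTML for a tweet
            # URL; it is only requested here for tweets without video.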
        else:
            embed = ""

        tweets.append({
            "id": str(row["id"]),
            "created_at": str(row["created_at"]),
            "tweet": row["tweet"],
            "photos": row["photos"],
            "video": embed
        })

    print("Converted dataframe!")
    return {"username": str(username), "tweets": tweets}
Example #17
def get_user_following(username,search):
    c = twint.Config()
    c.Username = username
    save_result(c,username + "user_following")
    twint.run.Following(c)
    get_user_tweets(username,search,True)
def download_tweets(username=None,
                    limit=None,
                    include_replies=False,
                    strip_usertags=False,
                    strip_hashtags=False):
    """Download public Tweets from a given Twitter account
    into a format suitable for training with AI text generation tools.
    :param username: Twitter @ username to gather tweets.
    :param limit: # of tweets to gather; None for all tweets.
    :param include_replies: Whether to include replies to other tweets.
    :param strip_usertags: Whether to remove user tags from the tweets.
    :param strip_hashtags: Whether to remove hashtags from the tweets.
    """

    assert username, "You must specify a username to download tweets from."
    if limit:
        assert limit % 20 == 0, "`limit` must be a multiple of 20."

    # If no limit is specified, estimate the total number of tweets from the profile.
    else:
        c_lookup = twint.Config()
        c_lookup.Username = username
        c_lookup.Store_object = True
        c_lookup.Hide_output = True

        twint.run.Lookup(c_lookup)
        limit = twint.output.users_list[0].tweets

    pattern = r'http\S+|pic\.\S+|\xa0|…'

    if strip_usertags:
        pattern += r'|@[a-zA-Z0-9_]+'

    if strip_hashtags:
        pattern += r'|#[a-zA-Z0-9_]+'

    update_resume_file(-1)

    print("Retrieving tweets for @{}...".format(username))

    with open('{}_tweets.csv'.format(username), 'w', encoding='utf8') as f:
        w = csv.writer(f)
        w.writerow(['tweets'])  # gpt-2-simple expects a CSV header by default

        pbar = tqdm(range(limit), desc="Oldest Tweet")
        for i in range((limit // 20) - 1):
            tweet_data = []

            # twint may fail; give it up to 5 tries to return tweets
            for _ in range(5):
                if len(tweet_data) == 0:
                    c = twint.Config()
                    c.Store_object = True
                    c.Hide_output = True
                    c.Username = username
                    c.Limit = 40
                    c.Resume = '.temp'

                    c.Store_object_tweets_list = tweet_data

                    twint.run.Search(c)

                    # If it fails, sleep before retry.
                    if len(tweet_data) == 0:
                        sleep(1.0)
                else:
                    break

            # If still no tweets after multiple tries, we're done
            if len(tweet_data) == 0:
                break

            if i > 0:
                tweet_data = tweet_data[20:]

            if not include_replies:
                tweets = [
                    re.sub(pattern, '', tweet.tweet).strip()
                    for tweet in tweet_data if not is_reply(tweet)
                ]
            else:
                tweets = [
                    re.sub(pattern, '', tweet.tweet).strip()
                    for tweet in tweet_data
                ]

            for tweet in tweets:
                if tweet != '':
                    w.writerow([tweet])

            if i > 0:
                pbar.update(20)
            else:
                pbar.update(40)
            oldest_tweet = (datetime.utcfromtimestamp(
                tweet_data[-1].datetime /
                1000.0).strftime('%Y-%m-%d %H:%M:%S'))
            pbar.set_description("Oldest Tweet: " + oldest_tweet)

    pbar.close()
    os.remove('.temp')
Example #19
 def __init__(self, args):
     self.config = twint.Config()
     self.name_of_hashtag = args.hashtag
     self.config.Limit = 100000000000000000
     self.basic_setup_status = self.basicSetup()
Example #20
import time
from datetime import datetime

import pandas as pd
import twint

c = twint.Config()
a = twint.Config()
b = twint.Config()

c.Search = "Amazon forest"
c.Lang = 'en'
c.Limit = 10000
c.Since = '2019-01-01'
c.Store_csv = True
c.Output = "Amazon forest"
c.Hide_output = True

twint.run.Search(c)

a.Search = "Amazon forest fire"
a.Lang = 'en'
a.Limit = 10000
a.Store_csv = True
a.Since = '2019-01-01'
a.Output = " Amazon forest fire"
a.Hide_output = True
twint.run.Search(a)

b.Search = "Amazon forest", ' fire', "climate change"
b.Lang = 'en'
b.Limit = 10000
b.Since = '2019-01-01'
b.Store_csv = True
b.Output = " climate change"
today = datetime.today().strftime('%Y-%m-%d')
date_list = pd.date_range(start="2019-08-02",end='2019-08-11')
date_list = date_list.to_series().dt.date



#%% filter by country
#how long going to sleep after scraping one country done

#how long going to sleep after one day has been scraped
sleep_day = 1
number_tweets = 20000

for date in date_list:
    time1 = time.time()
    config = twint.Config() 
    
    date1 = date
    date2 = date - pd.Timedelta(days = 1)
    
    language = "en"
    
    config.Search = f"until:{date1} since:{date2} lang:{language}"
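    # The date and language filters are passed as search operators
    # (until:/since:/lang:) inside the query string rather than via
    # config.Since / config.Until / config.Lang.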
    config.Store_object = True 
    
    
    
    #c.Store_csv = True
    config.Limit = number_tweets #number needed to get around 1mio posts per country
    config.Store_json = True
    config.Output = f'En_NoFilter_{date2}_{language}.json'
import twint
from datetime import datetime
import time
import os

data = datetime.now()
timestr = time.strftime("%Y%m%d")

print('-' * 70)
print('-Twitter data scraping script-\n'
      'Set the parameters for your search.\n'
      'The output is a search report and a CSV file with the data.\n')
print('-' * 70)

parametro = input('Search by term (1) or by user (2)? ')
c = twint.Config()  # configure twint's search parameters
if parametro == '1':
    busca = input('Enter the search term: ')
    c.Search = "'" + busca + "'"
    c.Username = None
    nome = busca + '_' + timestr
    c.Output = os.path.join('DATA', nome)  # output folder
    if not os.path.exists(c.Output):  # create the DATA folder if it does not exist
        os.makedirs(c.Output)

elif parametro == '2':
    busca = input('Enter the username: ')
    c.Search = None
    c.Username = busca
    nome = 'user_' + c.Username + '_' + timestr
    c.Output = os.path.join('DATA', nome)
Example #23
 def get_all_retweets(self, username):
     c = twint.Config()
     c.Username = username
     c.Retweets = True
     twint.run.Profile(c)
    def get_votes_and_profile_image(self, tweet, full_thread_text=None, title=None, doi=None, pubmed_id=None, pmcid=None, return_votes=True):
        """
        Get profile image of tweeter and compute votes of tweet.
        """
        # Inspired by https://github.com/karpathy/arxiv-sanity-preserver
        def tprepro(tweet_text):
            # take tweet, return set of words
            t = tweet_text.lower()
            t = re.sub(r'[^\w\s@]','',t) # remove punctuation
            ws = set([w for w in t.split() if not (w.startswith('#') or w.startswith('@'))])
            return ws

        # Lookup the profile of the user
        users_list = []
        c = twint.Config()
        c.Username = tweet.username
        c.Store_object = True
        c.Store_object_users_list = users_list
        twint.run.Lookup(c)

        # Get number of followers and profile image url
        try:
            num_followers = users_list[0].followers
            profile_image_url = users_list[0].avatar
            bio = users_list[0].bio
        except IndexError:
            num_followers = 0
            profile_image_url = ""
            bio = ""

        if return_votes == False:
            return None, profile_image_url

        # Give low weight to retweets, tweets without comments and tweets with short length
        thread_words = set()
        if full_thread_text:
            for part in full_thread_text:
                thread_words = thread_words | tprepro(part)
        else:
            thread_words = thread_words | tprepro(tweet.tweet)

        query_words = set()
        for identifier in [title, doi, pubmed_id, pmcid]:
            if identifier is not None:
                query_words = query_words | tprepro(identifier)

        for url in tweet.urls:
            query_words = query_words | tprepro(url)

        comments = thread_words - query_words
        isok = int(not(tweet.retweet or len(tweet.tweet) < 40) and len(comments) >= 5)
        tweet_sort_bonus = 10000 if isok else 0

        research_bonus = 0
        # If the bio contains keywords such as researcher/professor/phd/postdoc/scientist, give additional points
        if re.search(r'researcher|professor|phd|postdoc|scientist', bio, re.IGNORECASE):
            research_bonus += 500

        # Add up all contributing factors
        votes = int(tweet.likes_count) + int(tweet.retweets_count) + tweet_sort_bonus + num_followers + research_bonus

        return votes, profile_image_url
def get_retweets(tweet_id=1169681044573962240,option="twint",interval=None,word_limit=100,word_limit_low=7):
    
    search_objects_record=[]
    
    print("analysing tweet "+str(tweet_id))
    #split the tweet into sentences. Then search for them, trying to find the original tweet.   
    start_status=api.get_status(tweet_id,tweet_mode="extended")
    start_time=start_status.created_at
    author=start_status.user.screen_name
    print("author "+str(author))
    
    
    
    
    text=url_remove(start_status.full_text)
    #split into sentences
    sentences=text.split(".")
    if len(sentences[-1])==0:
        sentences=sentences[:-1]
    copies_found=[]
    for sentence in sentences:
        if len(sentence)==0:
            continue
        if sentence[0]==' ':
            sentence=sentence[1:]
        
        
        

        
        
        #print(length_w)
        
    #now build a search - this won't work for twint with longer queries, won't work for tweepy for >7 days old posts
        if option=="twint":
            sent_matches=[]
            S=sentence.split(" ")
            length_w=len(S)
            if length_w > word_limit:
                query=" ".join(S[2:10])
            #    query="\""+query+"\""
            else:
                query=sentence
                
            if length_w<word_limit_low:
                continue
            #    query="\""+query+"\""           
            query=query+" filter:nativeretweets"
            #query='Netanyahu West Bank Palestinian'
            #query=query+" (@"+author+")"
            bs=twint.Config()
            bs.Search=query
            print("searching "+bs.Search)
            bs.Store_object=True
            bs.Filter_retweets=False
            bs.Hide_output=True
            bs.Store_object_tweets_list = sent_matches
            #bs.Output=False
            twint.run.Search(bs)
            
            
            search_objects_record.append(bs)
            #add filter:nativeretweets for retweets
            
        if option=="tweepy":
            copies_found = [status for status in tweepy.Cursor(api.search, q=sentence, tweet_mode='extended').items(max_tweets)]

        
        copies_found.append(sent_matches)
        
    for copy in copies_found:
        print("Found "+str(len(copy))+" tweets")    
    return(copies_found,search_objects_record)
Example #26
def twintScraper(from_date=None, end_date=None):

    # Configure
    c = twint.Config()
    c.Username = "******"
    # c.Search = "coronavirus"
    c.Limit = 10

    # c.Tweet_id = "1257793742540386304"
    c.Show_hashtags = True
    c.Get_replies = True
    c.Verified = True
    c.Stats = True
    c.Count = True
    c.Lang = "en"
    c.Hide_output = True
    # c.Resume = "1223026504482918405"
    # print(get_retweeters_list("1258837711806496770"))
    # twint.run.Profile(c)
    c.Store_object = True
    # c.Since = "2020-01-30 00:00:00"
    # c.Until ="2020-02-01 00:00:00"
    c.Since = from_date
    c.Until = end_date
    # c.Until =str(datetime.datetime.now())[:19]
    # print(str(datetime.datetime.now())[:19])
    # exit()
    twint.run.Search(c)

    tweets_as_objects = twint.output.tweets_list
    print("Real tweets amount: ", len(tweets_as_objects))
    dict_op = {
        "CONTENT": [],
        "TWEET_ID": [],
        "USER_NAME": [],
        "POST_DATE": [],
        "POST_TIME": [],
        "LINK": [],
        "URL_INCLUDED": [],
        "RETWEETS_COUNT": [],
        "RETWEETS_PEOPLE": [],
        "LIKES_AMOUNT": [],
        "REPLIIES_AMOUNT": [],
        "REPLAY_PEOPLE": [],
        "REPLAY_TIME": [],
        "REPLAY_CONTENT": [],
        "MARK": []
    }
    count = 0
    for tweet in tweets_as_objects:
        id = tweet.id
        name = tweet.username
        # print(name,"HHHHHHHHHHHHHHHHHHHHHHHH")
        likes_amount = tweet.likes_count
        retweets_count = tweet.retweets_count
        replies_count = tweet.replies_count
        replies_people = []

        replies_time = []
        replies_content = []
        replies_people, replies_time, replies_content = getReplyer(name, id)
        print(count, " CONTENT: ", tweet.tweet, " TWEET_ID: ", str(id),
              " USER_NAME: ", str(name), " POST_DATE: ", tweet.datestamp,
              " POST_TIME: ", tweet.timestamp, " LINK: ", tweet.link,
              " URL_INCLUDED: ", tweet.urls, " RETWEETS_COUNT: ",
              len(get_retweeters_list(id)), " RETWEETS_PEOPLE: ",
              get_retweeters_list(id),
              " LIKES_AMOUNT: ", likes_amount, " REPLIIES_AMOUNT: ",
              len(replies_people), " REPLAY_PEOPLE: ", replies_people,
              " REPLAY_TIME: ", replies_time, " REPLAY_CONTENT: ",
              replies_content)
        dict_op["CONTENT"].append(tweet.tweet)
        dict_op["TWEET_ID"].append(str(id))
        dict_op["USER_NAME"].append(str(name))
        dict_op["POST_DATE"].append(tweet.datestamp)
        dict_op["POST_TIME"].append(tweet.timestamp)
        dict_op["LINK"].append(tweet.link)
        dict_op["URL_INCLUDED"].append(tweet.urls)
        dict_op["RETWEETS_COUNT"].append(len(get_retweeters_list(id)))
        dict_op["RETWEETS_PEOPLE"].append(get_retweeters_list(id))
        dict_op["LIKES_AMOUNT"].append(likes_amount)
        dict_op["REPLIIES_AMOUNT"].append(len(replies_people))
        dict_op["REPLAY_PEOPLE"].append(replies_people)
        dict_op["REPLAY_TIME"].append(replies_time)
        dict_op["REPLAY_CONTENT"].append(replies_content)
        dict_op["MARK"].append("##############################")
        count += 1
        if (count % 200 == 0
                and count != 0) or count == len(tweets_as_objects):
            lastsavedtweetid = dict_op["TWEET_ID"][len(dict_op["TWEET_ID"]) -
                                                   1]
            print(f"SAVE_MARK {count}: lasts aved tweetid = ",
                  lastsavedtweetid)
            df = pd.DataFrame(data=dict_op)
            df.to_json(f"{from_date} {count} COVID-19.json", orient='records')
            dict_op = {
                "CONTENT": [],
                "TWEET_ID": [],
                "USER_NAME": [],
                "POST_DATE": [],
                "POST_TIME": [],
                "LINK": [],
                "URL_INCLUDED": [],
                "RETWEETS_COUNT": [],
                "RETWEETS_PEOPLE": [],
                "LIKES_AMOUNT": [],
                "REPLIIES_AMOUNT": [],
                "REPLAY_PEOPLE": [],
                "REPLAY_TIME": [],
                "REPLAY_CONTENT": [],
                "MARK": []
            }
        if count % 1000 == 0 and count != 0:
            time.sleep(60.0)
Example #27
import twint
import sys
import os

c = twint.Config()

c.Username = sys.argv[1]
c.Limit = 20
c.Output = "tweets.txt"

if os.path.exists("tweets.txt"):
    os.remove("tweets.txt")

twint.run.Search(c)
Example #28
def reply_influence(user, filename=os.path.join(args.file_path, args.reply_filename)):
    filename = filename.format(user)

    original_post_conf = twint.Config()
    original_post_conf.Backoff_exponent = 2
    original_post_conf.Retries_count = 40
    original_post_conf.Username = user
    original_post_conf.Search = args.keyword
    original_post_conf.Lang = args.lang
    original_post_conf.Pandas = True
    # original_post_conf.Proxy_host = 'tor'

    reply_conf = twint.Config()
    reply_conf.To = user
    reply_conf.Filter_retweets = True
    reply_conf.Backoff_exponent = 2
    reply_conf.Retries_count = 40
    reply_conf.Hide_output = True
    reply_conf.Lang = args.lang
    reply_conf.Pandas = True
    # reply_conf.Resume = 'temp.txt'
    # reply_conf.Proxy_host = 'tor'

    str_start_date, id_list = continue_date(user, filename)
    for begindate, enddate in daterange(str_start_date, args.until, args.date_format):
        print('----------------------------------------------')
        print(f'Advanced search : "{args.keyword}" on {begindate}.')
        start = time.time()

        original_post_conf.Search = args.keyword
        original_post_conf.Since = begindate
        original_post_conf.Until = enddate

        while True:
            try:
                twint.run.Search(original_post_conf)
                break
            except Exception as e:
                print(e)
        
        print(f'Search time: {time.time() - start} s.')

        pd_tweets = twint.storage.panda.Tweets_df
        if len(pd_tweets) != 0:
            id_list.extend(pd_tweets['id'].tolist())

        print('id_list: ', len(id_list))
        print(id_list)

        reply_list = []
        reply_count_list = []
        unique_user_count = []
        user_weight_list = []
        influence_score_list = []
        if len(id_list) != 0:
            print('----------------------------------------------')
            print(f'Searching reply posts to {user}.')
            start = time.time()

            reply_conf.Since = begindate
            reply_conf.Until = enddate

            while True:
                try:
                    twint.run.Search(reply_conf)
                    break
                except Exception as e:
                    print(e)
                    time.sleep(5)
            print(f'Search time: {time.time() - start} s.')

            pd_all_reply_tweets = twint.storage.panda.Tweets_df
            print(f'get total reply post: {len(pd_all_reply_tweets)}')
            
            if len(pd_all_reply_tweets) != 0:
                # id: current post id
                # conversation_id: original post id, i.e. the post id that is replied

                # pd_reply_tweets = pd_all_reply_tweets[
                #     pd_all_reply_tweets['conversation_id'].isin(id_list)
                # ]
                # reply_list = pd_reply_tweets.groupby(
                #     'conversation_id').size().to_list()

                pd_reply_tweets_list = [
                    pd_all_reply_tweets[
                        pd_all_reply_tweets['conversation_id'] == id
                    ]
                    for id in id_list
                ]

                pd_unique_user_list = [
                    pd_reply_tweets.groupby('user_id').size()
                    for pd_reply_tweets in pd_reply_tweets_list
                ]

                reply_count_list = [
                    len(pd_reply_tweets)
                    for pd_reply_tweets in pd_reply_tweets_list
                ]

                unique_user_count = [
                    len(pd_unique_user)
                    for pd_unique_user in pd_unique_user_list
                ]

                user_weight_list = [
                    0 if len(pd_unique_user) == 0 else
                    (0.9 ** (pd_unique_user - 1)).sum() / len(pd_unique_user)
                    for pd_unique_user in pd_unique_user_list
                ]

                influence_score_list = [
                    0 if reply_count == 0 else
                    user_weight * math.log10(reply_count + 1)
                    for user_weight, reply_count in zip(user_weight_list, reply_count_list)
                ]
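                # Influence heuristic used above: each replying user contributes
                # 0.9 ** (their reply count - 1), discounting repeat replies from
                # the same user; the mean of those weights is then multiplied by
                # log10(reply_count + 1) to reward posts answered by many distinct
                # users.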

        data = {
            'date': [begindate],
            'id': [id_list],
            'reply_count_list': [reply_count_list],
            'unique_user_count': [unique_user_count],
            'user_weight_list': [user_weight_list],
            'influence_score_list': [influence_score_list],
            user: [sum(reply_count_list)],
        }
        append_data = pd.DataFrame.from_dict(data)
        append_data.set_index('date', inplace=True)
        print('----------------------------------------------')
        print(append_data)

        append_data.to_csv(filename, mode='a', header=not os.path.exists(filename))
Example #29
def process_usernames(new_usernames):

    for username in new_usernames:
        try:
            user_name_df = pd.DataFrame()
            user_id_list = []
            user_handle_list = []
            user_name_list = []
            user_bio_list = []
            user_profile_image_list = []

            c = twint.Config()
            c.Username = username
            c.Store_object = True
            c.User_full = False
            c.Pandas =True
            twint.run.Lookup(c)
            user_df = twint.storage.panda.User_df.drop_duplicates(subset=['id'])
            user_id = list(user_df['id'])[0]
            user_name = list(user_df['name'])[0]
            user_bio = list(user_df['bio'])[0]
            user_profile_image = list(user_df['avatar'])[0]
            user_id_list.append(user_id)
            user_handle_list.append(username)
            user_name_list.append(user_name)
            user_bio_list.append(user_bio)
            user_profile_image_list.append(user_profile_image)

            
            user_name_df['Twitter_Handle'] = user_handle_list   
            user_name_df['Twitter_ID'] = user_id_list  
            user_name_df['Twitter_Name'] = user_name_list  
            user_name_df['Twitter_Bio'] = user_bio_list  
            user_name_df['Twitter_Profile_Image'] = user_profile_image_list
            print(user_name_df)
            save_to_mongodb(user_name_df)
            sleep(60)

        except:
            user_name_df = pd.DataFrame()
            user_id_list = []
            user_handle_list = []
            user_name_list = []
            user_bio_list = []
            user_profile_image_list = []

            print(username)
            user_id_list.append('NA')
            user_handle_list.append(username)
            user_name_list.append('NA')
            user_bio_list.append('NA')
            user_profile_image_list.append('NA')

            user_name_df['Twitter_Handle'] = user_handle_list   
            user_name_df['Twitter_ID'] = user_id_list  
            user_name_df['Twitter_Name'] = user_name_list  
            user_name_df['Twitter_Bio'] = user_bio_list  
            user_name_df['Twitter_Profile_Image'] = user_profile_image_list
            print(user_name_df)
            save_to_mongodb(user_name_df)
            sleep(200)
def spacysmscraper(text, number):
    # print("1")
    asyncio.set_event_loop(None)  # Clear the main loop.
    loop = asyncio.new_event_loop()  # Create a new loop.
    nest_asyncio.apply(loop)
    # print("2")
    # Part 1: for Reddit threads
    reddit = praw.Reddit(client_id='7hU5ZrX236KkyQ',
                         client_secret='c6pSBGl5Z2O1nwc-j-iuFhwGwfs',
                         redirect_uri='http://*****:*****@ \n\"\'"

    # Function for removing unknown characters
    remove_unknown_chars = lambda x: ''.join(char for char in x if char in symbols_to_keep)
    # Function for removing all Twitter user tags (@ongunuzaymacar, etc.)
    remove_user_tags = lambda x: re.sub(r'@\w+', '', x)
    # Function for removing all Twitter hashtags (#freetheworld, ect.)
    remove_hash_tags = lambda x: re.sub(r'#\w+', '', x)
    # Function for removing all URLs (www.google.com, etc.)
    remove_urls = lambda x: re.sub(r'(https://|www.)[A-Za-z0-9_.]+', '', x)

    def clean_tweets(twoot):
        # Convert to lowercase and remove spaces from beginning
        twoot = str(twoot).lstrip()
        # Remove Twitter-related data
        twoot = remove_user_tags(twoot)
        twoot = remove_urls(twoot)
        twoot = remove_hash_tags(twoot)
        # Remove unwanted characters
        twoot = remove_unknown_chars(twoot)
        # Remove spaces from end and condense multiple spaces into one
        twoot = twoot.rstrip()
        twoot = re.sub(' +', ' ', twoot)
        return twoot

    result["Text"] = result["Text"].apply(clean_tweets)
    f = lambda x: " ".join(x["Text"].split())
    result["Text"] = result.apply(f, axis=1)

    def ner(x):
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(x)
        textually = []
        tags = []
        for ent in doc.ents:
            textually.append(ent.text)
            tags.append(ent.label_)
        spacy_dictionary = dict(zip(textually, tags))
        good_terms = []
        for key in spacy_dictionary:
            if spacy_dictionary[key] == "ORG":
                good_terms.append(key)
            if spacy_dictionary[key] == "GPE":
                good_terms.append(key)
            if spacy_dictionary[key] == "LOC":
                good_terms.append(key)
            if spacy_dictionary[key] == "PRODUCT":
                good_terms.append(key)
            if spacy_dictionary[key] == "DATE":
                good_terms.append(key)
        # Keep the collected entities only if at least one PRODUCT or ORG entity was found
        if ("PRODUCT" not in spacy_dictionary.values()) and ("ORG" not in spacy_dictionary.values()):
            good_terms.clear()
        return good_terms

    result["NER Model"] = result["Text"].apply(ner)
    return result