def main():
    # csv = pd.read_csv('bernie_tweets.csv')
    # csv.replace(r'"', '', inplace=True, regex=True)
    # print(csv)
    tweets = Tweets(pd.read_csv('bernie_tweets2.csv', sep='^')).clean()

    tweets.to_csv('cleaned_bernie_tweets.csv', index=False)
Example #2
def query_tweets(request):
    """
    Returns tweet query
    """
    request_timeframe = Timeframe(start = request.REQUEST.get("start", None),
                                  end = request.REQUEST.get("end", None),
                                  interval = request.REQUEST.get("interval", "hour"))

    query_count = int(request.REQUEST.get("embedCount", TWEET_QUERY_COUNT))
    export = request.REQUEST.get("export", None)
    query = request.REQUEST.get("query", "")
    tweets = Tweets(query=query, query_count=query_count, start=request_timeframe.start, end=request_timeframe.end, export=export)
    response_data = {}
    if export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response)
        writer.writerow(['count','time','id','user_screen_name','user_id','status','retweet_count','favorite_count','is_retweet','in_reply_to_tweet_id','in_reply_to_screen_name'])
        count = 0
        for t in tweets.get_data():
            count = count + 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':')+1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':')+1:]
            writer.writerow([count, t['postedTime'], status_id, t['actor']['preferredUsername'], user_id, body, t['retweetCount'], t['favoritesCount'], 'X', 'X', 'X'])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
    return HttpResponse(json.dumps(response_data), content_type="application/json")
Example #3
class ChiFeatureSelector:
    def __init__(self, class1, class2):
        # store the sets of tweets making up each bit of the training set
        self.class1 = Tweets(class1)
        self.class2 = Tweets(class2)

    def getScores(self):
        #chi-squared scores
        scores = {}

        # loop over the words in the extraction corpus \todo determine how to include things like retweet count
        for term in self.class1.getTerms():
            # build the chi-squared table
            n11 = float(self.class1.getTermCount(term))
            n10 = float(self.class2.getTermCount(term))
            n01 = float(self.class1.getDocCount() - n11)
            n00 = float(self.class2.getDocCount() - n10)

            # perform the chi-squared calculation and store
            # the score in the dictionary
            total = n11 + n10 + n01 + n00
            top = ((n11 * n00) - (n10 * n01)) ** 2
            bottom = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
            chi = (total * top) / bottom
            scores[term] = chi
            
        # note: example of printing the scores
        #for (v, k) in scores:
        #    print str(k) + " : " + str(v)
        return scores
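For reference, the score computed in getScores above is the standard chi-squared statistic for the 2x2 term/class contingency table built from n11, n10, n01 and n00:

\[ \chi^2 = \frac{(n_{11}+n_{10}+n_{01}+n_{00})\,(n_{11}n_{00}-n_{10}n_{01})^2}{(n_{11}+n_{01})(n_{11}+n_{10})(n_{10}+n_{00})(n_{01}+n_{00})} \]

Terms with higher scores are more strongly associated with one of the two tweet classes and are therefore better discriminating features.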
Example #4
def posting_most_similar_words_on_Twitter():

    # Create the instances
    tw = Tweets()
    maam = MorphologicalAnalysisAndModeling()

    # Enter the word used both as the search term and as the reference for similarity
    search_word = input("Enter a search word >>> ")

    # Search Twitter for search_word, fetch 200 tweets, and collect them in tweet_list.
    tweet_list = tw.get_tweet_by_search(search_word=search_word, count=200)

    # Run MeCab morphological analysis on the 200 tweets in tweet_list and store the output in results.
    results = maam.mecab(tweet_list)

    # Build a Word2vec model from the morphological analysis results
    model = maam.word2vec(results)

    # Collect the words similar to search_word and format them into a single string.
    words = ''
    for i in model.wv.most_similar(positive=[search_word]):
        words += str(i)[1:-20]
    words = words.replace("'", "")

    text = 'Words similar to {}: {}'.format(search_word, words)

    # Post the string above to Twitter.
    tw.posting_on_twitter(text=text)
Example #5
 def initialize(self):
     self.emit('hello', {'msg': 'alright!'})
     session = self.environ.get('beaker.session')
     if not session.has_key('access_token'):
         self.emit('failed_stream')
         return
     access_token = session['access_token']
     self.tweets = Tweets(consumer_token, access_token)
Example #6
def main():
    tweets = Tweets()
    tweets.authentication()
    #tweets.getAllTweetsBySearch("Microsoft", 200)
    stockdata = StockData()
    #print(stockdata.getHistoricalDataByID('MSFT', "2018-01-01", "2018-02-02"))
    model = Model(tweets)
    model.getInputData('MSFT')
Example #7
def get_text(file_path):
    consumer_key = ""
    consumer_secret = ""
    access_key = ""
    access_secret = ""

    twitter = Tweets(consumer_key, consumer_secret, access_key, access_secret)

    hashtags = set()
    handles = []

    with open(file_path) as f:
        header = True
        for line in f:
            if header:
                header = False
                continue
            data = line.split(",")
            handles.append(data[:2])

    count = 1

    with open("../resources/tweets.csv", 'a') as f:
        for handle in handles:
            f.write(handle[0])
            f.write(",")
            f.write(handle[1])
            f.write(",")
            f.write(str(count))
            f.write("\n")

            tweets = twitter.get_all_tweets(handle[0])
            f.write(str(len(tweets)))
            f.write("\n")

            for tweet in tweets:
                if hasattr(tweet, 'retweeted_status'):
                    text = tweet.retweeted_status.full_text
                else:
                    text = tweet.full_text

                text = re.sub(r"http\S+", "", text.replace('\n', ' '))
                text = text.replace(',', '')
                text = text.strip()
                if text:
                    f.write(text)
                else:
                    f.write("empty")
                f.write("\n")

            print(f"Got tweets for {count}")
            count += 1
Example #8
def query_tweets(request):
    """
    Returns tweet query
    """
    request_timeframe = Timeframe(start=request.GET.get("start", None),
                                  end=request.GET.get("end", None),
                                  interval=request.GET.get("interval", "hour"))

    query_count = int(request.GET.get("embedCount", TWEET_QUERY_COUNT))
    export = request.GET.get("export", None)
    query = request.GET.get("query", "")

    try:
        tweets = Tweets(query=query,
                        query_count=query_count,
                        start=request_timeframe.start,
                        end=request_timeframe.end,
                        export=export)
    except GNIPQueryError as e:
        return handleQueryError(e)

    response_data = {}
    if export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response,
                            delimiter=',',
                            quotechar="'",
                            quoting=csv.QUOTE_ALL)
        writer.writerow([
            'count', 'time', 'id', 'user_screen_name', 'user_id', 'status',
            'retweet_count', 'favorite_count', 'is_retweet',
            'in_reply_to_tweet_id', 'in_reply_to_screen_name'
        ])
        count = 0
        for t in tweets.get_data():
            count = count + 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':') + 1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':') + 1:]
            writer.writerow([
                count, t['postedTime'], status_id,
                t['actor']['preferredUsername'], user_id, body,
                t['retweetCount'], t['favoritesCount'], 'X', 'X', 'X'
            ])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
    return HttpResponse(json.dumps(response_data),
                        content_type="application/json")
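The id fields handled above are URI-style activity-stream ids, and the rfind(':') slicing keeps only the numeric part after the last colon. A minimal, self-contained sketch of that step (the sample id below is illustrative, not real data):

# Illustrative activity-stream id; the rfind-based parse keeps the trailing numeric part.
status_id = "tag:search.twitter.com,2005:123456789012345678"
numeric_id = status_id[status_id.rfind(':') + 1:]  # everything after the last colon
print(numeric_id)  # -> 123456789012345678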
Example #9
def search():

    keyword = request.form['keyword']
    wordList = re.sub(r"[^\w]", " ",
                      keyword).split()  # will strip punctuation later
    tw = Tweets()

    for i in wordList:
        tw.clean(i)

    #return to the template

    return render_template("index.html")
Example #10
 def initialize(self):
     self.emit('hello', {'msg':'alright!'})
     session = self.environ.get('beaker.session')
     if not session.has_key('access_token'):
         self.emit('failed_stream')
         return
     access_token = session['access_token']
     self.tweets = Tweets(consumer_token, access_token)
Example #11
    def archive(self):
        # check and create directory
        if not path.exists(self.media_dir):
            mkdir(self.media_dir)

        if User(self.username, self.api).archive():
            Tweets(self.username, self.api).archive()
            Media(self.username, self.api).archive()
Example #12
 def run(self):
     logging.info("analyzer started")
     tweets = Tweets()
     while True:
         self.find_new_zh_user(tweets)
         self.find_active_zh_user(tweets)
         logging.info("sleep a while")
         time.sleep(30)
Example #13
    def create_rtvt_aio_plot(self, tweets, retweets):

        tweet_coords = Tweets.tweets_per_minute(tweets)
        retweet_coords = Retweets.retweets_per_minute(retweets)

        tweet_coords.sort(key=lambda x: int(x[1]), reverse=True)
        retweet_coords.sort(key=lambda x: int(x[1]), reverse=True)

        return Graphs.rtwt_vs_twt_24h(retweet_coords, tweet_coords, __class__)
Example #14
 def collectTweets(self, output_dir="./", count=3200):
     """
     Collects up to the last 3200 tweets of every user in user_ids.
     """
     for user_id in self.user_ids:
         user_path = os.path.join(output_dir, user_id)
         if os.path.isfile(user_path):
             # skip if tweets were already collected for this user
             continue
         tweets = Tweets(user_path, 'a+')
         i = 0
         max_id = 0
         keep_try = True
         while keep_try:
             try:
                 r = self.getUserStream(user_id, max_id=max_id)
                 if not r.get_iterator().results:
                     keep_try = False
                 for item in r.get_iterator():
                     if 'error' in item.keys() and item['error'] == 'Not authorized.':
                         cursor = 0
                         break
                     if 'message' in item.keys():
                         remaining = r.get_rest_quota()['remaining']
                         if not remaining:
                             sleep_min = 5
                             sleep_sec = sleep_min*60
                             self.__msg_wait(sleep_sec)
                             break
                         else:
                             sys.stderr.write(str(item) + "\n")
                     elif 'errors' in item.keys():
                         continue
                     else:
                         max_id = item['id'] - 1
                         tweets.append(item)
                         i += 1
                         if count and i >= count:
                             keep_try = False
                             break
             except Exception, e:
                 if item:
                     sys.stderr.write(str(item) + "\n")
                 raise e
Example #15
class TweetsNamespace(BaseNamespace, BroadcastMixin):

    # this will allow to broadcast events triggered from outside gevent-socketio
    # (e.g. when getting a message from the iOS app)
    __all__ = set()

    def __init__(self, *args, **kwargs):
        super(TweetsNamespace, self).__init__(*args, **kwargs)
        self.__class__.__all__.add(self)

    @classmethod
    def broadcast(cls, msg, data):
        for socket in cls.__all__:
            socket.emit(msg, data)

    def initialize(self):
        self.emit('hello', {'msg': 'alright!'})
        session = self.environ.get('beaker.session')
        if not session.has_key('access_token'):
            self.emit('failed_stream')
            return
        access_token = session['access_token']
        self.tweets = Tweets(consumer_token, access_token)

    def on_start_stream(self, data):
        logging.info(pprint.pformat(data))
        self.spawn(self.tweets.startStream, [t.strip() for t in data], [],
                   self.tweet_callback)

    def on_stop_stream(self):
        logging.info("stopping stream ...")
        self.tweets.stopStream()
        self.kill_local_jobs()

    def tweet_callback(self, tweet):
        self.emit('new_tweet', tweet)

    # this is here just as a reminder on how to spawn "jobs" in gevent-socketio
    def job_send_heart_beat(self):
        cnt = 0
        while True:
            self.emit('heart_beat', cnt)
            cnt += 1
            sleep(5)  # this is actually gevent.sleep (must be!)
Example #16
class TweetsNamespace(BaseNamespace, BroadcastMixin):

    # this will allow to broadcast events triggered from outside gevent-socketio
    # (e.g. when getting a message from the iOS app)
    __all__ = set()
    def __init__(self, *args, **kwargs):
        super(TweetsNamespace, self).__init__(*args, **kwargs)
        self.__class__.__all__.add(self)

    @classmethod
    def broadcast(cls, msg, data):
        for socket in cls.__all__:
            socket.emit(msg, data)


    def initialize(self):
        self.emit('hello', {'msg':'alright!'})
        session = self.environ.get('beaker.session')
        if not session.has_key('access_token'):
            self.emit('failed_stream')
            return
        access_token = session['access_token']
        self.tweets = Tweets(consumer_token, access_token)

    def on_start_stream(self, data):
        logging.info(pprint.pformat(data))
        self.spawn(self.tweets.startStream, [t.strip() for t in data], [], self.tweet_callback)

    def on_stop_stream(self):
        logging.info("stopping stream ...")
        self.tweets.stopStream()
        self.kill_local_jobs()

    def tweet_callback(self, tweet):
        self.emit('new_tweet', tweet)

    # this is here just as a reminder on how to spawn "jobs" in gevent-socketio
    def job_send_heart_beat(self):
        cnt = 0
        while True:
            self.emit('heart_beat', cnt)
            cnt += 1
            sleep(5)  # this is actually gevent.sleep (must be!)
Example #17
    def run(self):
        tweets = Tweets()
        while True:
            if not self.queue.size():
                logging.warning("queue is empty")
                time.sleep(CRAWLER_COLDDOWN_TIME)
                continue
            user_id = self.queue.pop()
            logging.info("fetching user %s." % user_id)
            """
            print (self.name, user_id)
            pull tweet, user's follower & friends
            push em to db.
            """
            self._push_to_db(tweets.get_user_timeline(user_id, count=50),
                             "tweets")
            self._push_to_db(tweets.get_user_list(user_id), "users")
            self._push_to_db(tweets.get_user_list(user_id,
                                                  url=TWITTER_FRIENDS_LIST),
                             "users")

            time.sleep(CRAWLER_COLDDOWN_TIME)
            self.queue.put(user_id)
Example #18
def main():

    t0 = time.time()

    folder = 'text/'

    tweets = Tweets.load_from_folder(folder)

    print "Tweets loaded {0}s".format(time.time() - t0)

    duration = 60
    results = []
    
    runs = [
        "nhl", "any", "nba", "nfl"
    ]
    
    for run in runs:
        t0 = time.time()
        for day in range(7,28):
            for hour in range(0,24):
                for minute in [0,15,30,45]:
                    end = datetime.datetime(2014, 11, day, hour=hour, minute=minute)
                    start = end - datetime.timedelta(seconds=60 * duration)

                    data = tweets.get_collection(start, end, run if run != 'any' else None)

                    if len(data) == 0:
                        break

                    result_date = start.strftime('%Y-%m-%d %H:%M') + " - " + end.strftime('%Y-%m-%d %H:%M')
                    result_clusters = cluster(data,5, [])
                    
                    results.append({"date": result_date, "clusters": result_clusters})

                    #dot()
                    print end, len(data)
        print

        with open("viz/" + run + "_15_60.json", "w") as f:
            json.dump(results, f)

        print run + ' done, ', time.time() - t0, 'seconds'
Example #19
def main():

    t0 = time.time()

    folder = 'text/'

    tweets = Tweets.load_from_folder(folder)

    print "Tweets loaded"

    duration = 30
    results = []

    for day in range(7,28):
        for hour in range(0,24):
            for minute in [0,15,30,45]:
                end = datetime.datetime(2014, 11, day, hour=hour, minute=minute)
                start = end - datetime.timedelta(seconds=60 * duration)

                data = tweets.get_collection(start, end)

                if len(data) == 0:
                    break

                result_date = start.strftime('%Y-%m-%d %H:%M') + " - " + end.strftime('%Y-%m-%d %H:%M')
                result_clusters = cluster(data,5, [])
                
                results.append({"date": result_date, "clusters": result_clusters})

                #dot()
                print end, len(data)
    print

    with open("viz/any_15_30.json", "w") as f:
        json.dump(results, f)

    print 'done, ', time.time() - t0, 'seconds'
Example #20
#!/usr/bin/env python

# import modules & set up logging
import gensim
import logging
from tweets import Tweets

# enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# set up tweet iterator
tweet_dirs = [
    '../data/labeled_data/positive/', '../data/labeled_data/negative/'
]
tweets = Tweets(tweet_dirs)  # iterator that returns preprocessed tweets

# train word2vec on the tweets
model = gensim.models.Word2Vec(tweets, iter=10, min_count=5, size=100)

# save word2vec model
model.save('./word2vec_model')
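As a follow-up, a minimal sketch (assuming the model file saved above and the standard gensim API) of reloading the trained model and querying it for similar terms; 'happy' is only a placeholder query word and must exist in the training vocabulary:

import gensim

# reload the saved word2vec model and ask for the five words most similar to a query term
model = gensim.models.Word2Vec.load('./word2vec_model')
print(model.wv.most_similar('happy', topn=5))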
Example #21
    def create_tweet_cloud(self, tweets):

        word_string = Tweets.tweeted_words(tweets)

        return Graphs.tweeted_word_cloud(word_string, __class__)
Example #22
    def user_data_box(self, tweets, retweets, statuses):

        tweepy_data = User.get_user_data_tweepy(self)

        account_created = tweepy_data[0]
        total_followers = tweepy_data[1]
        total_friends = tweepy_data[2]
        total_statuses = tweepy_data[3]
        geo_status = tweepy_data[4]
        screen_name = tweepy_data[5]

        # num of user's tweets in db
        total_on_record = len(statuses)

        # percentage of user's total tweets in db
        coverage = round(((len(statuses) / total_statuses) * 100), 2)
        coverage = str(coverage) + '%'

        # Users top 10 most mentioned users
        user_mentions = list(Mentions.users_mentioned(tweets)[1])
        user_mentions = sorted(user_mentions,
                               key=lambda x: int(x[1]),
                               reverse=True)
        user_mentions = user_mentions[:10]

        # top 10 hashtags used
        user_hashtags = Hashtags.get_user_hashtags(tweets)[0]
        top_hash = list(Hashtags.count_hashtags(user_hashtags))
        top_hash = sorted(top_hash, key=lambda x: int(x[1]), reverse=True)
        top_hash = top_hash[:10]

        # Users top 10 most retweeted users
        users_retweeted = Retweets.get_retweeted_users(retweets)
        users_retweeted = users_retweeted[0]

        count_retweeted = list(
            set([(x, users_retweeted.count(x)) for x in users_retweeted]))
        fav_retweeted = sorted(count_retweeted,
                               key=lambda x: int(x[1]),
                               reverse=True)
        fav_retweeted = fav_retweeted[:10]

        first_on_record = statuses[0]
        most_recent_on_record = statuses[-1]

        # favourite tweet time
        fav_time = Tweets.tweets_per_minute(statuses)
        fav_time = sorted(fav_time, key=lambda x: int(x[1]), reverse=True)
        fav_time = fav_time[0]

        # day with most statuses sent
        busy_day = Tweets.tweets_per_date(statuses)
        busy_day = sorted(busy_day, key=lambda x: int(x[1]), reverse=True)
        busy_day = busy_day[0]

        # most used medium to tweet
        sources = [i[4] for i in statuses]
        fav_source = Sources.counted_sources(sources)
        fav_source = sorted(fav_source, key=lambda x: int(x[1]), reverse=True)
        fav_source = fav_source[0]

        return (screen_name, account_created, total_statuses, total_on_record,
                total_followers, total_friends, geo_status, first_on_record,
                most_recent_on_record, coverage, top_hash, fav_retweeted,
                user_mentions, fav_time, busy_day, fav_source)
Example #23
 def process_update(self, update, *args, **kwargs):
  update = Tweets.process_update(self, update)
  update.sort(key=lambda a: calendar.timegm(rfc822.parsedate(a['created_at'])))
  return update
Example #24
def update_tweets():
    t = Tweets()
    t.update()
    t.process()
Example #25
import http.server
import socketserver
from http import HTTPStatus
from urllib.parse import urlparse, parse_qs
from urllib import parse
from tweets import Tweets
import json
import os
import files
import time

from http.server import HTTPServer
from socketserver import ThreadingMixIn

tweet = Tweets()


# Test file for concurrency and the server's sleep + timeout handling
class Handler(http.server.SimpleHTTPRequestHandler):
    def do_OPTIONS(self):
        self.send_response(200, "ok")

        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
        self.send_header("Access-Control-Allow-Headers", "X-Requested-With")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")

        self.end_headers()

    def do_GET(self):
        self.my_params = parse_qs(urlparse(self.path).query)
Example #26
def main(arguments):

    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="800000")

    args = parser.parse_args(arguments)

    # Create Tweets Iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Save situations to lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter]
    update()

    update("Loading negative tweets...")
    neg_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter]
    update()

    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets, n_samples=sample_count,
                              replace=False, random_state=1)
    neg_tweets = resample(neg_tweets, n_samples=sample_count,
                              replace=False, random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]

    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)

    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    update()
    print()

    # Build Pipeline
    print("Performing grid search...")
    pipeline = Pipeline([('vect', CountVectorizer()),
                         #('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])

    parameters = { #TODO check which parameters actually affect use in sarcasm detection
            'vect__tokenizer': [tokenizer],
            'vect__stop_words': [None],
            'vect__binary': [False],
            'vect__ngram_range': [(1,5)], 
            #'tfidf__norm': [None, 'l1', 'l2'],
            #'tfidf__use_idf': [True, False],
            #'tfidf__smooth_idf': [True, False],
            #'tfidf__sublinear_tf': [True, False],
            'clf__alpha': [1.0], # check range, these are guesses
            'clf__fit_prior': [False], # not sure what the distribution in sarcasm data is
    }

    clf_pipe = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    clf_pipe.fit(tweets[train], labels[train])
    print("Done in %0.3fs" % (time() - t0))
    print()

    # Print grid search results
    print("Best score: %0.3f" % clf_pipe.best_score_)
    print("Best parameters set:")
    best_parameters = clf_pipe.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print()

    # Evaluate classifier
    vect  = clf_pipe.best_estimator_.named_steps['vect']
    #tfidf = clf_pipe.best_estimator_.named_steps['tfidf']
    clf   = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])

    print("Classifier Evaluation:")
    print(metrics.classification_report(labels[test], predicted,
                                        target_names=["-", "+"]))

    # save classifier
    pickle.dump(clf_pipe, open(MODEL_FNAME, 'wb'))
Example #27
    def get_friends_tweets():

        return Tweets.get_all_tweets()
Example #28
from tweets import Tweets
from db import DB

tweets = Tweets()
tweets.getUserID()

# test run to clean tweets of 3 users
# tweets.textClean(tweets.userIDs[0:3])

# clean the tweets of all the users
# NOTE
#   Remember to check the 'breakpoint' setting in config.py
# to ensure the operation is a continue clean process on previous one
# or an overwrite clean process all over again
tweets.textClean()
Example #29
def main(arguments):

    # enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        filename=LOG_FNAME,
                        level=logging.INFO)

    # parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-s',
                        '--sarcastic-tweets',
                        dest='sar_dir',
                        help="Directory of example sarcastic tweets",
                        default="../data/labeled_data/sarcastic/")
    parser.add_argument('-p',
                        '--positive-tweets',
                        dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n',
                        '--negative-tweets',
                        dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument(
        '-c',
        '--sample-count',
        dest='sample_count',
        help="Max number of samples of each class",
        default="10000")  # 10k default, ~300k max with current data

    args = parser.parse_args(arguments)

    # set random seed
    np.random.seed(RAND_SEED)

    # create tweets iterators
    log_print("Creating tweet iterators...")
    sar_tweets_iter = Tweets([args.sar_dir])
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    log_print()

    # load tweets with gold labels filtered to lists and shuffle
    log_print("Loading sarcastic tweets with gold labels filtered...")
    sar_tweets = [Tweets.filter_tags(tweet) for tweet in sar_tweets_iter]
    log_print("...loaded {} sarcastic tweets".format(len(sar_tweets)))

    log_print("Loading non-sarcastic tweets...")
    pos_tweets = [Tweets.filter_tags(tweet)
                  for tweet in pos_tweets_iter]  # filter gold label hashtags
    log_print("...loaded {} positive tweets...".format(len(pos_tweets)))
    neg_tweets = [Tweets.filter_tags(tweet) for tweet in neg_tweets_iter]
    log_print("...loaded {} negative tweets".format(len(neg_tweets)))

    log_print(
        "Selecting balanced sample sets of {} tweets per class...".format(
            args.sample_count))
    sample_count = int(args.sample_count)
    sar_tweets = resample(sar_tweets,
                          n_samples=sample_count,
                          replace=False,
                          random_state=1)
    pos_tweets = resample(pos_tweets,
                          n_samples=sample_count // 2,
                          replace=False,
                          random_state=2)
    neg_tweets = resample(neg_tweets,
                          n_samples=sample_count // 2,
                          replace=False,
                          random_state=3)
    non_tweets = pos_tweets + neg_tweets
    log_print()

    # shuffle tweets and split into training, dev, and test
    log_print("Shuffle all tweets...")
    sar_labels = [1 for _ in sar_tweets]
    non_labels = [0 for _ in non_tweets]

    tweets = np.append(sar_tweets, non_tweets)
    labels = np.append(sar_labels, non_labels)

    tweets, labels = shuffle(tweets, labels, random_state=4)
    log_print()

    # write to output file
    log_print("write to files as training, dev, and test sets...")
    output_gen = (n for n in zip(tweets, labels)
                  )  # generator of (tweet, label) tuples
    with open(OUTFNAME_FORMAT.format("test"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("dev"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("train"), "w+") as f:
        for tweet, label in output_gen:
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))

    log_print(
        "...training, dev, and test sets written to files {}, {}, and {}".
        format(OUTFNAME_FORMAT.format("train"), OUTFNAME_FORMAT.format("dev"),
               OUTFNAME_FORMAT.format("test")))
Example #30
    def user_tweets(self):

        all_tweets = Tweets.get_all_tweets()
        all_tweets = [x for x in all_tweets if x[5] == self.user]

        return all_tweets
Example #31
	def _classifyUser_onethread(self, forbid, auto_hash, requested, sporty,
				    classifiers, users_dir, uids, label_names,
				    probability, i, stdout_lock, raw=False):
		def print_results(t):
			stdout_lock.acquire()
			uid = t[0]
			scores = t[1]
			if raw:
				print json.dumps(scores)
			else:
				print "%s,%s" % (uid, ",".join(map(str,scores)))
			sys.stdout.flush()
			stdout_lock.release()

		while True:
			uid = uids.get()
			if uid is None:
				logger.debug("%d - Exiting" % i)
				return
			logger.debug("%d - Processing %s" % (i, uid))
			utweets = Tweets(os.path.join(users_dir, str(uid)))

			# removing sport tracker tweets
			no_sport_utweets = utweets.filter_on_hashtags(forbid, 'remove')
			if no_sport_utweets.size() < utweets.size(): 
				# some sporty tweets have been removed
				if not sporty:  # user is not supposed to be exercising
					logger.info("no_sport user %s is exercising" % uid)
					continue

			# removing tweets generated by well-known apps
			filtered_utweets = no_sport_utweets.filter_on_hashtags(auto_hash, 'remove')
			score_denom = filtered_utweets.size()

			poms_tweets = filtered_utweets.tolist()
			if requested:
				poms_tweets = filtered_utweets.filter_on_text(requested, 'keep').tolist()

			if not poms_tweets:
				logger.info("no tweets for %s" % uid)
				continue
			else:
				user = poms_tweets[0]['user']
				if user['lang'] != 'en':
					logger.info("user %s lang is not en" % uid)
					continue
			X = self.buildX(poms_tweets, predict=True)
			
			preds = []
			for label in label_names:
				# raw predicted probability scores
				if raw:
					preds.append(classifiers[label].predict_proba(X).tolist())
				# classification
				else:
					y_pred_proba = classifiers[label].predict_proba(X)[:, 1]
					pred = map(lambda x: 0 if x < probability else 1,
							   y_pred_proba)
					ones = float(np.count_nonzero(pred))
					score = ones/score_denom
					preds.append(score)
			print_results((uid,preds))
Example #32
def main(pos_dir, neg_dir, sar_dir, random_seed):
    np.random.seed(random_seed)

    # Create tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([pos_dir])
    neg_tweets_iter = Tweets([neg_dir])
    sar_tweets_iter = Tweets([sar_dir])
    update()

    # Save situations to lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter
    ]
    pos_tweets = shuffle(pos_tweets)
    update()

    update("Loading negative tweets...")
    neg_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter
    ]
    neg_tweets = shuffle(neg_tweets)
    update()

    update("Loading sarcastic tweets...")
    sar_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in sar_tweets_iter
    ]
    sar_tweets = shuffle(sar_tweets)
    update()

    # Save sarcasm data
    update("Saving sarcasm data...")
    count = len(sar_tweets)
    print("len pos_tweets before take = {}".format(len(pos_tweets)))
    non_sar_tweets = take(pos_tweets, count // 2) + take(
        neg_tweets, count // 2)
    print("len pos_tweets after take = {}".format(len(pos_tweets)))
    sar_labels = [1 for _ in sar_tweets]
    non_sar_labels = [0 for _ in non_sar_tweets]

    sarcasm_data = np.append(sar_tweets, non_sar_tweets)
    sarcasm_labels = np.append(sar_labels, non_sar_labels)

    sarcasm_data, sarcasm_labels = shuffle(sarcasm_data, sarcasm_labels)

    size = len(sarcasm_data)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)

    sarcasm_dump = {
        "train": (sarcasm_data[train], sarcasm_labels[train]),
        "dev": (sarcasm_data[dev], sarcasm_labels[dev]),
        "test": (sarcasm_data[test], sarcasm_labels[test])
    }

    pickle.dump(sarcasm_dump,
                open(os.path.join(SPLIT_DATA_DIR, "sarcasm.pkl"), 'wb'))
    update()

    # Save sentiment data
    update("Saving sentiment data...")
    count = min(len(pos_tweets), len(neg_tweets))
    pos_tweets = pos_tweets[:count]
    neg_tweets = neg_tweets[:count]
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]

    sentiment_data = np.append(pos_tweets, neg_tweets)
    sentiment_labels = np.append(pos_labels, neg_labels)

    sentiment_data, sentiment_labels = shuffle(sentiment_data,
                                               sentiment_labels)

    size = len(sentiment_data)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)

    sentiment_dump = {
        "train": (sentiment_data[train], sentiment_labels[train]),
        "dev": (sentiment_data[dev], sentiment_labels[dev]),
        "test": (sentiment_data[test], sentiment_labels[test])
    }

    pickle.dump(sentiment_dump,
                open(os.path.join(SPLIT_DATA_DIR, "sentiment.pkl"), 'wb'))
    update()
Example #33
 def process_update(self, update, *args, **kwargs):
     update = Tweets.process_update(self, update)
     update.sort(
         key=lambda a: calendar.timegm(rfc822.parsedate(a['created_at'])))
     return update
Example #34
def query_tweets(request):
    """
    Returns tweet query
    """
    query_count = 10000  # int(request.GET.get("embedCount", TWEET_QUERY_COUNT))
    export = request.GET.get("export", None)
    query = request.GET.get("query", "")
    tweets = Tweets(query=query, query_count=query_count, request=request)

    response_data = {}

    if export == "ta":
        output = StringIO.StringIO()
        for t in tweets.get_data():
            user_id = t['actor']['id']
            output.write(user_id + '\n')
        ton_request = ton.TwitterTon(
            twitter_consumer_key=settings.SOCIAL_AUTH_TWITTER_KEY,
            twitter_consumer_secret=settings.SOCIAL_AUTH_TWITTER_SECRET,
            access_token=settings.TWITTER_ACCESS_TOKEN,
            access_token_secret=settings.TWITTER_ACCESS_TOKEN_SECRET)
        bytes = output.getvalue()
        ton_response = ton_request.upload_data(
            payload=bytes.encode('utf-16be'))
        output.close()
        location = ton_response['location']
        response = HttpResponse(json.dumps(
            {"location": location, "query": query}), content_type="application/json")
        return response

    elif export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response)
        writer.writerow(['count',
                         'time',
                         'id',
                         'user_screen_name',
                         'user_id',
                         'status',
                         'retweet_count',
                         'favorite_count',
                         'is_retweet',
                         'in_reply_to_tweet_id',
                         'in_reply_to_screen_name'])
        count = 0
        for t in tweets.get_data():
            count = count + 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':') + 1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':') + 1:]
            writer.writerow([count,
                             t['postedTime'],
                             status_id,
                             t['actor']['preferredUsername'],
                             user_id,
                             body,
                             t['retweetCount'],
                             t['favoritesCount'],
                             'X',
                             'X',
                             'X'])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
        response = HttpResponse(
            json.dumps(response_data),
            content_type="application/json")
        response['Cache-Control'] = 'max-age=%d' % MAX_AGE
        return response
Example #35
def update_tweets(ticker):
    return Tweets(ticker)
Example #36
 def __init__(self, class1, class2):
     # store the sets of tweets making up each bit of the training set
     self.class1 = Tweets(class1)
     self.class2 = Tweets(class2)
Example #37
    def create_aio_plot(tweet_list):

        graph_coords = Tweets.tweets_per_minute(tweet_list)
        graph_coords.sort(key=lambda x: int(x[1]), reverse=True)

        return Graphs.all_in_one(graph_coords, __class__)
Example #38
def main(arguments):

    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p',
                        '--positive-tweets',
                        dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../../data/labeled_data/positive/")
    parser.add_argument('-n',
                        '--negative-tweets',
                        dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../../data/labeled_data/negative/")
    parser.add_argument('-c',
                        '--sample-count',
                        dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="10")

    args = parser.parse_args(arguments)

    # Create Tweets Iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Save situations to lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [' '.join(tweet) for tweet in pos_tweets_iter]
    update()

    update("Loading negative tweets...")
    neg_tweets = [' '.join(tweet) for tweet in neg_tweets_iter]
    update()

    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets,
                          n_samples=sample_count,
                          replace=False,
                          random_state=1)
    neg_tweets = resample(neg_tweets,
                          n_samples=sample_count,
                          replace=False,
                          random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]

    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)

    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    update()
    print()
    clf_pipe = pickle.load(open(MODEL_FNAME, 'rb'))

    # Evaluate classifier
    vect = clf_pipe.best_estimator_.named_steps['vect']
    clf = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])

    print("Classifier Evaluation:")
    print(
        metrics.classification_report(labels[test],
                                      predicted,
                                      target_names=["-", "+"]))
Example #39
from tweets import Tweets

tweet = Tweets()

print("Nombre de tweets dans tout le dataframe: ", tweet.get_number_tweets())
print('Tweets pourcentage per country: ', tweet.get_number_tweets_countries())
print("Dictionnaire pour la France: ",
      tweet.get_number_tweets_country("France", ""))