def main():
    # csv = pd.read_csv('bernie_tweets.csv')
    # csv.replace(r'"', '', inplace=True, regex=True)
    # print(csv)
    tweets = Tweets(pd.read_csv('bernie_tweets2.csv', sep='^')).clean()
    tweets.to_csv('cleaned_bernie_tweets.csv', index=False)
def query_tweets(request):
    """ Returns tweet query """
    request_timeframe = Timeframe(start=request.REQUEST.get("start", None),
                                  end=request.REQUEST.get("end", None),
                                  interval=request.REQUEST.get("interval", "hour"))
    query_count = int(request.REQUEST.get("embedCount", TWEET_QUERY_COUNT))
    export = request.REQUEST.get("export", None)
    query = request.REQUEST.get("query", "")
    tweets = Tweets(query=query, query_count=query_count,
                    start=request_timeframe.start, end=request_timeframe.end,
                    export=export)
    response_data = {}
    if export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response)
        writer.writerow(['count', 'time', 'id', 'user_screen_name', 'user_id',
                         'status', 'retweet_count', 'favorite_count', 'is_retweet',
                         'in_reply_to_tweet_id', 'in_reply_to_screen_name'])
        count = 0
        for t in tweets.get_data():
            count += 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':') + 1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':') + 1:]
            writer.writerow([count, t['postedTime'], status_id,
                             t['actor']['preferredUsername'], user_id, body,
                             t['retweetCount'], t['favoritesCount'], 'X', 'X', 'X'])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
        return HttpResponse(json.dumps(response_data), content_type="application/json")
class ChiFeatureSelector:

    def __init__(self, class1, class2):
        # store the sets of tweets making up each bit of the training set
        self.class1 = Tweets(class1)
        self.class2 = Tweets(class2)

    def getScores(self):
        # chi-squared scores
        scores = {}
        # loop over the words in the extraction corpus
        # TODO: determine how to include things like retweet count
        for term in self.class1.getTerms():
            # build the 2x2 chi-squared contingency table:
            # n11 / n10: documents in class1 / class2 containing the term,
            # n01 / n00: documents in class1 / class2 not containing it
            n11 = float(self.class1.getTermCount(term))
            n10 = float(self.class2.getTermCount(term))
            n01 = float(self.class1.getDocCount() - n11)
            n00 = float(self.class2.getDocCount() - n10)
            # perform the chi-squared calculation and store
            # the score in the dictionary
            total = n11 + n10 + n01 + n00
            top = ((n11 * n00) - (n10 * n01)) ** 2
            bottom = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
            chi = (total * top) / bottom
            scores[term] = chi
        # note: to print the scores
        # for (v, k) in scores:
        #     print(str(k) + " : " + str(v))
        return scores
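A quick numeric check of the chi-squared formula above, with made-up counts standing in for the Tweets corpus interface (all values here are illustrative):

# Hypothetical counts for one term: it appears in 40 of 100 class1 docs
# and 10 of 100 class2 docs.
n11, n10 = 40.0, 10.0
n01, n00 = 100.0 - n11, 100.0 - n10
total = n11 + n10 + n01 + n00
top = ((n11 * n00) - (n10 * n01)) ** 2
bottom = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
print(total * top / bottom)  # 24.0 -- a strongly class-discriminating term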
def posting_most_similar_words_on_Twitter():
    # create the instances
    tw = Tweets()
    maam = MorphologicalAnalysisAndModeling()
    # read the search word, which also serves as the reference word for the
    # similarity ranking
    search_word = input("Enter a search word >>> ")
    # search Twitter for search_word; collect 200 tweets into tweet_list
    tweet_list = tw.get_tweet_by_search(search_word=search_word, count=200)
    # run MeCab morphological analysis on the 200 tweets and store the
    # output in the list `results`
    results = maam.mecab(tweet_list)
    # build a Word2vec model from the morphological analysis results
    model = maam.word2vec(results)
    # collect the words most similar to search_word and format them as a string
    words = ''
    for i in model.wv.most_similar(positive=[search_word]):
        words += str(i)[1:-20]
    words = words.replace("'", "")
    text = 'Words close to {}: {}'.format(search_word, words)
    # post the string to Twitter
    tw.posting_on_twitter(text=text)
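The str(i)[1:-20] slice above is a brittle way to extract words from gensim's (word, score) tuples; a minimal alternative, assuming the same model object, is to unpack the tuples directly:

# model.wv.most_similar returns a list of (word, similarity) tuples;
# unpack them instead of slicing their string representation.
words = ', '.join(word for word, _score in model.wv.most_similar(positive=[search_word]))
text = 'Words close to {}: {}'.format(search_word, words)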
def initialize(self):
    self.emit('hello', {'msg': 'alright!'})
    session = self.environ.get('beaker.session')
    if 'access_token' not in session:
        self.emit('failed_stream')
        return
    access_token = session['access_token']
    self.tweets = Tweets(consumer_token, access_token)
def main():
    tweets = Tweets()
    tweets.authentication()
    # tweets.getAllTweetsBySearch("Microsoft", 200)
    stockdata = StockData()
    # print(stockdata.getHistoricalDataByID('MSFT', "2018-01-01", "2018-02-02"))
    model = Model(tweets)
    model.getInputData('MSFT')
def get_text(file_path):
    consumer_key = ""
    consumer_secret = ""
    access_key = ""
    access_secret = ""
    twitter = Tweets(consumer_key, consumer_secret, access_key, access_secret)
    hashtags = set()
    handles = []
    with open(file_path) as f:
        header = True
        for line in f:
            if header:
                header = False
                continue
            data = line.split(",")
            handles.append(data[:2])
    count = 1
    with open("../resources/tweets.csv", 'a') as f:
        for handle in handles:
            f.write(handle[0])
            f.write(",")
            f.write(handle[1])
            f.write(",")
            f.write(str(count))
            f.write("\n")
            tweets = twitter.get_all_tweets(handle[0])
            f.write(str(len(tweets)))
            f.write("\n")
            for tweet in tweets:
                if hasattr(tweet, 'retweeted_status'):
                    text = tweet.retweeted_status.full_text
                else:
                    text = tweet.full_text
                text = re.sub(r"http\S+", "", text.replace('\n', ' '))
                text = text.replace(',', '')
                text = text.strip()
                if text:
                    f.write(text)
                else:
                    f.write("empty")
                f.write("\n")
            print(f"Got tweets for {count}")
            count += 1
def query_tweets(request):
    """ Returns tweet query """
    request_timeframe = Timeframe(start=request.GET.get("start", None),
                                  end=request.GET.get("end", None),
                                  interval=request.GET.get("interval", "hour"))
    query_count = int(request.GET.get("embedCount", TWEET_QUERY_COUNT))
    export = request.GET.get("export", None)
    query = request.GET.get("query", "")
    try:
        tweets = Tweets(query=query, query_count=query_count,
                        start=request_timeframe.start, end=request_timeframe.end,
                        export=export)
    except GNIPQueryError as e:
        return handleQueryError(e)
    response_data = {}
    if export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response, delimiter=',', quotechar="'",
                            quoting=csv.QUOTE_ALL)
        writer.writerow(['count', 'time', 'id', 'user_screen_name', 'user_id',
                         'status', 'retweet_count', 'favorite_count', 'is_retweet',
                         'in_reply_to_tweet_id', 'in_reply_to_screen_name'])
        count = 0
        for t in tweets.get_data():
            count += 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':') + 1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':') + 1:]
            writer.writerow([count, t['postedTime'], status_id,
                             t['actor']['preferredUsername'], user_id, body,
                             t['retweetCount'], t['favoritesCount'], 'X', 'X', 'X'])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
        return HttpResponse(json.dumps(response_data), content_type="application/json")
def search():
    keyword = request.form['keyword']
    wordList = re.sub(r"[^\w]", " ", keyword).split()  # will strip punctuation later
    tw = Tweets()
    for i in wordList:
        tw.clean(i)
    # return to the template
    return render_template("index.html")
def archive(self):
    # create the media directory if it does not exist yet
    if not path.exists(self.media_dir):
        mkdir(self.media_dir)
    if User(self.username, self.api).archive():
        Tweets(self.username, self.api).archive()
        Media(self.username, self.api).archive()
def run(self):
    logging.info("analyzer started")
    tweets = Tweets()
    while True:
        self.find_new_zh_user(tweets)
        self.find_active_zh_user(tweets)
        logging.info("sleep a while")
        time.sleep(30)
def create_rtvt_aio_plot(self, tweets, retweets):
    tweet_coords = Tweets.tweets_per_minute(tweets)
    retweet_coords = Retweets.retweets_per_minute(retweets)
    tweet_coords.sort(key=lambda x: int(x[1]), reverse=True)
    retweet_coords.sort(key=lambda x: int(x[1]), reverse=True)
    return Graphs.rtwt_vs_twt_24h(retweet_coords, tweet_coords, __class__)
def collectTweets(self, output_dir="./", count=3200):
    """ Fetches the last 3200 tweets of every user in user_ids and
    appends them to one file per user. """
    for user_id in self.user_ids:
        user_path = os.path.join(output_dir, user_id)
        if os.path.isfile(user_path):
            # tweets were already collected for this user
            continue
        tweets = Tweets(user_path, 'a+')
        i = 0
        max_id = 0
        keep_try = True
        item = None
        while keep_try:
            try:
                r = self.getUserStream(user_id, max_id=max_id)
                if not r.get_iterator().results:
                    keep_try = False
                for item in r.get_iterator():
                    if 'error' in item and item['error'] == 'Not authorized.':
                        break
                    if 'message' in item:
                        remaining = r.get_rest_quota()['remaining']
                        if not remaining:
                            # rate limited: wait before retrying
                            sleep_min = 5
                            sleep_sec = sleep_min * 60
                            self.__msg_wait(sleep_sec)
                            break
                        else:
                            sys.stderr.write(str(item) + "\n")
                    elif 'errors' in item:
                        continue
                    else:
                        # page backwards through the timeline
                        max_id = item['id'] - 1
                        tweets.append(item)
                        i += 1
                        if count and i >= count:
                            keep_try = False
                            break
            except Exception as e:
                if item:
                    sys.stderr.write(str(item) + "\n")
                raise e
class TweetsNamespace(BaseNamespace, BroadcastMixin):
    # this allows broadcasting events triggered from outside gevent-socketio
    # (e.g. when getting a message from the iOS app)
    __all__ = set()

    def __init__(self, *args, **kwargs):
        super(TweetsNamespace, self).__init__(*args, **kwargs)
        self.__class__.__all__.add(self)

    @classmethod
    def broadcast(cls, msg, data):
        for socket in cls.__all__:
            socket.emit(msg, data)

    def initialize(self):
        self.emit('hello', {'msg': 'alright!'})
        session = self.environ.get('beaker.session')
        if 'access_token' not in session:
            self.emit('failed_stream')
            return
        access_token = session['access_token']
        self.tweets = Tweets(consumer_token, access_token)

    def on_start_stream(self, data):
        logging.info(pprint.pformat(data))
        self.spawn(self.tweets.startStream,
                   [t.strip() for t in data], [], self.tweet_callback)

    def on_stop_stream(self):
        logging.info("stopping stream ...")
        self.tweets.stopStream()
        self.kill_local_jobs()

    def tweet_callback(self, tweet):
        self.emit('new_tweet', tweet)

    # this is here just as a reminder of how to spawn "jobs" in gevent-socketio
    def job_send_heart_beat(self):
        cnt = 0
        while True:
            self.emit('heart_beat', cnt)
            cnt += 1
            sleep(5)  # this is actually gevent.sleep (must be!)
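A sketch of how the broadcast pattern above can be used: because every live socket registers itself in the class-level __all__ set, code outside the handlers can push an event to all connected clients (the event name and payload here are illustrative):

# e.g. from a background job or an external message handler
TweetsNamespace.broadcast('server_notice', {'msg': 'stream restarting'})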
def run(self):
    tweets = Tweets()
    while True:
        if not self.queue.size():
            logging.warning("queue is empty")
            time.sleep(CRAWLER_COLDDOWN_TIME)
            continue
        user_id = self.queue.pop()
        logging.info("fetching user %s." % user_id)
        # print(self.name, user_id)
        # pull the user's tweets, followers, and friends, and push them to the db
        self._push_to_db(tweets.get_user_timeline(user_id, count=50), "tweets")
        self._push_to_db(tweets.get_user_list(user_id), "users")
        self._push_to_db(tweets.get_user_list(user_id, url=TWITTER_FRIENDS_LIST), "users")
        time.sleep(CRAWLER_COLDDOWN_TIME)
        self.queue.put(user_id)
def main():
    t0 = time.time()
    folder = 'text/'
    tweets = Tweets.load_from_folder(folder)
    print("Tweets loaded in {0}s".format(time.time() - t0))
    duration = 60
    runs = ["nhl", "any", "nba", "nfl"]
    for run in runs:
        t0 = time.time()
        results = []  # reset per run so files do not accumulate earlier runs
        for day in range(7, 28):
            for hour in range(0, 24):
                for minute in [0, 15, 30, 45]:
                    end = datetime.datetime(2014, 11, day, hour=hour, minute=minute)
                    start = end - datetime.timedelta(seconds=60 * duration)
                    data = tweets.get_collection(start, end, run if run != 'any' else None)
                    if len(data) == 0:
                        break
                    result_date = (start.strftime('%Y-%m-%d %H:%M') + " - " +
                                   end.strftime('%Y-%m-%d %H:%M'))
                    result_clusters = cluster(data, 5, [])
                    results.append({"date": result_date, "clusters": result_clusters})
                    # dot()
                    print(end, len(data))
        print()
        with open("viz/" + run + "_15_60.json", "w") as f:
            json.dump(results, f)
        print(run + ' done,', time.time() - t0, 'seconds')
def main():
    t0 = time.time()
    folder = 'text/'
    tweets = Tweets.load_from_folder(folder)
    print("Tweets loaded")
    duration = 30
    results = []
    for day in range(7, 28):
        for hour in range(0, 24):
            for minute in [0, 15, 30, 45]:
                end = datetime.datetime(2014, 11, day, hour=hour, minute=minute)
                start = end - datetime.timedelta(seconds=60 * duration)
                data = tweets.get_collection(start, end)
                if len(data) == 0:
                    break
                result_date = (start.strftime('%Y-%m-%d %H:%M') + " - " +
                               end.strftime('%Y-%m-%d %H:%M'))
                result_clusters = cluster(data, 5, [])
                results.append({"date": result_date, "clusters": result_clusters})
                # dot()
                print(end, len(data))
    print()
    with open("viz/any_15_30.json", "w") as f:
        json.dump(results, f)
    print('done,', time.time() - t0, 'seconds')
#!/usr/bin/env python
# import modules & set up logging
import gensim
import logging
from tweets import Tweets

# enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# set up tweet iterator
tweet_dirs = ['../data/labeled_data/positive/',
              '../data/labeled_data/negative/']
tweets = Tweets(tweet_dirs)  # iterator that returns preprocessed tweets

# train word2vec on the tweets
model = gensim.models.Word2Vec(tweets, iter=10, min_count=5, size=100)

# save word2vec model
model.save('./word2vec_model')
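A short follow-up sketch of querying the saved model later; the query word is illustrative, and the gensim 3.x API matching the training call above is assumed:

import gensim

# reload the trained model and inspect an example word's nearest neighbours
model = gensim.models.Word2Vec.load('./word2vec_model')
print(model.wv.most_similar('happy', topn=5))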
def create_tweet_cloud(self, tweets):
    word_string = Tweets.tweeted_words(tweets)
    return Graphs.tweeted_word_cloud(word_string, __class__)
def user_data_box(self, tweets, retweets, statuses):
    tweepy_data = User.get_user_data_tweepy(self)
    account_created = tweepy_data[0]
    total_followers = tweepy_data[1]
    total_friends = tweepy_data[2]
    total_statuses = tweepy_data[3]
    geo_status = tweepy_data[4]
    screen_name = tweepy_data[5]
    # number of the user's tweets in the db
    total_on_record = len(statuses)
    # percentage of the user's total tweets that are in the db
    coverage = round(((len(statuses) / total_statuses) * 100), 2)
    coverage = str(coverage) + '%'
    # the user's top 10 most mentioned users
    user_mentions = list(Mentions.users_mentioned(tweets)[1])
    user_mentions = sorted(user_mentions, key=lambda x: int(x[1]), reverse=True)
    user_mentions = user_mentions[:10]
    # top 10 hashtags used
    user_hashtags = Hashtags.get_user_hashtags(tweets)[0]
    top_hash = list(Hashtags.count_hashtags(user_hashtags))
    top_hash = sorted(top_hash, key=lambda x: int(x[1]), reverse=True)
    top_hash = top_hash[:10]
    # the user's top 10 most retweeted users
    users_retweeted = Retweets.get_retweeted_users(retweets)
    users_retweeted = users_retweeted[0]
    count_retweeted = list(set([(x, users_retweeted.count(x)) for x in users_retweeted]))
    fav_retweeted = sorted(count_retweeted, key=lambda x: int(x[1]), reverse=True)
    fav_retweeted = fav_retweeted[:10]
    first_on_record = statuses[0]
    most_recent_on_record = statuses[-1]
    # favourite time to tweet
    fav_time = Tweets.tweets_per_minute(statuses)
    fav_time = sorted(fav_time, key=lambda x: int(x[1]), reverse=True)
    fav_time = fav_time[0]
    # day with the most statuses sent
    busy_day = Tweets.tweets_per_date(statuses)
    busy_day = sorted(busy_day, key=lambda x: int(x[1]), reverse=True)
    busy_day = busy_day[0]
    # most used medium to tweet from
    sources = [i[4] for i in statuses]
    fav_source = Sources.counted_sources(sources)
    fav_source = sorted(fav_source, key=lambda x: int(x[1]), reverse=True)
    fav_source = fav_source[0]
    return (screen_name, account_created, total_statuses, total_on_record,
            total_followers, total_friends, geo_status, first_on_record,
            most_recent_on_record, coverage, top_hash, fav_retweeted,
            user_mentions, fav_time, busy_day, fav_source)
def process_update(self, update, *args, **kwargs):
    update = Tweets.process_update(self, update)
    # email.utils.parsedate replaces the rfc822 module removed in Python 3
    update.sort(key=lambda a: calendar.timegm(email.utils.parsedate(a['created_at'])))
    return update
def update_tweets():
    t = Tweets()
    t.update()
    t.process()
import http.server
import socketserver
from http import HTTPStatus
from urllib.parse import urlparse, parse_qs
from urllib import parse
from tweets import Tweets
import json
import os
import files
import time
from http.server import HTTPServer
from socketserver import ThreadingMixIn

tweet = Tweets()


# Test file for concurrency and the server's sleep + timeout behaviour
class Handler(http.server.SimpleHTTPRequestHandler):

    def do_OPTIONS(self):
        self.send_response(200, "ok")
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
        self.send_header("Access-Control-Allow-Headers", "X-Requested-With")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self):
        self.my_params = parse_qs(urlparse(self.path).query)
def main(arguments):
    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="800000")
    args = parser.parse_args(arguments)

    # Create Tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Load tweets into lists
    update("Loading positive tweets...")
    pos_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter]
    update()
    update("Loading negative tweets...")
    neg_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter]
    update()

    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets, n_samples=sample_count, replace=False, random_state=1)
    neg_tweets = resample(neg_tweets, n_samples=sample_count, replace=False, random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)
    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)  # test starts where dev ends
    update()
    print()

    # Build pipeline
    print("Performing grid search...")
    pipeline = Pipeline([('vect', CountVectorizer()),
                         # ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
    parameters = {
        # TODO: check which parameters actually matter for sarcasm detection
        'vect__tokenizer': [tokenizer],
        'vect__stop_words': [None],
        'vect__binary': [False],
        'vect__ngram_range': [(1, 5)],
        # 'tfidf__norm': [None, 'l1', 'l2'],
        # 'tfidf__use_idf': [True, False],
        # 'tfidf__smooth_idf': [True, False],
        # 'tfidf__sublinear_tf': [True, False],
        'clf__alpha': [1.0],  # check range, these are guesses
        'clf__fit_prior': [False],  # not sure what the distribution in sarcasm data is
    }
    clf_pipe = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    clf_pipe.fit(tweets[train], labels[train])
    print("Done in %0.3fs" % (time() - t0))
    print()

    # Print grid search results
    print("Best score: %0.3f" % clf_pipe.best_score_)
    print("Best parameters set:")
    best_parameters = clf_pipe.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print()

    # Evaluate classifier
    vect = clf_pipe.best_estimator_.named_steps['vect']
    # tfidf = clf_pipe.best_estimator_.named_steps['tfidf']
    clf = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])
    print("Classifier Evaluation:")
    print(metrics.classification_report(labels[test], predicted, target_names=["-", "+"]))

    # save classifier
    pickle.dump(clf_pipe, open(MODEL_FNAME, 'wb'))
def get_friends_tweets():
    return Tweets.get_all_tweets()
from tweets import Tweets
from db import DB

tweets = Tweets()
tweets.getUserID()

# test run to clean the tweets of 3 users
# tweets.textClean(tweets.userIDs[0:3])

# clean the tweets of all the users
# NOTE: check the 'breakpoint' setting in config.py to control whether this
# continues a previous cleaning run or starts over and overwrites it
tweets.textClean()
def main(arguments):
    # enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        filename=LOG_FNAME, level=logging.INFO)

    # parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-s', '--sarcastic-tweets', dest='sar_dir',
                        help="Directory of example sarcastic tweets",
                        default="../data/labeled_data/sarcastic/")
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each class",
                        default="10000")  # 10k default, ~300k max with current data
    args = parser.parse_args(arguments)

    # set random seed
    np.random.seed(RAND_SEED)

    # create tweets iterators
    log_print("Creating tweet iterators...")
    sar_tweets_iter = Tweets([args.sar_dir])
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    log_print()

    # load tweets into lists with gold-label hashtags filtered out
    log_print("Loading sarcastic tweets with gold labels filtered...")
    sar_tweets = [Tweets.filter_tags(tweet) for tweet in sar_tweets_iter]
    log_print("...loaded {} sarcastic tweets".format(len(sar_tweets)))
    log_print("Loading non-sarcastic tweets...")
    pos_tweets = [Tweets.filter_tags(tweet) for tweet in pos_tweets_iter]
    log_print("...loaded {} positive tweets...".format(len(pos_tweets)))
    neg_tweets = [Tweets.filter_tags(tweet) for tweet in neg_tweets_iter]
    log_print("...loaded {} negative tweets".format(len(neg_tweets)))

    log_print("Selecting balanced sample sets of {} tweets per class...".format(
        args.sample_count))
    sample_count = int(args.sample_count)
    sar_tweets = resample(sar_tweets, n_samples=sample_count, replace=False, random_state=1)
    pos_tweets = resample(pos_tweets, n_samples=sample_count // 2, replace=False, random_state=2)
    neg_tweets = resample(neg_tweets, n_samples=sample_count // 2, replace=False, random_state=3)
    non_tweets = pos_tweets + neg_tweets
    log_print()

    # shuffle tweets
    log_print("Shuffle all tweets...")
    sar_labels = [1 for _ in sar_tweets]
    non_labels = [0 for _ in non_tweets]
    tweets = np.append(sar_tweets, non_tweets)
    labels = np.append(sar_labels, non_labels)
    tweets, labels = shuffle(tweets, labels, random_state=4)
    log_print()

    # write to output files; successive islice calls drain the same
    # generator, so the three loops partition the shuffled data
    log_print("write to files as training, dev, and test sets...")
    output_gen = (n for n in zip(tweets, labels))  # generator of (tweet, label) tuples
    with open(OUTFNAME_FORMAT.format("test"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("dev"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("train"), "w+") as f:
        for tweet, label in output_gen:
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    log_print("...training, dev, and test sets written to files {}, {}, and {}".format(
        OUTFNAME_FORMAT.format("train"), OUTFNAME_FORMAT.format("dev"),
        OUTFNAME_FORMAT.format("test")))
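The islice-based split at the end is easy to misread, so here is a tiny self-contained demonstration of the idea: successive slices drain the same generator, so the three loops above partition the stream (values here are illustrative):

import itertools

gen = (n for n in range(10))
test = list(itertools.islice(gen, 3))   # [0, 1, 2]
dev = list(itertools.islice(gen, 3))    # [3, 4, 5]
train = list(gen)                       # [6, 7, 8, 9]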
def user_tweets(self):
    all_tweets = Tweets.get_all_tweets()
    all_tweets = [x for x in all_tweets if x[5] == self.user]
    return all_tweets
def _classifyUser_onethread(self, forbid, auto_hash, requested, sporty,
                            classifiers, users_dir, uids, label_names,
                            probability, i, stdout_lock, raw=False):

    def print_results(t):
        stdout_lock.acquire()
        uid = t[0]
        scores = t[1]
        if raw:
            print(json.dumps(scores))
        else:
            print("%s,%s" % (uid, ",".join(map(str, scores))))
        sys.stdout.flush()
        stdout_lock.release()

    while True:
        uid = uids.get()
        if uid is None:
            logger.debug("%d - Exiting" % i)
            return
        logger.debug("%d - Processing %s" % (i, uid))
        utweets = Tweets(os.path.join(users_dir, str(uid)))
        # remove sport-tracker tweets
        no_sport_utweets = utweets.filter_on_hashtags(forbid, 'remove')
        if no_sport_utweets.size() < utweets.size():
            # some sporty tweets have been removed
            if not sporty:
                # user is not supposed to be exercising
                logger.info("no_sport user %s is exercising" % uid)
                continue
        # remove tweets generated by well-known apps
        filtered_utweets = no_sport_utweets.filter_on_hashtags(auto_hash, 'remove')
        score_denom = filtered_utweets.size()
        poms_tweets = filtered_utweets.tolist()
        if requested:
            poms_tweets = filtered_utweets.filter_on_text(requested, 'keep').tolist()
        if not poms_tweets:
            logger.info("no tweets for %s" % uid)
            continue
        else:
            user = poms_tweets[0]['user']
            if user['lang'] != 'en':
                logger.info("user %s lang is not en" % uid)
                continue
        X = self.buildX(poms_tweets, predict=True)
        preds = []
        for label in label_names:
            if raw:
                # raw predicted probability scores
                preds.append(classifiers[label].predict_proba(X).tolist())
            else:
                # classification: fraction of tweets above the probability threshold
                y_pred_proba = classifiers[label].predict_proba(X)[:, 1]
                pred = [0 if x < probability else 1 for x in y_pred_proba]
                ones = float(np.count_nonzero(pred))
                score = ones / score_denom
                preds.append(score)
        print_results((uid, preds))
def main(pos_dir, neg_dir, sar_dir, random_seed):
    np.random.seed(random_seed)

    # Create tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([pos_dir])
    neg_tweets_iter = Tweets([neg_dir])
    sar_tweets_iter = Tweets([sar_dir])
    update()

    # Load tweets into lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter]
    pos_tweets = shuffle(pos_tweets)
    update()
    update("Loading negative tweets...")
    neg_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter]
    neg_tweets = shuffle(neg_tweets)
    update()
    update("Loading sarcastic tweets...")
    sar_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in sar_tweets_iter]
    sar_tweets = shuffle(sar_tweets)
    update()

    # Save sarcasm data
    update("Saving sarcasm data...")
    count = len(sar_tweets)
    print("len pos_tweets before take = {}".format(len(pos_tweets)))
    non_sar_tweets = take(pos_tweets, count // 2) + take(neg_tweets, count // 2)
    print("len pos_tweets after take = {}".format(len(pos_tweets)))
    sar_labels = [1 for _ in sar_tweets]
    non_sar_labels = [0 for _ in non_sar_tweets]
    sarcasm_data = np.append(sar_tweets, non_sar_tweets)
    sarcasm_labels = np.append(sar_labels, non_sar_labels)
    sarcasm_data, sarcasm_labels = shuffle(sarcasm_data, sarcasm_labels)
    size = len(sarcasm_data)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)  # test starts where dev ends
    sarcasm_dump = {
        "train": (sarcasm_data[train], sarcasm_labels[train]),
        "dev": (sarcasm_data[dev], sarcasm_labels[dev]),
        "test": (sarcasm_data[test], sarcasm_labels[test])
    }
    pickle.dump(sarcasm_dump, open(os.path.join(SPLIT_DATA_DIR, "sarcasm.pkl"), 'wb'))
    update()

    # Save sentiment data
    update("Saving sentiment data...")
    count = min(len(pos_tweets), len(neg_tweets))
    pos_tweets = pos_tweets[:count]
    neg_tweets = neg_tweets[:count]
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    sentiment_data = np.append(pos_tweets, neg_tweets)
    sentiment_labels = np.append(pos_labels, neg_labels)
    sentiment_data, sentiment_labels = shuffle(sentiment_data, sentiment_labels)
    size = len(sentiment_data)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)  # test starts where dev ends
    sentiment_dump = {
        "train": (sentiment_data[train], sentiment_labels[train]),
        "dev": (sentiment_data[dev], sentiment_labels[dev]),
        "test": (sentiment_data[test], sentiment_labels[test])
    }
    pickle.dump(sentiment_dump, open(os.path.join(SPLIT_DATA_DIR, "sentiment.pkl"), 'wb'))
    update()
def query_tweets(request):
    """ Returns tweet query """
    query_count = 10000  # int(request.GET.get("embedCount", TWEET_QUERY_COUNT))
    export = request.GET.get("export", None)
    query = request.GET.get("query", "")
    tweets = Tweets(query=query, query_count=query_count, request=request)
    response_data = {}
    if export == "ta":
        output = io.StringIO()  # io.StringIO replaces Python 2's StringIO.StringIO
        for t in tweets.get_data():
            user_id = t['actor']['id']
            output.write(user_id + '\n')
        ton_request = ton.TwitterTon(
            twitter_consumer_key=settings.SOCIAL_AUTH_TWITTER_KEY,
            twitter_consumer_secret=settings.SOCIAL_AUTH_TWITTER_SECRET,
            access_token=settings.TWITTER_ACCESS_TOKEN,
            access_token_secret=settings.TWITTER_ACCESS_TOKEN_SECRET)
        data = output.getvalue()
        ton_response = ton_request.upload_data(payload=data.encode('utf-16be'))
        output.close()
        location = ton_response['location']
        response = HttpResponse(json.dumps({"location": location, "query": query}),
                                content_type="application/json")
        return response
    elif export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response)
        writer.writerow(['count', 'time', 'id', 'user_screen_name', 'user_id',
                         'status', 'retweet_count', 'favorite_count', 'is_retweet',
                         'in_reply_to_tweet_id', 'in_reply_to_screen_name'])
        count = 0
        for t in tweets.get_data():
            count += 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':') + 1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':') + 1:]
            writer.writerow([count, t['postedTime'], status_id,
                             t['actor']['preferredUsername'], user_id, body,
                             t['retweetCount'], t['favoritesCount'], 'X', 'X', 'X'])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
        response = HttpResponse(json.dumps(response_data), content_type="application/json")
        response['Cache-Control'] = 'max-age=%d' % MAX_AGE
        return response
def update_tweets(ticker):
    return Tweets(ticker)
def __init__(self, class1, class2):
    # store the sets of tweets making up each bit of the training set
    self.class1 = Tweets(class1)
    self.class2 = Tweets(class2)
def create_aio_plot(tweet_list):
    graph_coords = Tweets.tweets_per_minute(tweet_list)
    graph_coords.sort(key=lambda x: int(x[1]), reverse=True)
    return Graphs.all_in_one(graph_coords, __class__)
def main(arguments):
    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="10")
    args = parser.parse_args(arguments)

    # Create Tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Load tweets into lists
    update("Loading positive tweets...")
    pos_tweets = [' '.join(tweet) for tweet in pos_tweets_iter]
    update()
    update("Loading negative tweets...")
    neg_tweets = [' '.join(tweet) for tweet in neg_tweets_iter]
    update()

    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets, n_samples=sample_count, replace=False, random_state=1)
    neg_tweets = resample(neg_tweets, n_samples=sample_count, replace=False, random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)
    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)  # test starts where dev ends
    update()
    print()

    # Load and evaluate classifier
    clf_pipe = pickle.load(open(MODEL_FNAME, 'rb'))
    vect = clf_pipe.best_estimator_.named_steps['vect']
    clf = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])
    print("Classifier Evaluation:")
    print(metrics.classification_report(labels[test], predicted, target_names=["-", "+"]))
from tweets import Tweets

tweet = Tweets()
print("Number of tweets in the whole dataframe:", tweet.get_number_tweets())
print("Tweet percentage per country:", tweet.get_number_tweets_countries())
print("Dictionary for France:", tweet.get_number_tweets_country("France", ""))