def main():
    # csv = pd.read_csv('bernie_tweets.csv')
    # csv.replace(r'"', '', inplace=True, regex=True)
    # print(csv)
    tweets = Tweets(pd.read_csv('bernie_tweets2.csv', sep='^')).clean()
    tweets.to_csv('cleaned_bernie_tweets.csv', index=False)
def posting_most_similar_words_on_Twitter():
    # Create instances
    tw = Tweets()
    maam = MorphologicalAnalysisAndModeling()
    # Prompt for the search word, which also serves as the reference word for similarity
    search_word = input("Enter a search word >>> ")
    # Search Twitter for the search word and collect 200 tweets into tweet_list.
    tweet_list = tw.get_tweet_by_search(search_word=search_word, count=200)
    # Run morphological analysis on the 200 tweets with MeCab and store the output in `results`.
    results = maam.mecab(tweet_list)
    # Build a Word2vec model from the morphological analysis results.
    model = maam.word2vec(results)
    # Collect words similar to the search word and format them into a single string.
    words = ''
    for i in model.wv.most_similar(positive=[search_word]):
        words += str(i)[1:-20]  # crude slice of the tuple repr to keep only the word
    words = words.replace("'", "")
    text = 'Words similar to {}: {}'.format(search_word, words)
    # Post the string to Twitter.
    tw.posting_on_twitter(text=text)
def initialize(self):
    self.emit('hello', {'msg': 'alright!'})
    session = self.environ.get('beaker.session')
    if 'access_token' not in session:
        self.emit('failed_stream')
        return
    access_token = session['access_token']
    self.tweets = Tweets(consumer_token, access_token)
def archive(self):
    # Check for the media directory and create it if needed
    if not path.exists(self.media_dir):
        mkdir(self.media_dir)
    if User(self.username, self.api).archive():
        Tweets(self.username, self.api).archive()
        Media(self.username, self.api).archive()
def main():
    tweets = Tweets()
    tweets.authentication()
    #tweets.getAllTweetsBySearch("Microsoft", 200)
    stockdata = StockData()
    #print(stockdata.getHistoricalDataByID('MSFT', "2018-01-01", "2018-02-02"))
    model = Model(tweets)
    model.getInputData('MSFT')
def run(self):
    logging.info("analyzer started")
    tweets = Tweets()
    while True:
        self.find_new_zh_user(tweets)
        self.find_active_zh_user(tweets)
        logging.info("sleep a while")
        time.sleep(30)
def get_text(file_path):
    consumer_key = ""
    consumer_secret = ""
    access_key = ""
    access_secret = ""
    twitter = Tweets(consumer_key, consumer_secret, access_key, access_secret)
    hashtags = set()
    handles = []
    with open(file_path) as f:
        header = True
        for line in f:
            if header:
                header = False
                continue
            data = line.split(",")
            handles.append(data[:2])
    count = 1
    with open("../resources/tweets.csv", 'a') as f:
        for handle in handles:
            f.write(handle[0])
            f.write(",")
            f.write(handle[1])
            f.write(",")
            f.write(str(count))
            f.write("\n")
            tweets = twitter.get_all_tweets(handle[0])
            f.write(str(len(tweets)))
            f.write("\n")
            for tweet in tweets:
                if hasattr(tweet, 'retweeted_status'):
                    text = tweet.retweeted_status.full_text
                else:
                    text = tweet.full_text
                text = re.sub(r"http\S+", "", text.replace('\n', ' '))
                text = text.replace(',', '')
                text = text.strip()
                if text:
                    f.write(text)
                else:
                    f.write("empty")
                f.write("\n")
            print(f"Got tweets for {count}")
            count += 1
def query_tweets(request):
    """ Returns tweet query """
    request_timeframe = Timeframe(start=request.GET.get("start", None),
                                  end=request.GET.get("end", None),
                                  interval=request.GET.get("interval", "hour"))
    query_count = int(request.GET.get("embedCount", TWEET_QUERY_COUNT))
    export = request.GET.get("export", None)
    query = request.GET.get("query", "")
    try:
        tweets = Tweets(query=query, query_count=query_count,
                        start=request_timeframe.start,
                        end=request_timeframe.end, export=export)
    except GNIPQueryError as e:
        return handleQueryError(e)
    response_data = {}
    if export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response, delimiter=',', quotechar="'", quoting=csv.QUOTE_ALL)
        writer.writerow(['count', 'time', 'id', 'user_screen_name', 'user_id',
                         'status', 'retweet_count', 'favorite_count', 'is_retweet',
                         'in_reply_to_tweet_id', 'in_reply_to_screen_name'])
        count = 0
        for t in tweets.get_data():
            count = count + 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':') + 1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':') + 1:]
            writer.writerow([count, t['postedTime'], status_id,
                             t['actor']['preferredUsername'], user_id, body,
                             t['retweetCount'], t['favoritesCount'], 'X', 'X', 'X'])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
        return HttpResponse(json.dumps(response_data), content_type="application/json")
def search():
    keyword = request.form['keyword']
    # Split on non-word characters; punctuation will be stripped later
    wordList = re.sub(r"[^\w]", " ", keyword).split()
    tw = Tweets()
    for i in wordList:
        tw.clean(i)
    # Return to the template
    return render_template("index.html")
def run(self):
    tweets = Tweets()
    while True:
        if not self.queue.size():
            logging.warning("queue is empty")
            time.sleep(CRAWLER_COLDDOWN_TIME)
            continue
        user_id = self.queue.pop()
        logging.info("fetching user %s." % user_id)
        # print(self.name, user_id)
        # Pull the user's tweets, followers, and friends, then push them to the DB.
        self._push_to_db(tweets.get_user_timeline(user_id, count=50), "tweets")
        self._push_to_db(tweets.get_user_list(user_id), "users")
        self._push_to_db(tweets.get_user_list(user_id, url=TWITTER_FRIENDS_LIST), "users")
        time.sleep(CRAWLER_COLDDOWN_TIME)
        self.queue.put(user_id)
def query_tweets(request):
    """ Returns tweet query """
    query_count = 10000  # int(request.GET.get("embedCount", TWEET_QUERY_COUNT))
    export = request.GET.get("export", None)
    query = request.GET.get("query", "")
    tweets = Tweets(query=query, query_count=query_count, request=request)
    response_data = {}
    if export == "ta":
        output = StringIO.StringIO()
        for t in tweets.get_data():
            user_id = t['actor']['id']
            output.write(user_id + '\n')
        ton_request = ton.TwitterTon(
            twitter_consumer_key=settings.SOCIAL_AUTH_TWITTER_KEY,
            twitter_consumer_secret=settings.SOCIAL_AUTH_TWITTER_SECRET,
            access_token=settings.TWITTER_ACCESS_TOKEN,
            access_token_secret=settings.TWITTER_ACCESS_TOKEN_SECRET)
        bytes = output.getvalue()
        ton_response = ton_request.upload_data(payload=bytes.encode('utf-16be'))
        output.close()
        location = ton_response['location']
        response = HttpResponse(json.dumps({"location": location, "query": query}),
                                content_type="application/json")
        return response
    elif export == "csv":
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="export.csv"'
        writer = csv.writer(response)
        writer.writerow(['count', 'time', 'id', 'user_screen_name', 'user_id',
                         'status', 'retweet_count', 'favorite_count', 'is_retweet',
                         'in_reply_to_tweet_id', 'in_reply_to_screen_name'])
        count = 0
        for t in tweets.get_data():
            count = count + 1
            body = t['body'].encode('ascii', 'replace')
            status_id = t['id']
            status_id = status_id[status_id.rfind(':') + 1:]
            user_id = t['actor']['id']
            user_id = user_id[user_id.rfind(':') + 1:]
            writer.writerow([count, t['postedTime'], status_id,
                             t['actor']['preferredUsername'], user_id, body,
                             t['retweetCount'], t['favoritesCount'], 'X', 'X', 'X'])
        return response
    else:
        response_data['tweets'] = tweets.get_data()
        response = HttpResponse(json.dumps(response_data),
                                content_type="application/json")
        response['Cache-Control'] = 'max-age=%d' % MAX_AGE
        return response
def update_tweets():
    t = Tweets()
    t.update()
    t.process()
import http.server
import socketserver
from http import HTTPStatus
from urllib.parse import urlparse, parse_qs
from urllib import parse
from tweets import Tweets
import json
import os
import files
import time
from http.server import HTTPServer
from socketserver import ThreadingMixIn

tweet = Tweets()

# Test file for concurrency and the server's sleep + timeout

class Handler(http.server.SimpleHTTPRequestHandler):
    def do_OPTIONS(self):
        self.send_response(200, "ok")
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
        self.send_header("Access-Control-Allow-Headers", "X-Requested-With")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self):
        self.my_params = parse_qs(urlparse(self.path).query)
from tweets import Tweets
from db import DB

tweets = Tweets()
tweets.getUserID()

# Test run to clean the tweets of 3 users
# tweets.textClean(tweets.userIDs[0:3])

# Clean the tweets of all the users
# NOTE
# Remember to check the 'breakpoint' setting in config.py to control whether
# this continues a previous cleaning run or overwrites and cleans from scratch
tweets.textClean()
#!/usr/bin/env python
# import modules & set up logging
import gensim
import logging
from tweets import Tweets

# enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# set up tweet iterator
tweet_dirs = ['../data/labeled_data/positive/',
              '../data/labeled_data/negative/']
tweets = Tweets(tweet_dirs)  # iterator that returns preprocessed tweets

# train word2vec on the tweets
model = gensim.models.Word2Vec(tweets, iter=10, min_count=5, size=100)

# save word2vec model
model.save('./word2vec_model')
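# A minimal usage sketch, not part of the original script: it assumes the
# pre-4.0 gensim API used above and the './word2vec_model' path written by
# model.save(); the query word 'happy' is only an illustrative example.
import gensim

loaded_model = gensim.models.Word2Vec.load('./word2vec_model')
# Print the five vocabulary words the model places closest to the example token.
print(loaded_model.wv.most_similar(positive=['happy'], topn=5))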
def main(pos_dir, neg_dir, sar_dir, random_seed):
    np.random.seed(random_seed)

    # Create tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([pos_dir])
    neg_tweets_iter = Tweets([neg_dir])
    sar_tweets_iter = Tweets([sar_dir])
    update()

    # Save tweets to lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter]
    pos_tweets = shuffle(pos_tweets)
    update()
    update("Loading negative tweets...")
    neg_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter]
    neg_tweets = shuffle(neg_tweets)
    update()
    update("Loading sarcastic tweets...")
    sar_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in sar_tweets_iter]
    sar_tweets = shuffle(sar_tweets)
    update()

    # Save sarcasm data
    update("Saving sarcasm data...")
    count = len(sar_tweets)
    print("len pos_tweets before take = {}".format(len(pos_tweets)))
    non_sar_tweets = take(pos_tweets, count // 2) + take(neg_tweets, count // 2)
    print("len pos_tweets after take = {}".format(len(pos_tweets)))
    sar_labels = [1 for _ in sar_tweets]
    non_sar_labels = [0 for _ in non_sar_tweets]
    sarcasm_data = np.append(sar_tweets, non_sar_tweets)
    sarcasm_labels = np.append(sar_labels, non_sar_labels)
    sarcasm_data, sarcasm_labels = shuffle(sarcasm_data, sarcasm_labels)
    size = len(sarcasm_data)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    sarcasm_dump = {"train": (sarcasm_data[train], sarcasm_labels[train]),
                    "dev": (sarcasm_data[dev], sarcasm_labels[dev]),
                    "test": (sarcasm_data[test], sarcasm_labels[test])}
    pickle.dump(sarcasm_dump, open(os.path.join(SPLIT_DATA_DIR, "sarcasm.pkl"), 'wb'))
    update()

    # Save sentiment data
    update("Saving sentiment data...")
    count = min(len(pos_tweets), len(neg_tweets))
    pos_tweets = pos_tweets[:count]
    neg_tweets = neg_tweets[:count]
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    sentiment_data = np.append(pos_tweets, neg_tweets)
    sentiment_labels = np.append(pos_labels, neg_labels)
    sentiment_data, sentiment_labels = shuffle(sentiment_data, sentiment_labels)
    size = len(sentiment_data)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    sentiment_dump = {"train": (sentiment_data[train], sentiment_labels[train]),
                      "dev": (sentiment_data[dev], sentiment_labels[dev]),
                      "test": (sentiment_data[test], sentiment_labels[test])}
    pickle.dump(sentiment_dump, open(os.path.join(SPLIT_DATA_DIR, "sentiment.pkl"), 'wb'))
    update()
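# A minimal sketch, not part of the original module, showing how the pickled
# splits written above could be read back; it assumes the same SPLIT_DATA_DIR
# constant and the {"train": (data, labels), ...} layout built in main().
import os
import pickle

with open(os.path.join(SPLIT_DATA_DIR, "sarcasm.pkl"), 'rb') as f:
    sarcasm_splits = pickle.load(f)
train_data, train_labels = sarcasm_splits["train"]
dev_data, dev_labels = sarcasm_splits["dev"]
test_data, test_labels = sarcasm_splits["test"]
print(len(train_data), len(dev_data), len(test_data))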
def main(arguments):
    # enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        filename=LOG_FNAME, level=logging.INFO)

    # parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-s', '--sarcastic-tweets', dest='sar_dir',
                        help="Directory of example sarcastic tweets",
                        default="../data/labeled_data/sarcastic/")
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each class",
                        default="10000")  # 10k default, ~300k max with current data
    args = parser.parse_args(arguments)

    # set random seed
    np.random.seed(RAND_SEED)

    # create tweets iterators
    log_print("Creating tweet iterators...")
    sar_tweets_iter = Tweets([args.sar_dir])
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    log_print()

    # load tweets with gold labels filtered to lists and shuffle
    log_print("Loading sarcastic tweets with gold labels filtered...")
    sar_tweets = [Tweets.filter_tags(tweet) for tweet in sar_tweets_iter]
    log_print("...loaded {} sarcastic tweets".format(len(sar_tweets)))
    log_print("Loading non-sarcastic tweets...")
    pos_tweets = [Tweets.filter_tags(tweet) for tweet in pos_tweets_iter]  # filter gold label hashtags
    log_print("...loaded {} positive tweets...".format(len(pos_tweets)))
    neg_tweets = [Tweets.filter_tags(tweet) for tweet in neg_tweets_iter]
    log_print("...loaded {} negative tweets".format(len(neg_tweets)))
    log_print("Selecting balanced sample sets of {} tweets per class...".format(args.sample_count))
    sample_count = int(args.sample_count)
    sar_tweets = resample(sar_tweets, n_samples=sample_count, replace=False, random_state=1)
    pos_tweets = resample(pos_tweets, n_samples=sample_count // 2, replace=False, random_state=2)
    neg_tweets = resample(neg_tweets, n_samples=sample_count // 2, replace=False, random_state=3)
    non_tweets = pos_tweets + neg_tweets
    log_print()

    # shuffle tweets and split into training, dev, and test
    log_print("Shuffle all tweets...")
    sar_labels = [1 for _ in sar_tweets]
    non_labels = [0 for _ in non_tweets]
    tweets = np.append(sar_tweets, non_tweets)
    labels = np.append(sar_labels, non_labels)
    tweets, labels = shuffle(tweets, labels, random_state=4)
    log_print()

    # write to output file
    log_print("write to files as training, dev, and test sets...")
    output_gen = (n for n in zip(tweets, labels))  # generator of (tweet, label) tuples
    with open(OUTFNAME_FORMAT.format("test"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("dev"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("train"), "w+") as f:
        for tweet, label in output_gen:
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    log_print("...training, dev, and test sets written to files {}, {}, and {}".format(
        OUTFNAME_FORMAT.format("train"), OUTFNAME_FORMAT.format("dev"),
        OUTFNAME_FORMAT.format("test")))
def main(arguments):
    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="800000")
    args = parser.parse_args(arguments)

    # Create Tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Save tweets to lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter]
    update()
    update("Loading negative tweets...")
    neg_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter]
    update()
    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets, n_samples=sample_count, replace=False, random_state=1)
    neg_tweets = resample(neg_tweets, n_samples=sample_count, replace=False, random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)
    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    update()
    print()

    # Build pipeline and run grid search
    print("Performing grid search...")
    pipeline = Pipeline([('vect', CountVectorizer()),
                         #('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
    parameters = {
        # TODO check which parameters actually affect use in sarcasm detection
        'vect__tokenizer': [tokenizer],
        'vect__stop_words': [None],
        'vect__binary': [False],
        'vect__ngram_range': [(1, 5)],
        #'tfidf__norm': [None, 'l1', 'l2'],
        #'tfidf__use_idf': [True, False],
        #'tfidf__smooth_idf': [True, False],
        #'tfidf__sublinear_tf': [True, False],
        'clf__alpha': [1.0],  # check range, these are guesses
        'clf__fit_prior': [False],  # not sure what the distribution in sarcasm data is
    }
    clf_pipe = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    clf_pipe.fit(tweets[train], labels[train])
    print("Done in %0.3fs" % (time() - t0))
    print()

    # Print grid search results
    print("Best score: %0.3f" % clf_pipe.best_score_)
    print("Best parameters set:")
    best_parameters = clf_pipe.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print()

    # Evaluate classifier
    vect = clf_pipe.best_estimator_.named_steps['vect']
    #tfidf = clf_pipe.best_estimator_.named_steps['tfidf']
    clf = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])
    print("Classifier Evaluation:")
    print(metrics.classification_report(labels[test], predicted, target_names=["-", "+"]))

    # Save classifier
    pickle.dump(clf_pipe, open(MODEL_FNAME, 'wb'))
def main(arguments):
    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="10")
    args = parser.parse_args(arguments)

    # Create Tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Save tweets to lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [' '.join(tweet) for tweet in pos_tweets_iter]
    update()
    update("Loading negative tweets...")
    neg_tweets = [' '.join(tweet) for tweet in neg_tweets_iter]
    update()
    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets, n_samples=sample_count, replace=False, random_state=1)
    neg_tweets = resample(neg_tweets, n_samples=sample_count, replace=False, random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)
    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    update()
    print()

    # Load the saved classifier and evaluate it on the held-out test split
    clf_pipe = pickle.load(open(MODEL_FNAME, 'rb'))
    vect = clf_pipe.best_estimator_.named_steps['vect']
    clf = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])
    print("Classifier Evaluation:")
    print(metrics.classification_report(labels[test], predicted, target_names=["-", "+"]))
def update_tweets(ticker):
    return Tweets(ticker)