Ejemplo n.º 1
0
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound
from secrets import *

# Keyword the Twitter streaming API should filter the firehose on.
TRACK = 'teaching'

# OAuth credentials come from the wildcard `secrets` import above.
twitter_auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                                   CONSUMER_KEY, CONSUMER_SECRET)

# Long-lived connection to the public streaming endpoint.
twitter_stream = twitter.TwitterStream(auth=twitter_auth)

# Effectively infinite iterator of live statuses matching TRACK.
statuses = twitter_stream.statuses.filter(track=TRACK)

for t in statuses:
    print(t['text'])
    try:
        u = db_session.query(User).filter_by(uid=str(t['user']['id'])).one()
    except NoResultFound:
        u = User(screen_name=t['user']['screen_name'], uid=t['user']['id'])
        db_session.add(u)
        db_session.commit()

    tw = Tweet(tweet=t['text'],
               tid=t['id'],
               user_id=u.id,
               created_at=t['created_at'],
               data=json.dumps(t))

    try:
        words = tw.tweet.split()
        for w in words:
            try:
Ejemplo n.º 2
0
#find out who is a bot, and who is not

from tweetsql.model import User
from tweetsql.database import db_session
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Drive the BotOrNot web form once per known user.
# Launch Firefox and load the form; the same input element and submit
# button are reused for every user.
driver = webdriver.Firefox()
driver.get("http://truthy.indiana.edu/botornot/")
elem = driver.find_element_by_class_name("form-control")

all_users = db_session.query(User).all()
button = driver.find_element_by_id('btnGetTimeline')

for user in all_users:
    # The form is fed plain ASCII; non-ASCII characters are dropped.
    screen_name = user.screen_name.encode('ascii', 'ignore')
    elem.send_keys(screen_name)
    button.click()
    # TODO(review): replace this fixed sleep with an explicit
    # WebDriverWait on the result element (the By/EC imports above
    # exist for exactly that).
    time.sleep(10)
    print('waiting some more? why...who knows?')
Ejemplo n.º 3
0
        g.node[n]['weight'] = 1
        g.node[n]['type'] = t
 
def graph_add_edge(n1, n2, g):
    """Link n1 and n2 in graph g, counting repeats in 'weight'.

    A brand-new edge starts at weight 1; calling again for the same
    pair bumps the weight, so the attribute tallies co-occurrences.
    """
    if not g.has_edge(n1, n2):
        g.add_edge(n1, n2)
        g[n1][n2]['weight'] = 0
    g[n1][n2]['weight'] += 1

# Build a graph linking each tweet to the hashtags it contains.
# This fragment was garbled: the loop's `for` header and the '#'
# comment markers were missing, leaving a syntax error; restored to
# match the intact duplicate of this script elsewhere in the file.
graph = nx.Graph()

# debug: number of users that have at least one Friend row
print(len(db_session.query(Friend.user_id, func.count(Friend.friend_id)).group_by(Friend.user_id).all()))

# iterate through every tweet, storing each tweet in t
for t in db_session.query(Tweet).all():
    # add t to the graph
    graph_add_node(t.tweet, graph, 'tweet')
    # now iterate through all the words in t, storing each word in w
    for w in t.words:
        # looking for hashtags (there's a better way to do this)
        if w.word[0] == '#':
            graph_add_node(w.word, graph, 'hashtag')
            graph_add_edge(t.tweet, w.word, graph)

# you probably want to change this string to something meaningful too
q = 'example'

# because it ends up in the file name for the GEXF output file
Ejemplo n.º 4
0
from tweetsql.database import db_session
import pickle
from collections import Counter


def pickleIt(data, fname):
    """Serialize *data* to the file *fname* using pickle.

    Uses a context manager so the handle is closed even when
    pickle.dump raises (the original leaked the handle on error).
    """
    with open(fname, 'wb') as output:
        pickle.dump(data, output)


# Build each pickle only if it does not already exist on disk.
# The original used bare `except:` (which also swallows things like
# KeyboardInterrupt) and leaked the second file handle.
try:
    f = open('user_tweet.p', 'rb')
    f.close()
except IOError:
    # (screen_name, tweet_id, tweet_text) rows for every tweet.
    tweets = db_session.query(User.screen_name, Tweet.id,
                              Tweet.tweet).join(Tweet.user).all()
    pickleIt(tweets, 'user_tweet.p')

try:
    f = open('user_hashtag.p', 'rb')
    f.close()
except IOError:
    # (screen_name, hashtag) rows linking users to hashtags.
    hashtags = db_session.query(User.screen_name,
                                Hashtag.hashtag).join(Hashtag.users).all()
    pickleIt(hashtags, 'user_hashtag.p')

try:
    f = open('all_user_friends.p', 'rb')
except:
    users = db_session.query(User.id, Tweet.id).join(Tweet.user).all()
    print 'users retrieved'
    tweet_counter = Counter()
Ejemplo n.º 5
0
#find out who is a bot, and who is not

from tweetsql.model import User
from tweetsql.database import db_session
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Drive the BotOrNot web form once per known user.
# Launch Firefox and load the form; the same input element and submit
# button are reused for every user.
driver = webdriver.Firefox()
driver.get("http://truthy.indiana.edu/botornot/")
elem = driver.find_element_by_class_name("form-control")

all_users = db_session.query(User).all()
button = driver.find_element_by_id('btnGetTimeline')

for user in all_users:
    # The form is fed plain ASCII; non-ASCII characters are dropped.
    screen_name = user.screen_name.encode('ascii', 'ignore')
    elem.send_keys(screen_name)
    button.click()
    # TODO(review): replace this fixed sleep with an explicit
    # WebDriverWait on the result element (the By/EC imports above
    # exist for exactly that).
    time.sleep(10)
    print('waiting some more? why...who knows?')
Ejemplo n.º 6
0
        g.node[n]['weight'] = 1
        g.node[n]['type'] = t


def graph_add_edge(n1, n2, g):
    """Record one co-occurrence of n1 and n2 as a weighted edge of g."""
    if g.has_edge(n1, n2):
        edge_attrs = g[n1][n2]
        edge_attrs['weight'] = edge_attrs['weight'] + 1
        return
    g.add_edge(n1, n2)
    g[n1][n2]['weight'] = 1


# Build a graph linking each tweet to the hashtags it contains,
# then export it in GEXF format for visualization (e.g. Gephi).
graph = nx.Graph()

# iterate through every tweet, storing each tweet in t
for t in db_session.query(Tweet).all():
    # add t to the graph
    graph_add_node(t.tweet, graph, 'tweet')
    # now iterate through all the words in t, storing each word in w
    for w in t.words:
        # looking for hashtags (there's a better way to do this)
        # NOTE(review): w.word[0] raises IndexError if a Word row ever
        # holds an empty string — presumably the tokenizer never emits one.
        if w.word[0] == '#':
            graph_add_node(w.word, graph, 'hashtag')
            graph_add_edge(t.tweet, w.word, graph)

# you probably want to change this string to something meaningful too
q = 'example'

# because it ends up in the file name for the GEXF output file
nx.write_gexf(graph, '{}_tweet_graph.gexf'.format(q))
print('{}_tweet_graph.gexf'.format(q))
Ejemplo n.º 7
0
#set up twitter api
clean_key = get_clean_key()
twitter_auth = twitter.oauth.OAuth(clean_key.OAUTH_TOKEN,
                                   clean_key.OAUTH_TOKEN_SECRET,
                                   clean_key.CONSUMER_KEY,
                                   clean_key.CONSUMER_SECRET)
api = twitter.Twitter(auth=twitter_auth)

# Bail out early if the rate-limit window has no calls left.
if get_rate_limit(t=api, data='remaining') == 0:
    print('rate limit hit')
    print('try again at %d' % get_rate_limit(t=api, data='reset'))
    sys.exit()

#let's get 15 screen names that are not in the Friend table
got_friends = db_session.query(distinct(
    Friend.user_id)).all()  #first get all users in the friend table
got_friends = [t[0] for t in got_friends]
dead_users = db_session.query(NoUser.user_id).all()
dead_users = [t[0] for t in dead_users]
all_users = db_session.query(User.id, User.uid).all()
# Set lookups are O(1); the original tested membership against lists,
# which made these comprehensions quadratic in the user count.
dead_set = set(dead_users)
friend_set = set(got_friends)
all_users = [(pk, uid) for pk, uid in all_users
             if pk not in dead_set]  #filter out dead users
no_friends = [(pk, uid) for pk, uid in all_users
              if pk not in friend_set]  #get rid of people with friends

# rate_limit = 15
# requests = 0
print(no_friends[:15])
for pk, uid in no_friends:
    cursor = -1
    friends = []
Ejemplo n.º 8
0
	for k in keys:
		twitter_auth = twitter.oauth.OAuth(k.OAUTH_TOKEN, k.OAUTH_TOKEN_SECRET, k.CONSUMER_KEY, k.CONSUMER_SECRET)
		api = twitter.Twitter(auth=twitter_auth)
		remaining = get_rate_limit(t=api)
		if remaining > maxRemaining:
			maxRemaining = remaining
			winner = k
	return winner

#set up twitter api
clean_key = get_clean_key()
twitter_auth = twitter.oauth.OAuth(clean_key.OAUTH_TOKEN,
                                   clean_key.OAUTH_TOKEN_SECRET,
                                   clean_key.CONSUMER_KEY,
                                   clean_key.CONSUMER_SECRET)
api = twitter.Twitter(auth=twitter_auth)

got_data = db_session.query(UserData.id).all()  #first get all users in the UseData table
got_data = [t[0] for t in got_data]

# all_users = db_session.query(User.id, User.uid).all()
all_users = db_session.query(Friend.id, Friend.friend_id).all()

# Set membership is O(1); scanning got_data as a list was quadratic.
got_data_set = set(got_data)
no_data = [(pk, uid) for pk, uid in all_users if pk not in got_data_set]
#group no_data by 100 for our twitter call
# BUG FIX: the original sliced no_data[i:i+99] with a step of 100,
# silently dropping every 100th record; a 100-wide chunk is i:i+100.
no_data_100s = [no_data[i:i + 100] for i in range(0, len(no_data), 100)]

if get_rate_limit(t=api) == 0:
    print('rate limit hit')
    print('try again at %d' % get_rate_limit(t=api, data='reset'))


for l in no_data_100s:
Ejemplo n.º 9
0
        g.add_node(n)
        g.node[n]['label'] = n
        g.node[n]['weight'] = 1
        g.node[n]['type'] = t
 
def graph_add_edge(n1, n2, g):
    """Create edge n1-n2 with weight 1, or bump an existing edge's weight."""
    seen_before = g.has_edge(n1, n2)
    if not seen_before:
        g.add_edge(n1, n2)
    attrs = g[n1][n2]
    attrs['weight'] = (attrs['weight'] + 1) if seen_before else 1

# Build a graph linking each tweet to the hashtags it contains,
# then export it in GEXF format for visualization (e.g. Gephi).
graph = nx.Graph()

# iterate through every tweet, storing each tweet in t
for t in db_session.query(Tweet).all():
    # add t to the graph
    graph_add_node(t.tweet, graph, 'tweet')
    # now iterate through all the words in t, storing each word in w
    for w in t.words:
        # looking for hashtags (there's a better way to do this)
        # NOTE(review): w.word[0] raises IndexError if a Word row ever
        # holds an empty string — presumably the tokenizer never emits one.
        if w.word[0] == '#':
            graph_add_node(w.word, graph, 'hashtag')
            graph_add_edge(t.tweet, w.word, graph)

# you probably want to change this string to something meaningful too
q = 'example'

# because it ends up in the file name for the GEXF output file
nx.write_gexf(graph, '{}_tweet_graph.gexf'.format(q))
print('{}_tweet_graph.gexf'.format(q))
Ejemplo n.º 10
0
import json
import nltk 
nltk.data.path.append('./tweetEasy/nltk_data/') #this may need to change depending on when
from nltk.corpus import stopwords
from tweetEasy.tweetEasy import ParseStatus
from tweetsql.database import db_session
from tweetsql.model import Hashtag, Tweet, Word, User
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

# English stopword list (used by later filtering in this script).
stop = stopwords.words('english')
# Every (Tweet, User) pair, joined through Tweet.user.
tweets_users = db_session.query(Tweet, User).join(Tweet.user).all()

for t, u in tweets_users:
	data = json.loads(t.data)
	search = ParseStatus(data)
	if(len(t.hashtags)==0):
		hashtags = search.hashtags
		for h in hashtags:
			try:
				h_obj = db_session.query(Hashtag).filter(Hashtag.hashtag == h).one()			
			except MultipleResultsFound:
				pass
			except NoResultFound:	
				h_obj = Hashtag(hashtag=h)
				db_session.add(h_obj)
			except OperationalError:
			    print 'error'
			    db_session.rollback()
			#add the relationship between h_obj and tweets
			t.hashtags.append(h_obj)
Ejemplo n.º 11
0
import json
import nltk
from nltk.tokenize import wordpunct_tokenize
nltk.data.path.append('./tweetEasy/nltk_data/') #this may need to change depending on when
from nltk.corpus import stopwords
from tweetEasy.tweetEasy import ParseStatus
from tweetsql.database import db_session
from tweetsql.model import Hashtag, Tweet, Word, User
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

# English stopwords (loaded for the commented-out filtering step below).
stop = stopwords.words('english')

# Restrict processing to tweets that have no Word rows attached yet.
tweets = db_session.query(Tweet).all()
tweet_words = [t.id for t in db_session.query(Tweet).join(Tweet.words).all()]
# Set membership is O(1); testing against the list was quadratic.
processed_ids = set(tweet_words)
tweets = [t for t in tweets if t.id not in processed_ids]

# NOTE(review): this query result is immediately shadowed inside the
# loop below; it looks like a leftover from an earlier revision.
words = db_session.query(Word).all()
all_words = []
tweet_word_dict = {}

# Tokenize each unprocessed tweet's text and report per-tweet token counts.
count = 0
for t in tweets:
    data = json.loads(t.data)
    search = ParseStatus(data)
    words = wordpunct_tokenize(search.tweetText())
    print(len(words))
    count += 1
print(count)

# 	for w in words:
# 		if w.lower() not in stop:
Ejemplo n.º 12
0
#this is all about words
import re, nltk
from string import punctuation, ascii_lowercase
from itertools import permutations
from patterns import *
from nltk.tokenize import wordpunct_tokenize
nltk.data.path.append('./tweetEasy/nltk_data/') #this may need to change depending on when
from nltk.corpus import stopwords
from collections import Counter
from tweetsql.model import Tweet, Hashtag
from tweetsql.database import db_session
import pickle

# Pull every tweet text and hashtag out of the database up front.
tweets = db_session.query(Tweet.tweet).all()
db_hashtags = db_session.query(Hashtag.hashtag).all()

print 'db query done'

# Vocabulary and frequency accumulators for the analysis below.
word_set = set()
word_count = Counter()
wordco = {} #word co-occurrence dictionary

# Hashtags are compared case-insensitively, hence the lower() here.
hashtag_set = set([ht[0].lower() for ht in db_hashtags])
hashtag_count = Counter()
hashtagco = {} #hashtag co-occurrence dictionary

cap_words_count = Counter()
acronyms_count = Counter()

#helpers
def makeDict(d, k, v):
Ejemplo n.º 13
0
import json
import nltk
nltk.data.path.append(
    './tweetEasy/nltk_data/')  #this may need to change depending on when
from nltk.corpus import stopwords
from tweetEasy.tweetEasy import ParseStatus
from tweetsql.database import db_session
from tweetsql.model import Hashtag, Tweet, Word, User
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

# English stopword list (used by later filtering in this script).
stop = stopwords.words('english')
# Every (Tweet, User) pair, joined through Tweet.user.
tweets_users = db_session.query(Tweet, User).join(Tweet.user).all()

for t, u in tweets_users:
    data = json.loads(t.data)
    search = ParseStatus(data)
    if (len(t.hashtags) == 0):
        hashtags = search.hashtags
        for h in hashtags:
            try:
                h_obj = db_session.query(Hashtag).filter(
                    Hashtag.hashtag == h).one()
            except MultipleResultsFound:
                pass
            except NoResultFound:
                h_obj = Hashtag(hashtag=h)
                db_session.add(h_obj)
            except OperationalError:
                print 'error'
                db_session.rollback()
Ejemplo n.º 14
0
        remaining = get_rate_limit(t=api)
        if remaining > maxRemaining:
            maxRemaining = remaining
            winner = k
    return winner


#set up twitter api
clean_key = get_clean_key()
twitter_auth = twitter.oauth.OAuth(clean_key.OAUTH_TOKEN,
                                   clean_key.OAUTH_TOKEN_SECRET,
                                   clean_key.CONSUMER_KEY,
                                   clean_key.CONSUMER_SECRET)
api = twitter.Twitter(auth=twitter_auth)

got_data = db_session.query(
    UserData.id).all()  #first get all users in the UseData table
got_data = [t[0] for t in got_data]

# all_users = db_session.query(User.id, User.uid).all()
all_users = db_session.query(Friend.id, Friend.friend_id).all()

# Set membership is O(1); scanning got_data as a list was quadratic.
got_data_set = set(got_data)
no_data = [(pk, uid) for pk, uid in all_users if pk not in got_data_set]
#group no_data by 100 for our twitter call
# BUG FIX: the original sliced no_data[i:i+99] with a step of 100,
# silently dropping every 100th record; a 100-wide chunk is i:i+100.
no_data_100s = [no_data[i:i + 100] for i in range(0, len(no_data), 100)]

if get_rate_limit(t=api) == 0:
    print('rate limit hit')
    print('try again at %d' % get_rate_limit(t=api, data='reset'))

for l in no_data_100s:
    if get_rate_limit(t=api) == 0:
Ejemplo n.º 15
0
# OAuth handshake using the `lu` key set from the keys module.
twitter_auth = twitter.oauth.OAuth(keys.lu.OAUTH_TOKEN, keys.lu.OAUTH_TOKEN_SECRET,
                           keys.lu.CONSUMER_KEY, keys.lu.CONSUMER_SECRET)

# Long-lived streaming connection filtered by the TRACK terms.
twitter_stream = twitter.TwitterStream(auth=twitter_auth)

# Effectively infinite iterator of live matching statuses.
statuses = twitter_stream.statuses.filter(track=TRACK)


for t in statuses:
    #add user
    try:
        print t['text']
    except:
        print 'no text found'
    try:
        u = db_session.query(User).filter_by(uid=str(t['user']['id'])).one()
    except NoResultFound:
        u = User(screen_name=t['user']['screen_name'], uid=t['user']['id'])
        db_session.add(u)
        db_session.commit()
        print 'user committed'


    #add tweet
    tw = Tweet(tweet=t['text'], tid=t['id'], user_id=u.id, created_at=t['created_at'], data=json.dumps(t))
    db_session.add(tw)
    db_session.commit()
    print 'tweet commited'
    
    #add hashtag
    search = ParseStatus(t)
Ejemplo n.º 16
0
# Twitter API credentials (blank/sanitized before committing;
# CONSUMER_KEY is presumably defined above this fragment — verify).
CONSUMER_SECRET = ''

OAUTH_TOKEN = '-'
OAUTH_TOKEN_SECRET = ''

twitter_auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

twitter_stream = twitter.TwitterStream(auth=twitter_auth)

# Stream only tweets mentioning these phrases.
statuses = twitter_stream.statuses.filter(track='Temple Mount, Palestine')

for t in statuses:
    # print(t['text'])
    try:
        u = db_session.query(User).filter_by(uid=str(t['user']['id'])).one()
    except MultipleResultsFound:
        u = User(screen_name=t['user']['screen_name'], uid=t['user']['id'])
        pass
    except NoResultFound:
        u = User(screen_name=t['user']['screen_name'], uid=t['user']['id'])
        db_session.add(u)
        db_session.commit()

    if t['coordinates']:
        tw = Tweet(tweet=t['text'], tid=t['id'], user_id=u.id, created_at=t['created_at'], coordinates=t['coordinates']['coordinates'])
    else:
        tw = Tweet(tweet=t['text'], tid=t['id'], user_id=u.id, created_at=t['created_at'])

    try:
        words = tw.tweet.split()
Ejemplo n.º 17
0
import json
import nltk
from nltk.tokenize import wordpunct_tokenize
nltk.data.path.append(
    './tweetEasy/nltk_data/')  #this may need to change depending on when
from nltk.corpus import stopwords
from tweetEasy.tweetEasy import ParseStatus
from tweetsql.database import db_session
from tweetsql.model import Hashtag, Tweet, Word, User
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

# English stopwords (loaded for the commented-out filtering step below).
stop = stopwords.words('english')

# Restrict processing to tweets that have no Word rows attached yet.
tweets = db_session.query(Tweet).all()
tweet_words = [t.id for t in db_session.query(Tweet).join(Tweet.words).all()]
# Set membership is O(1); testing against the list was quadratic.
processed_ids = set(tweet_words)
tweets = [t for t in tweets if t.id not in processed_ids]

# NOTE(review): this query result is immediately shadowed inside the
# loop below; it looks like a leftover from an earlier revision.
words = db_session.query(Word).all()
all_words = []
tweet_word_dict = {}

# Tokenize each unprocessed tweet's text and report per-tweet token counts.
count = 0
for t in tweets:
    data = json.loads(t.data)
    search = ParseStatus(data)
    words = wordpunct_tokenize(search.tweetText())
    print(len(words))
    count += 1
print(count)

# 	for w in words:
Ejemplo n.º 18
0
#!/usr/bin/env python

import re
from sqlalchemy.sql import text
from tweetsql.database import Base, db_session, engine
from tweetsql.model import Tweet, User, Word

# Collect the first ten tweets' text and print their timestamps.
all_media = []
hashtags = []

# Raw strings: '#(\w+)' was a non-raw literal, an invalid escape
# sequence under Python 3; also dropped a stray trailing semicolon.
REGEX = r'#(\w+)'
re_hash = re.compile(r'#[0-9a-zA-Z+_]*', re.IGNORECASE)

for t in db_session.query(Tweet)[0:10]:  #.filter(text('tweet ~ :reg')).params(reg=REGEX)[0:10]:
    all_media.append(t.tweet)
    print(t.created_at)

# 	for w in t.tweet.split():
# 		hashtag = re_hash.match(w)

# 		if(hashtag):
# 			hashtags.append(hashtag.string)

print(len(all_media))
Ejemplo n.º 19
0
	return winner

#set up twitter api
clean_key = get_clean_key()
twitter_auth = twitter.oauth.OAuth(clean_key.OAUTH_TOKEN,
                                   clean_key.OAUTH_TOKEN_SECRET,
                                   clean_key.CONSUMER_KEY,
                                   clean_key.CONSUMER_SECRET)
api = twitter.Twitter(auth=twitter_auth)

# Bail out early if the rate-limit window has no calls left.
if get_rate_limit(t=api, data='remaining') == 0:
    print('rate limit hit')
    print('try again at %d' % get_rate_limit(t=api, data='reset'))
    sys.exit()

#let's get 15 screen names that are not in the Friend table
got_friends = db_session.query(distinct(Friend.user_id)).all()  #first get all users in the friend table
got_friends = [t[0] for t in got_friends]
dead_users = db_session.query(NoUser.user_id).all()
dead_users = [t[0] for t in dead_users]
all_users = db_session.query(User.id, User.uid).all()
# Set lookups are O(1); the original tested membership against lists,
# which made these comprehensions quadratic in the user count.
dead_set = set(dead_users)
friend_set = set(got_friends)
all_users = [(pk, uid) for pk, uid in all_users if pk not in dead_set]  #filter out dead users
no_friends = [(pk, uid) for pk, uid in all_users if pk not in friend_set]  #get rid of people with friends

# rate_limit = 15
# requests = 0
print(no_friends[:15])
for pk,uid in no_friends:
	cursor = -1
	friends = []
	if get_rate_limit(t=api, data='remaining')==0:
		print 'rate limit hit'