def isNewsProfessional(inputPath, outputPath):
    idN = 1070755071271555072
    appN = 6
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tool = NewsTools(api)
    userIDs = set()
    result = []
    print("Processing..." + inputPath)
    with open(inputPath) as i_file:
        tweets = i_file.read().strip().split("\n")
    for t in tweets:
        t = json.loads(t)
        userID = t["user"]["id"]
        screen_name = t["user"]["screen_name"]
        if userID not in userIDs:
            userIDs.add(userID)
            isNewsProf = tool.isNewsProfessional(screen_name)
            result.append(screen_name + "\t" + str(isNewsProf))
            print(screen_name + " " + str(isNewsProf))
    # write one "screen_name<TAB>flag" entry per line
    with open(outputPath, "w") as o_file:
        o_file.write("\n".join(result).strip())
def filter(tool, inputFolder, inputFile, outputFolder, handle, maxTweets,
           appN):
    print("## Filtering by relevance to @" + handle)
    with open(inputFolder + "/" + inputFile) as i_file:
        tweets = i_file.read().strip().split("\n")
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    relevantTweets = []
    irrelevantTweets = []
    for tweet in tweets:
        print(".", end="")
        (result, accum) = tool.isRelatedToAgency(json.loads(tweet), handle)
        if result:
            relevantTweets.append(tweet)
        else:
            irrelevantTweets.append(tweet)
        # tweets accumulated while walking the reply chain are treated as
        # relevant regardless of the verdict on the root tweet
        relevantTweets.extend(accum)
    writeFile(outputFolder, "relevant.json", relevantTweets)
    writeFile(outputFolder, "irrelevant.json", irrelevantTweets)
    return (relevantTweets, irrelevantTweets)
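# The writeFile helper called above (and throughout this file) is defined
# elsewhere in the repo. Below is a minimal sketch consistent with its call
# sites, which pass a folder, a filename, and a list of pre-serialized
# lines. This is an assumption about its behavior, not the repo's actual
# implementation.
def writeFile(folder, filename, lines):
    if not os.path.exists(folder):
        os.makedirs(folder)
    with open(folder + "/" + filename, "w") as o_file:
        o_file.write("\n".join(lines))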
def run(qFile, outputFolder, maxTweets, appN):
    # Create query list
    with open(qFile) as i_file:
        qlist = i_file.read()
    print(qlist)
    queries = qlist.split(",")
    qFilename = qFile[qFile.rindex("/") + 1:qFile.index(".")]
    print("## Running scrape on file {a}".format(a=qFilename))
    i = appN
    auth = cred.getAuth(i, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    now = datetime.datetime.now()
    path = outputFolder + "/" + qFilename + "_" + now.strftime(
        "%Y-%m-%d-%H-%M")
    # Start scraping
    print('## Running scrape "{keywords}" on app #{a}'.format(
        keywords=str(queries), a=i))
    scrapeResults = rest.rest_scrape(queries, api, path + "/by_keywords",
                                     int(maxTweets))
    allTweets = basics.dedup(path + "/by_keywords", path + "/cleaned",
                             "deduped.json")
    basics.separateByDate(allTweets, path + "/by_dates")
def filter(inputFolder, inputFile, handle, maxTweets, appN):
    with open(inputFolder + "/" + inputFile) as i_file:
        tweets = i_file.read().strip().split("\n")
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    relevantTweets = []
    irrelevantTweets = []
    for tweet in tweets:
        if news.isRelatedToAgency(api, json.loads(tweet), handle):
            relevantTweets.append(tweet)
        else:
            irrelevantTweets.append(tweet)
    outpath = inputFolder + "/result"
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    writeFile(outpath, "relevant.json", relevantTweets)
    writeFile(outpath, "irrelevant.json", irrelevantTweets)
    clean.dedup(inputFolder, outpath)
def scrape_from_tweets(inputFolder, inputFile, maxTweets, appN):
    # Create query list
    with open(inputFolder + "/" + inputFile) as i_file:
        tweets = i_file.read().strip().split("\n")
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    outpath = inputFolder + "/result"
    for tweet in tweets:
        jt = json.loads(tweet)
        tweet_id = jt["id"]
        account_handle = jt["user"]["screen_name"]
        op = outpath
        # use a short mid-tweet segment as the search phrase
        words = jt["full_text"].split(" ")
        tweet_segment = " ".join(words[2:min(10, len(words) - 2)])
        if tweet_segment != "":
            retweets = news.getRetweets(api, op, maxTweets, tweet_segment,
                                        tweet_id)
        try:
            # entities["urls"] is a list of url dicts; take the first one
            tweet_url = jt["entities"]["urls"][0]["url"]
            quotes = news.getQuotes(api, op, maxTweets, tweet_url, tweet_id)
        except (KeyError, IndexError):
            print("\t\tCan't find tweet url for id=" + str(tweet_id))
        replies_and_others = news.getRepliesAndOthers(api, op, maxTweets,
                                                      account_handle,
                                                      tweet_id)
    clean.clean(outpath, outpath)
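# The entities lookup above relies on the Twitter API convention that
# entities["urls"] is a list of url dicts. A small helper like the
# hypothetical firstUrl below makes that lookup safe; it is an illustrative
# sketch, not a function from this repo.
def firstUrl(jt):
    # return the first attached url, or None when the tweet has no urls
    urls = jt.get("entities", {}).get("urls", [])
    return urls[0]["url"] if urls else None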
def getSourceID(user_id):
    i = 0
    auth = credentials.getAuth(i, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    # look up the requested user rather than a hard-coded screen name
    user = api.get_user(id=user_id)
    return user.id
def rpr():
    idN = 1069475079195713536
    appN = 4
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tool = NewsTools(api)
    now = datetime.datetime.now()
    path = "data/@nytimes_opinion_" + str(idN)
    # alternative: "data/" + str(idN) + "_" + now.strftime("%Y-%m-%d-%H-%M")
    # get tweet object by ID
    tweet = api.get_status(id=idN, tweet_mode='extended')
    print("Created at: " + tweet._json["created_at"])
    print("Retweet count = " + str(tweet._json["retweet_count"]))
    print("Favorite count = " + str(tweet._json["favorite_count"]))
    scraped = []
    terms = []
    terms.append("A few genetically modified people already walk among us.")
    terms.append("nytimes genetically")
    terms.append("nytimes modified")
    terms.append("nytimes china")
    # full url of tweet
    terms.append("https://twitter.com/nytimes/status/1069475079195713536")
    terms.append("https://t.co/kxRdkjfFDc")
    for t in terms:
        scraped.extend(rest.rest_scrape_single(t, 1000000, api))
    scraped = [x._json for x in scraped]
    retweets = []
    quotes = []
    replies = []
    retweets_seen, quotes_seen, replies_seen = set(), set(), set()
    for t in scraped:
        if "retweeted_status" in t and t["retweeted_status"]["id"] == idN:
            if t["id"] not in retweets_seen:
                retweets_seen.add(t["id"])
                retweets.append(t)
        if "quoted_status" in t and t["quoted_status"]["id"] == idN:
            if t["id"] not in quotes_seen:
                quotes_seen.add(t["id"])
                quotes.append(t)
        if t["in_reply_to_status_id"] is not None:
            # include direct replies to the root tweet as well as replies
            # to any retweet/quote/reply already collected
            if (t["in_reply_to_status_id"] == idN
                    or t["in_reply_to_status_id"] in replies_seen
                    or t["in_reply_to_status_id"] in quotes_seen
                    or t["in_reply_to_status_id"] in retweets_seen):
                if t["id"] not in replies_seen:
                    replies_seen.add(t["id"])
                    replies.append(t)
    # writeFile expects pre-serialized lines
    writeFile(path, "retweets.json", [json.dumps(t) for t in retweets])
    writeFile(path, "quotes.json", [json.dumps(t) for t in quotes])
    writeFile(path, "replies.json", [json.dumps(t) for t in replies])
    readAndCategorize(path + "/@nytimes", idN, retweets_seen, quotes_seen,
                      replies_seen, path + "/retweets.json",
                      path + "/quotes.json", path + "/replies.json")
def printThread(path, startN, endN, appN):
    with open(path) as i_file:
        tweets_l = i_file.read().split("\n")
    folderName = path[:path.rindex(".")] + "_thread"
    if not os.path.exists(folderName):
        os.makedirs(folderName)
    count = 0
    size = 10
    fileCount = 0
    filename = "thread_" + str(startN) + "_" + str(startN + size) + ".txt"
    f = open(folderName + "/" + filename, "w")
    f.write("Extracted threads from {a} and writing into {b}\n\n---\n---\n---\n\n"
            .format(a=folderName, b=filename))
    i = appN
    auth = cred.getAuth(i, "user")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tool = NewsTools(api)
    agents = {}
    for t_str in tweets_l[startN:endN]:
        t = json.loads(t_str)
        f.write("\n-\n")
        a = printTweet(tool, t, f, agents)
        agents.update(a)
        count += 1
        if count >= size:
            count = 0
            fileCount += 1
            # close the finished chunk before starting the next one
            f.close()
            filename = "thread_" + str(startN + fileCount * size) + "_" + str(
                startN + fileCount * size + size) + ".txt"
            f = open(folderName + "/" + filename, "w")
            f.write("Extracted threads from {a} and writing into {b}\n\n---\n---\n---\n\n"
                    .format(a=folderName, b=filename))
    f.close()
    with open(folderName + "/thread_agents.txt", "w") as f_agents:
        f_agents.write(json.dumps(agents))
def run(handle, qFile, maxTweets, appN):
    # Create query list
    with open(qFile) as i_file:
        qlist = i_file.read()
    qlist = qlist.split(",")
    queries = []
    for q in qlist:
        queries.append(handle + " " + q)
    qFilename = qFile[qFile.rindex("/") + 1:qFile.index(".")]
    print("## Running scrape on file {a}".format(a=qFilename))
    i = appN
    auth = cred.getAuth(i, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    now = datetime.datetime.now()
    path = "data/" + handle + "_" + qFilename + "/" + now.strftime(
        "%Y-%m-%d-%H-%M")
    # Start scraping
    print('## Running scrape "{keywords}" on app #{a}'.format(
        keywords=str(queries), a=i))
    scrapeResults = rest.rest_scrape(queries, api, path + "/by_keywords",
                                     int(maxTweets))
    # Clean up: re-authenticate with a fresh API handle
    i = appN
    auth = cred.getAuth(i, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
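# A hedged usage sketch for run(); the handle, query-file path, tweet cap,
# and app number below are illustrative placeholders, not values taken
# from this repo.
if __name__ == "__main__":
    # scrape up to 10000 tweets pairing @nytimes with each comma-separated
    # query in the file, using app credentials #2
    run("@nytimes", "queries/example.txt", 10000, 2)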
def filter(inputFolder, inputFile, maxTweets, appN):
    with open(inputFolder + "/" + inputFile) as i_file:
        tweets = i_file.read().strip().split("\n")
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    outpath = inputFolder + "/result"
    for tweet in tweets:
        pass  # loop body missing in the source
def main(argv):
    print("\nRunning...")
    idN = int(argv[0])
    i = int(argv[1])
    auth = credentials.getAuth(i, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tweet = gen.getTweet(idN, api)
    print(tweet)
def getFollowers(user_id):
    i = 0
    auth = credentials.getAuth(i, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    followers = []
    # followers_ids returns pages of plain ids, so tweet_mode does not apply
    for page in tweepy.Cursor(api.followers_ids, id=user_id).pages():
        followers.extend(page)
    return followers
def bigFile(term):
    appN = 4
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    now = datetime.datetime.now()
    path = "data/" + term + "_" + now.strftime("%Y-%m-%d-%H-%M")
    rest.rest_scrape([term], api, path, 100000000, file_size=100000,
                     fileName=None, max_num_errors=5)
def one():
    idN = 1070755071271555072
    appN = 6
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tool = NewsTools(api)
    # get tweet object by ID
    tweet = api.get_status(id=idN, tweet_mode='extended')
    print(str(tweet._json))
    # serialize as JSON rather than the Python repr of the dict
    with open("this.json", "w") as o_file:
        o_file.write(json.dumps(tweet._json))
def run(queryFile, appNumber):
    # Create query list
    with open(queryFile) as i_file:
        queries = i_file.read()
    queries_list = queries.split(",")
    # choose which Twitter app to use for this query
    i = appNumber
    print('Streaming for ' + queryFile)
    auth = cred.getAuth(i, "user")
    now = datetime.now()
    out_Path = "data/" + now.strftime("%Y-%m-%d-%H-%M") + "-stream"
    stream.stream_tweets(auth, queries_list, out_Path)
def getTimeline(user_id):
    i = 0
    auth = credentials.getAuth(i, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tweets = []
    for page in tweepy.Cursor(api.user_timeline, id=user_id,
                              tweet_mode='extended', count=20).pages():
        tweets.extend(page)
    # return the raw JSON dict for each status
    tweet_str_list = []
    for t in tweets:
        tweet_str_list.append(t._json)
    return tweet_str_list
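# Example usage (an assumption, not part of the repo): fetch a timeline
# with getTimeline and persist it one JSON object per line, matching the
# newline-delimited format the readers in this file expect.
def dumpTimeline(user_id, outPath):
    tweets = getTimeline(user_id)
    with open(outPath, "w") as o_file:
        for t in tweets:
            o_file.write(json.dumps(t) + "\n")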
def scrape_from_id(inputFolder, inputFile, maxTweets, appN):
    # Create query list: one tweet id per line
    with open(inputFolder + "/" + inputFile) as i_file:
        ids = i_file.read().strip().split("\n")
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    outpath = inputFolder + "/result"
    for curID in ids:
        try:
            tweet = api.get_status(id=int(curID), tweet_mode='extended')
        except tweepy.TweepError:
            print("get_status error with id=" + curID)
            continue  # skip ids we could not fetch
        jt = tweet._json
        tweet_id = jt["id"]
        account_handle = jt["user"]["screen_name"]
        tweet_segment = " ".join(jt["full_text"].split(" ")[2:])
        op = outpath
        retweets = news.getRetweets(api, op, maxTweets, tweet_segment,
                                    tweet_id)
        try:
            # entities["urls"] is a list of url dicts; take the first one
            tweet_url = jt["entities"]["urls"][0]["url"]
            quotes = news.getQuotes(api, op, maxTweets, tweet_url, tweet_id)
        except (KeyError, IndexError):
            print("Can't find tweet url for id=" + str(tweet_id))
        replies_and_others = news.getRepliesAndOthers(api, op, maxTweets,
                                                      account_handle,
                                                      tweet_id)
    clean.clean(outpath, outpath)
def getSource(inputFolder, outputFolder, appNumber):
    auth = cred.getAuth(appNumber, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    if not os.path.exists(outputFolder + "/NEW/"):
        os.makedirs(outputFolder + "/NEW/")
    f = open(outputFolder + "/NEW/source.json", "w")
    for (dirpath, dirnames, filenames) in os.walk(inputFolder):
        for filename in filenames:
            if filename.endswith('.json'):
                print("Currently on " + filename)
                with open(dirpath + "/" + filename) as i_file:
                    iDs = set()
                    for line in i_file:
                        # parse each tweet; the extraction step that should
                        # populate iDs and write to f is not implemented here
                        t = json.loads(line)
    f.close()
def network(idN, terms, path, appN):
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    now = datetime.datetime.now()
    path = path + "/" + str(idN) + "/" + now.strftime("%Y-%m-%d-%H-%M")
    # get tweet object by ID
    tweet = api.get_status(id=idN, tweet_mode='extended')
    print("Created at: " + tweet._json["created_at"])
    print("Retweet count = " + str(tweet._json["retweet_count"]))
    print("Favorite count = " + str(tweet._json["favorite_count"]))
    scraped = []
    for t in terms:
        scraped.extend(rest.rest_scrape_single(t, 1000000, api))
    scraped = [x._json for x in scraped]
    basics.writeFile(path, "all.json", [json.dumps(x) for x in scraped])
    retweets = []
    quotes = []
    replies = []
    retweets_seen, quotes_seen, replies_seen = set(), set(), set()
    for t in scraped:
        if "retweeted_status" in t and t["retweeted_status"]["id"] == idN:
            if t["id"] not in retweets_seen:
                retweets_seen.add(t["id"])
                retweets.append(t)
        if "quoted_status" in t and t["quoted_status"]["id"] == idN:
            if t["id"] not in quotes_seen:
                quotes_seen.add(t["id"])
                quotes.append(t)
        if t["in_reply_to_status_id"] is not None:
            # include direct replies to the root tweet as well as replies
            # to any retweet/quote/reply already collected
            if (t["in_reply_to_status_id"] == idN
                    or t["in_reply_to_status_id"] in replies_seen
                    or t["in_reply_to_status_id"] in quotes_seen
                    or t["in_reply_to_status_id"] in retweets_seen):
                if t["id"] not in replies_seen:
                    replies_seen.add(t["id"])
                    replies.append(t)
    ids = [x["id"] for x in retweets]
    ids.extend([x["id"] for x in quotes])
    ids.extend([x["id"] for x in replies])
    ids = [str(x) for x in ids]
    ids.append(str(idN))
    basics.writeFile(path, "ids.csv", [",".join(ids)])
    basics.writeFile(path, "retweets.json", [json.dumps(x) for x in retweets])
    basics.writeFile(path, "quotes.json", [json.dumps(x) for x in quotes])
    basics.writeFile(path, "replies.json", [json.dumps(x) for x in replies])
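# Example call for network(), reusing the tweet id and search terms from
# the hard-coded rpr() runs below; the output folder and app number are
# placeholders.
# network(1067244642155094019,
#         ["A Chinese scientist claimed he created",
#          "https://twitter.com/nytimes/status/1067244642155094019"],
#         "data", 4)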
def rpr():
    idN = 1067738896979644416
    appN = 3
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tool = NewsTools(api)
    now = datetime.datetime.now()
    path = "data/@nytimes_" + str(idN)
    # alternative: "data/" + str(idN) + "_" + now.strftime("%Y-%m-%d-%H-%M")
    # get tweet object by ID
    tweet = api.get_status(id=idN, tweet_mode='extended')
    print("Created at: " + tweet._json["created_at"])
    print("Retweet count = " + str(tweet._json["retweet_count"]))
    print("Favorite count = " + str(tweet._json["favorite_count"]))
    scraped = []
    terms = []
    terms.append("I feel proud, actually")
    terms.append("nytimes genetically")
    terms.append("nytimes modified")
    # full url of tweet
    terms.append("https://twitter.com/nytimes/status/1067738896979644416")
    terms.append("https://t.co/9v4nxxN96s")
    for t in terms:
        scraped.extend(rest.rest_scrape_single(t, 1000000, api))
    scraped = [x._json for x in scraped]
    retweets = []
    quotes = []
    replies = []
    retweets_seen, quotes_seen, replies_seen = set(), set(), set()
    for t in scraped:
        if "retweeted_status" in t and t["retweeted_status"]["id"] == idN:
            if t["id"] not in retweets_seen:
                retweets_seen.add(t["id"])
                retweets.append(t)
        if "quoted_status" in t and t["quoted_status"]["id"] == idN:
            if t["id"] not in quotes_seen:
                quotes_seen.add(t["id"])
                quotes.append(t)
        if t["in_reply_to_status_id"] is not None:
            if (t["in_reply_to_status_id"] == idN
                    or t["in_reply_to_status_id"] in replies_seen
                    or t["in_reply_to_status_id"] in quotes_seen
                    or t["in_reply_to_status_id"] in retweets_seen):
                if t["id"] not in replies_seen:
                    replies_seen.add(t["id"])
                    replies.append(t)
    writeFile(path, "retweets.json", [json.dumps(t) for t in retweets])
    writeFile(path, "quotes.json", [json.dumps(t) for t in quotes])
    writeFile(path, "replies.json", [json.dumps(t) for t in replies])
def rpr():
    # "A Chinese scientist claimed he created the world's first genetically
    # edited babies, a step ethicists have feared. But he offered no proof."
    idN = 1067244642155094019
    appN = 4
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tool = NewsTools(api)
    now = datetime.datetime.now()
    path = "data/@nytimes_factual_" + str(idN)
    # alternative: "data/" + str(idN) + "_" + now.strftime("%Y-%m-%d-%H-%M")
    # get tweet object by ID
    tweet = api.get_status(id=idN, tweet_mode='extended')
    print("Created at: " + tweet._json["created_at"])
    print("Retweet count = " + str(tweet._json["retweet_count"]))
    print("Favorite count = " + str(tweet._json["favorite_count"]))
    scraped = []
    terms = []
    terms.append("A Chinese scientist claimed he created")
    terms.append("nytimes genetically")
    terms.append("nytimes modified")
    # full url of tweet
    terms.append("https://twitter.com/nytimes/status/1067244642155094019")
    terms.append("https://t.co/z0fChZxgyM")
    for t in terms:
        scraped.extend(rest.rest_scrape_single(t, 1000000, api))
    scraped = [x._json for x in scraped]
    retweets = []
    quotes = []
    replies = []
    retweets_seen, quotes_seen, replies_seen = set(), set(), set()
    for t in scraped:
        if "retweeted_status" in t and t["retweeted_status"]["id"] == idN:
            if t["id"] not in retweets_seen:
                retweets_seen.add(t["id"])
                retweets.append(t)
        if "quoted_status" in t and t["quoted_status"]["id"] == idN:
            if t["id"] not in quotes_seen:
                quotes_seen.add(t["id"])
                quotes.append(t)
        if t["in_reply_to_status_id"] is not None:
            if (t["in_reply_to_status_id"] == idN
                    or t["in_reply_to_status_id"] in replies_seen
                    or t["in_reply_to_status_id"] in quotes_seen
                    or t["in_reply_to_status_id"] in retweets_seen):
                if t["id"] not in replies_seen:
                    replies_seen.add(t["id"])
                    replies.append(t)
    writeFile(path, "self.json", [json.dumps(tweet._json)])
    writeFile(path, "retweets.json", [json.dumps(t) for t in retweets])
    writeFile(path, "quotes.json", [json.dumps(t) for t in quotes])
    writeFile(path, "replies.json", [json.dumps(t) for t in replies])
    readAndCategorize(path + "/@nytimes", idN, retweets_seen, quotes_seen,
                      replies_seen, path + "/retweets.json",
                      path + "/quotes.json", path + "/replies.json")
def rpr():
    idN = 1069463438391459840
    appN = 3
    auth = cred.getAuth(appN, "app")
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tool = NewsTools(api)
    now = datetime.datetime.now()
    path = "data/@nytopinion_" + str(idN)
    # alternative: "data/" + str(idN) + "_" + now.strftime("%Y-%m-%d-%H-%M")
    # get tweet object by ID
    tweet = api.get_status(id=idN, tweet_mode='extended')
    print("Created at: " + tweet._json["created_at"])
    print("Retweet count = " + str(tweet._json["retweet_count"]))
    print("Favorite count = " + str(tweet._json["favorite_count"]))
    scraped = []
    terms = []
    terms.append("He was all noblesse oblige")
    terms.append("nytopinion Bush")
    terms.append("nytimes Bush")
    # full url of tweet
    terms.append("https://twitter.com/nytopinion/status/1069463438391459840")
    terms.append("https://t.co/pDfFORy45z")
    for t in terms:
        scraped.extend(rest.rest_scrape_single(t, 1000000, api))
    scraped = [x._json for x in scraped]
    retweets = []
    quotes = []
    replies = []
    retweets_seen, quotes_seen, replies_seen = set(), set(), set()
    for t in scraped:
        if "retweeted_status" in t and t["retweeted_status"]["id"] == idN:
            if t["id"] not in retweets_seen:
                retweets_seen.add(t["id"])
                retweets.append(t)
        if "quoted_status" in t and t["quoted_status"]["id"] == idN:
            if t["id"] not in quotes_seen:
                quotes_seen.add(t["id"])
                quotes.append(t)
        if t["in_reply_to_status_id"] is not None:
            if (t["in_reply_to_status_id"] == idN
                    or t["in_reply_to_status_id"] in replies_seen
                    or t["in_reply_to_status_id"] in quotes_seen
                    or t["in_reply_to_status_id"] in retweets_seen):
                if t["id"] not in replies_seen:
                    replies_seen.add(t["id"])
                    replies.append(t)
    writeFile(path, "retweets.json", [json.dumps(t) for t in retweets])
    writeFile(path, "quotes.json", [json.dumps(t) for t in quotes])
    writeFile(path, "replies.json", [json.dumps(t) for t in replies])
    readAndCategorize(path + "/@nytopinion", idN, retweets_seen, quotes_seen,
                      replies_seen, path + "/retweets.json",
                      path + "/quotes.json", path + "/replies.json")