def insert_record(connection, tweet, word_list):
    """Insert one tweet into the Original_tweets table.

    Parameters
    ----------
    connection : DB-API connection (e.g. PyMySQL) supporting ``cursor()``
        as a context manager.
    tweet : dict decoded from the Twitter streaming API; must contain
        'id', 'text', 'created_at', 'user'->'screen_name' and
        'extended_entities'->'media'[0]->'media_url'.
    word_list : word list passed to sh.parse_sentence for text cleaning.

    Best-effort: any failure (missing keys, DB errors) is logged and
    swallowed so one bad tweet does not stop the caller's stream loop.
    NOTE(review): no commit() here — presumably the connection is in
    autocommit mode or the caller commits; confirm.
    """
    try:
        with connection.cursor() as cursor:
            # Create new record
            tweet_id = tweet['id']  # renamed: `id` shadowed the builtin
            tweet_txt = tweet['text']
            cleaned_text = sh.parse_sentence(tweet_txt, word_list)
            tweet_url = tweet['extended_entities']['media'][0]['media_url']
            timestamp = convert_twitter_date_to_datetime(tweet['created_at'])
            username = tweet['user']['screen_name']
            sql = "INSERT INTO Original_tweets (tweet_id, username, text, " \
                  "processed_text, image_url, created_ts) " \
                  "VALUES (%s, %s, %s, %s, %s, %s)"
            # Parameterized query: values are escaped by the DB driver.
            cursor.execute(sql, (tweet_id, username, tweet_txt, cleaned_text,
                                 tweet_url, timestamp))
    except Exception as err:
        # Was a bare `except: pass`, which hid every failure (and would
        # even swallow KeyboardInterrupt). Keep the best-effort contract
        # but surface what went wrong.
        print(err)
def fetchsamples(needed_sent_val=None, max_iters=1000):
    """Stream sample tweets, keep image tweets whose sentiment is consistent
    across three lexicons, and record them (or their near-duplicates) in the DB.

    Parameters
    ----------
    needed_sent_val : optional sentiment value; when given, only tweets whose
        (agreed) sentiment equals it are kept.
    max_iters : stop after roughly this many tweets have been saved
        (the counter only advances on a successful new-record save).
    """
    word_list = sh.english_word_list()
    afinn_dict = cs.load_afinn_dictionary('text_sentiment/AFINN-111.txt')
    huliu_dict = \
        cs.load_huliu_dict('text_sentiment/hu_liu/opinion-lexicon-English/')

    url = "https://stream.twitter.com/1/statuses/sample.json"
    parameters = []
    response = ts.twitterreq(url, "GET", parameters)

    num_iters = 0
    for line in response:
        if num_iters > max_iters:
            break
        if isinstance(line, bytes):
            line = line.decode('utf-8')

        # decode if not error message; else wait 1 sec to avoid rate limits
        try:
            tweet = json.loads(line.strip())
        except ValueError:
            # Narrowed from a bare `except:` (which also caught
            # KeyboardInterrupt); json.JSONDecodeError subclasses ValueError.
            time.sleep(1)
            print('waiting....')
            continue

        # stop processing if tweet doesn't meet basic criteria
        if not prt.decide_to_include_tweet(tweet):
            continue
        if not prt.image_is_original(tweet):
            continue

        # Calculate tweet sentiment with three lexicons; keep only tweets
        # on which all three agree.
        tweet_txt = tweet['text']
        cleaned_text = sh.parse_sentence(tweet_txt, word_list)
        vader_sent = cs.calculate_vader(cleaned_text)
        afinn_sent = cs.calculate_simple_sentiment(cleaned_text, afinn_dict)
        huliu_sent = cs.calculate_simple_sentiment(cleaned_text, huliu_dict)
        if not (vader_sent == afinn_sent == huliu_sent):
            continue
        # Fixed: the old `if needed_sent_val and ...` test silently disabled
        # the filter whenever a falsy sentiment value (e.g. 0) was requested.
        if needed_sent_val is not None and vader_sent != needed_sent_val:
            continue

        # retrieve and hash image
        image_url = tweet['extended_entities']['media'][0]['media_url']
        img = fetch_image(image_url)
        image_hash = dedupe.calculate_image_hash(img)

        # Ensure not an exact duplicate: duplicates are logged to their own
        # table and the image is not saved again.
        match = dedupe.find_matching_hash(image_hash, tweet['id'])
        if match:
            try:
                add_dupe_to_db(tweet, match, vader_sent, image_hash,
                               cleaned_text)
            except Exception as err:
                print(err)
            continue

        # Save image and write info to db
        try:
            add_new_record_to_db(tweet, vader_sent, image_hash, cleaned_text)
            img.save(IMAGE_SAVE_PATH + tweet['id_str'] + '.jpg')
        except Exception as err:
            print(err)
            continue
        num_iters += 1
    return