def fetchsamples(needed_sent_val=None, max_iters=1000):
    """Stream tweets from the Twitter sample endpoint and archive image tweets
    whose sentiment is consistent across three lexicons.

    A tweet is kept only when it passes the basic inclusion and
    original-image filters and the VADER, AFINN, and Hu-Liu sentiment
    scores all agree.  Images whose perceptual hash matches an existing
    record are logged via ``add_dupe_to_db``; new images are saved to
    disk and written to the database.

    Parameters
    ----------
    needed_sent_val : optional
        When given, only tweets whose (consistent) sentiment equals this
        value are kept.  Compared with ``is not None`` so a falsy
        sentiment value such as 0 is still honored.
    max_iters : int
        Maximum number of successfully processed tweets before stopping.
    """
    word_list = sh.english_word_list()
    afinn_dict = cs.load_afinn_dictionary('text_sentiment/AFINN-111.txt')
    huliu_dict = \
        cs.load_huliu_dict('text_sentiment/hu_liu/opinion-lexicon-English/')

    url = "https://stream.twitter.com/1/statuses/sample.json"
    parameters = []
    response = ts.twitterreq(url, "GET", parameters)

    num_iters = 0
    for line in response:
        if num_iters > max_iters:
            break
        if isinstance(line, bytes):
            line = line.decode('utf-8')

        # Decode if not an error message; else wait 1 sec to avoid rate
        # limits.  BUGFIX: the original bare ``except:`` also swallowed
        # KeyboardInterrupt/SystemExit and real bugs; json.loads raises
        # JSONDecodeError, a subclass of ValueError, so catch only that.
        try:
            tweet = json.loads(line.strip())
        except ValueError:
            time.sleep(1)
            print('waiting....')
            continue

        # Stop processing if tweet doesn't meet basic criteria.
        if not prt.decide_to_include_tweet(tweet):
            continue
        if not prt.image_is_original(tweet):
            continue

        # Calculate tweet sentiment with three independent lexicons.
        tweet_txt = tweet['text']
        cleaned_text = sh.parse_sentence(tweet_txt, word_list)
        vader_sent = cs.calculate_vader(cleaned_text)
        afinn_sent = cs.calculate_simple_sentiment(cleaned_text, afinn_dict)
        hului_sent = cs.calculate_simple_sentiment(cleaned_text, huliu_dict)

        # Only keep tweets where all three scores agree.
        consistent = vader_sent == afinn_sent == hului_sent
        if not consistent:
            continue
        # BUGFIX: ``is not None`` instead of truthiness — a requested
        # sentiment of 0 (neutral) was previously never filtered on.
        if needed_sent_val is not None and vader_sent != needed_sent_val:
            continue

        # Retrieve and hash the image.
        image_url = tweet['extended_entities']['media'][0]['media_url']
        img = fetch_image(image_url)
        image_hash = dedupe.calculate_image_hash(img)

        # Ensure not an exact duplicate; record duplicates separately and
        # skip the normal save path.
        match = dedupe.find_matching_hash(image_hash, tweet['id'])
        if match:
            try:
                add_dupe_to_db(tweet, match, vader_sent, image_hash,
                               cleaned_text)
            except Exception as err:
                # Best-effort logging: a failed dupe record should not
                # abort the stream.
                print(err)
            continue

        # Save image and write info to db.
        try:
            add_new_record_to_db(tweet, vader_sent, image_hash, cleaned_text)
            img.save(IMAGE_SAVE_PATH + tweet['id_str'] + '.jpg')
        except Exception as err:
            print(err)
            continue
        num_iters += 1
    return
""" Clean up missing images """ from Python_code import sql_connect as mysql from Python_code.images import find_duplicates as dupes from PIL import Image import os import platform if platform.platform()[:5] == 'Linux': FILE_PATH = '/home/ec2-user/crowdflower_images/' else: FILE_PATH = '/Volumes/NeuralNet/crowdflower_images/' missing_image = Image.open(FILE_PATH + '694552455.jpg') missing_hash = dupes.calculate_image_hash(missing_image) print(missing_hash) file_list = os.listdir(FILE_PATH) def delete_from_mysql(img_id): connection = mysql.connect() with connection.cursor() as cursor: sql = 'DELETE FROM Crowdflower WHERE image_id = %s' cursor.execute(sql, img_id) connection.commit() connection.close() for file in file_list: