예제 #1
0
def fetchsamples(needed_sent_val=None, max_iters=1000):
    """Stream sample tweets and store those whose sentiment scores agree.

    Reads the Twitter sample stream line by line. Each tweet that passes the
    ``prt`` inclusion filters is scored with three sentiment methods (VADER,
    AFINN, Hu-Liu) and kept only when all three scores are equal. The first
    media image of the tweet is then fetched and hashed. If
    ``dedupe.find_matching_hash`` reports a match the tweet is recorded as a
    duplicate; otherwise a new database record is written and the image is
    saved under ``IMAGE_SAVE_PATH`` as ``<id_str>.jpg``.

    Args:
        needed_sent_val: When not ``None``, only tweets whose agreed
            sentiment equals this value are processed further. Falsy values
            such as ``0`` are honoured as a real filter.
        max_iters: Maximum number of new (non-duplicate) records to save
            before the function stops reading the stream.

    Returns:
        None. Results are written to the database and to disk.
    """
    word_list = sh.english_word_list()
    afinn_dict = cs.load_afinn_dictionary('text_sentiment/AFINN-111.txt')
    huliu_dict = \
        cs.load_huliu_dict('text_sentiment/hu_liu/opinion-lexicon-English/')
    url = "https://stream.twitter.com/1/statuses/sample.json"
    parameters = []
    response = ts.twitterreq(url, "GET", parameters)
    num_iters = 0  # counts successfully saved new records, not stream lines

    for line in response:

        # Stop once exactly max_iters records have been saved.
        if num_iters >= max_iters:
            break

        if isinstance(line, bytes):
            line = line.decode('utf-8')

        # decode if not error message; else wait 1 sec to avoid rate limits
        try:
            tweet = json.loads(line.strip())
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; catching only this
            # keeps Ctrl-C and SystemExit working while the stream is read.
            time.sleep(1)
            print('waiting....')
            continue

        # stop processing if tweet doesn't meet basic criteria
        if not prt.decide_to_include_tweet(tweet):
            continue
        if not prt.image_is_original(tweet):
            continue

        # Calculate tweet sentiment
        tweet_txt = tweet['text']
        cleaned_text = sh.parse_sentence(tweet_txt, word_list)
        vader_sent = cs.calculate_vader(cleaned_text)
        afinn_sent = cs.calculate_simple_sentiment(cleaned_text, afinn_dict)
        hului_sent = cs.calculate_simple_sentiment(cleaned_text, huliu_dict)
        consistent = vader_sent == afinn_sent == hului_sent
        if not consistent:
            continue
        # Compare against None explicitly so a falsy target sentiment
        # (e.g. 0) still filters instead of disabling the check.
        if needed_sent_val is not None and vader_sent != needed_sent_val:
            continue

        # retrieve and hash image
        # NOTE(review): assumes the prt filters above guarantee that
        # 'extended_entities' with at least one media entry exists - confirm.
        image_url = tweet['extended_entities']['media'][0]['media_url']
        img = fetch_image(image_url)
        image_hash = dedupe.calculate_image_hash(img)

        # Ensure not an exact duplicate
        match = dedupe.find_matching_hash(image_hash, tweet['id'])
        if match:
            try:
                add_dupe_to_db(tweet, match, vader_sent,
                               image_hash, cleaned_text)
            except Exception as err:
                # Best effort: report the failure and keep streaming.
                print(err)
            continue

        # Save image and write info to db
        try:
            add_new_record_to_db(tweet, vader_sent, image_hash, cleaned_text)
            img.save(IMAGE_SAVE_PATH + tweet['id_str'] + '.jpg')
        except Exception as err:
            # Best effort: a failed record does not count toward max_iters.
            print(err)
            continue
        num_iters += 1

    return
예제 #2
0
"""
Clean up missing images
"""

from Python_code import sql_connect as mysql
from Python_code.images import find_duplicates as dupes
from PIL import Image
import os
import platform

# Pick the image directory for the current host: the ec2-user home on Linux,
# otherwise the mounted NeuralNet volume.
FILE_PATH = (
    '/home/ec2-user/crowdflower_images/'
    if platform.platform().startswith('Linux')
    else '/Volumes/NeuralNet/crowdflower_images/'
)

# Reference image and its hash.
# NOTE(review): presumably this file is the placeholder served for missing
# images, so its hash identifies them - confirm.
missing_image = Image.open(FILE_PATH + '694552455.jpg')
missing_hash = dupes.calculate_image_hash(missing_image)

print(missing_hash)

# Names of all entries currently in the image directory.
file_list = os.listdir(FILE_PATH)


def delete_from_mysql(img_id):
    """Delete the Crowdflower row(s) whose ``image_id`` equals *img_id*.

    Opens a fresh connection with ``mysql.connect()``, runs a parameterized
    DELETE, and commits it. The connection is closed even when the statement
    or the commit raises, so a failed delete does not leak a connection.

    Args:
        img_id: Value matched against the ``image_id`` column.
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'DELETE FROM Crowdflower WHERE image_id = %s'
            # Parameters are passed as a sequence, the form DB-API drivers
            # document for positional placeholders.
            cursor.execute(sql, (img_id,))
        connection.commit()
    finally:
        connection.close()

for file in file_list: