Example #1
0
def delete_from_mysql(img_id):
    """
    Deletes the Crowdflower record(s) whose image_id matches img_id
    :param img_id: value compared against Crowdflower.image_id
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'DELETE FROM Crowdflower WHERE image_id = %s'
            cursor.execute(sql, img_id)
        connection.commit()
    finally:
        # Release the connection even if the statement or commit fails
        connection.close()
Example #2
0
def load_image_data():
    """
    Loads every row of the Image_sizes table
    :return: pd.DataFrame built from the fetched rows
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'SELECT * from Image_sizes'
            cursor.execute(sql)
            image_data = cursor.fetchall()
    finally:
        # Release the connection even if the query fails
        connection.close()
    return pd.DataFrame(image_data)
def get_unique_repeats():
    """
    Loads the distinct primary_tweet values found in Duplicate_images
    :return: pd.DataFrame built from the fetched rows
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'SELECT DISTINCT primary_tweet ' \
                  'FROM Duplicate_images'
            cursor.execute(sql)
            repeat_df = pd.DataFrame(cursor.fetchall())
    finally:
        # Release the connection even if the query fails
        connection.close()
    return repeat_df
Example #4
0
def process_duplicate_image(match_id, dupe_id, dupe_hash):
    """
    Updates MySQL database for tweets with a duplicated image as follows:
    1. Reads the duplicate tweet's record from Original_tweets
    2. Re-points entries in Duplicate_images that reference the moved tweet
       so they reference match_id instead
    3. Adds duplicate tweet info to Duplicate_images linked to match_id
       (a failure here is printed, not raised)
    4. Deletes duplicate tweet record from Original_tweets
    5. Moves duplicate image file from IMAGE_PATH to DUPE_IMAGE_PATH
       (a failure here is printed, not raised)
    :param match_id: tweet_id of record to keep in Original_tweets
    :param dupe_id: tweet_id of record to move to Duplicate_images
    :param dupe_hash: hashcode for record being moved
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = "SELECT * FROM Original_tweets " "WHERE tweet_id = %s"
            cursor.execute(sql, int(dupe_id))
            dupe = cursor.fetchone()

            sql = "UPDATE Duplicate_images " "SET primary_tweet = %s " "WHERE primary_tweet = %s"
            cursor.execute(sql, (int(match_id), int(dupe_id)))

            # NOTE(review): a failed insert (including the case where no
            # record was fetched above) is only printed; the DELETE below
            # still runs, so the tweet's data can be lost. Confirm this
            # best-effort behaviour is intended.
            try:
                sql = (
                    "INSERT INTO Duplicate_images ( "
                    "tweet_id, primary_tweet, username, text, processed_text, "
                    "image_url, tweet_sentiment, created_ts, image_hash, "
                    "unclear_sentiment) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                )
                cursor.execute(
                    sql,
                    (
                        int(dupe["tweet_id"]),
                        int(match_id),
                        dupe["username"],
                        dupe["text"],
                        dupe["processed_text"],
                        dupe["image_url"],
                        int(dupe["tweet_sentiment"]),
                        dupe["created_ts"],
                        dupe_hash,
                        int(dupe["unclear_sentiment"]),
                    ),
                )
            except Exception as err:
                print(str(err) + " on record " + str(dupe_id))

            sql = "DELETE FROM Original_tweets " "WHERE tweet_id = %s"
            cursor.execute(sql, int(dupe_id))
        connection.commit()
    finally:
        # Release the connection even if one of the statements fails
        connection.close()

    # Move dupe image file
    file_name = str(dupe_id) + ".jpg"
    try:
        os.rename(IMAGE_PATH + file_name, DUPE_IMAGE_PATH + file_name)
    except Exception as err:
        print("error on " + file_name)
        print(err)
def add_to_db(image_id, sentiment, unclear_sentiment, image_url):
    """
    Inserts one record into the Crowdflower table
    :param image_id: value for the image_id column
    :param sentiment: value for the sentiment column
    :param unclear_sentiment: value for the unclear_sentiment column
    :param image_url: value for the image_url column
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = (
                "INSERT INTO Crowdflower ( " "image_id, sentiment, unclear_sentiment, image_url)" "VALUES (%s, %s, %s, %s)"
            )
            cursor.execute(sql, (image_id, sentiment, unclear_sentiment, image_url))
        connection.commit()
    finally:
        # Release the connection even if the insert or commit fails
        connection.close()
Example #6
0
def remove_bad_image(tweet_id):
    """
    Removes record from MySQL database (Original_tweets table)
    :param tweet_id: id for tweet to remove
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'DELETE FROM Original_tweets WHERE tweet_id = %s'
            cursor.execute(sql, tweet_id)
        connection.commit()
    finally:
        # Release the connection even if the delete or commit fails
        connection.close()
Example #7
0
def remove_original_tweet(tweet_id):
    """
    Deletes record from Original_tweets table
    :param tweet_id: id of the tweet to delete; converted with int() before
        being passed to the query
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = "DELETE FROM Original_tweets WHERE tweet_id = %s"
            cursor.execute(sql, int(tweet_id))
        connection.commit()
    finally:
        # Release the connection even if the delete or commit fails
        connection.close()
Example #8
0
def add_hash_to_sql(tweet_id, hashcode):
    """
    Updates specific record in Original_tweets with image hashcode
    :param tweet_id: integer
    :param hashcode: string
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = "UPDATE Original_tweets " "SET image_hash = %s " "WHERE tweet_id = %s"
            cursor.execute(sql, (hashcode, tweet_id))
        connection.commit()
    finally:
        # Release the connection even if the update or commit fails
        connection.close()
Example #9
0
def get_tweet_list():
    """
    Returns a dataframe containing tweet_ids and image hashcodes for all
    records in Original_tweets table
    :return: pd.DataFrame object
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = "SELECT tweet_id, image_hash FROM Original_tweets"
            cursor.execute(sql)
            results = pd.DataFrame(cursor.fetchall())
    finally:
        # Release the connection even if the query fails
        connection.close()
    return results
Example #10
0
def load_tweet_list():
    """
    Loads tweet_id and image_url for the records in Original_tweets
    When TESTING is truthy only the first 100 rows are requested
    :return tweet_list: rows as returned by cursor.fetchall()
    """
    # open database connection
    connection = mysql.connect()
    try:
        # pull record id and image url from all downloaded tweets
        with connection.cursor() as cursor:
            sql = "SELECT tweet_id, image_url " \
                  "FROM Original_tweets"
            if TESTING:
                sql = sql + ' LIMIT 100'
            cursor.execute(sql)
            tweet_list = cursor.fetchall()
    finally:
        # Release the connection even if the query fails
        connection.close()
    return tweet_list
Example #11
0
def write_size(tweet_id, width, height):
    """
    Writes image size to MySQL (Image_sizes table)
    The pixels column is stored as width * height
    :param tweet_id: id of the tweet the image belongs to
    :param width: image width
    :param height: image height
    :return: None
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'INSERT INTO Image_sizes (tweet_id, width, height, pixels) ' \
                  'VALUES (%s, %s, %s, %s)'
            cursor.execute(sql, (tweet_id, width, height, width * height))
        # Commit once the cursor is closed, and close the connection last,
        # so the cursor is never torn down after its connection
        connection.commit()
    finally:
        connection.close()
def get_all_same(tweet_id):
    """
    Loads unclear_sentiment and tweet_sentiment for every Duplicate_images
    record whose primary_tweet is tweet_id, together with the Original_tweets
    record whose tweet_id is tweet_id
    :param tweet_id: id of the primary (original) tweet
    :return: pd.DataFrame built from the fetched rows
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = '(SELECT unclear_sentiment, tweet_sentiment ' \
                  'FROM Duplicate_images ' \
                  'WHERE primary_tweet = %s) ' \
                  'UNION ALL' \
                  '(SELECT unclear_sentiment, tweet_sentiment ' \
                  'FROM Original_tweets ' \
                  'WHERE tweet_id = %s)'
            cursor.execute(sql, (tweet_id, tweet_id))
            dupes = pd.DataFrame(cursor.fetchall())
    finally:
        # Release the connection even if the query fails
        connection.close()

    return dupes
Example #13
0
def add_new_record_to_db(tweet, sentiment, img_hash, proc_txt):
    """
    Inserts a tweet into the Original_tweets table
    unclear_sentiment is always written as 0
    :param tweet: tweet dictionary; reads 'created_at', 'id', 'text',
        ['user']['screen_name'] and
        ['extended_entities']['media'][0]['media_url']
    :param sentiment: value for the tweet_sentiment column
    :param img_hash: value for the image_hash column
    :param proc_txt: value for the processed_text column
    """
    timestamp = prt.convert_twitter_date_to_datetime(tweet['created_at'])
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'INSERT INTO Original_tweets (' \
                  'tweet_id, username, text, processed_text, image_url, ' \
                  'tweet_sentiment, unclear_sentiment, created_ts, image_hash) ' \
                  'VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s)'
            cursor.execute(sql, (int(tweet['id']),
                                 tweet['user']['screen_name'],
                                 tweet['text'],
                                 proc_txt,
                                 tweet['extended_entities']['media'][0]['media_url'],
                                 sentiment,
                                 timestamp,
                                 img_hash))
        connection.commit()
    finally:
        # Release the connection even if the insert or commit fails
        connection.close()
Example #14
0
def find_matching_hash(hashcode, search_tweet_id):
    """
    Queries database to see if exactly matching hashcode exists for image
    If match exists, returns tweet_id of match
    else returns None
    :param hashcode: hashcode of image being searched on
    :param search_tweet_id: tweet_id of image being searched on; excluded
        from the search so an image never matches itself
    :return: None or matching tweet_id
    """
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = "SELECT tweet_id from Original_tweets " "WHERE image_hash = %s " "AND tweet_id != %s"
            cursor.execute(sql, (hashcode, search_tweet_id))
            match = cursor.fetchone()
    finally:
        # Release the connection even if the query fails
        connection.close()
    # Only the first matching row is used
    if match:
        match = match["tweet_id"]
    return match
Example #15
0
def load_tweet_list():
    """
    Loads tweet_id and image_url for the records in Original_tweets
    When TESTING is truthy only the first 50 rows are requested
    :return tweet_list: rows as returned by cursor.fetchall()
    """
    # open database connection
    connection = mysql.connect()
    try:
        # pull record id and image url from all downloaded tweets
        with connection.cursor() as cursor:
            # To restrict the pull to the early tweets, append a filter such as
            # "WHERE tweet_id <= 693431781333278720
            #  AND tweet_id > 692068905158770689"
            sql = "SELECT tweet_id, image_url FROM Original_tweets "
            if TESTING:
                sql += ' LIMIT 50'
            cursor.execute(sql)
            tweet_list = cursor.fetchall()
    finally:
        # Release the connection even if the query fails
        connection.close()
    return tweet_list
Example #16
0
def add_dupe_to_db(dupe_tweet, match_id, dupe_sentiment, img_hash, proc_txt):
    """
    Inserts a duplicate tweet into the Duplicate_images table, linked to the
    tweet it duplicates; unclear_sentiment is always written as 0
    :param dupe_tweet: tweet dictionary; reads 'created_at', 'id', 'text',
        ['user']['screen_name'] and
        ['extended_entities']['media'][0]['media_url']
    :param match_id: tweet_id stored in the primary_tweet column
    :param dupe_sentiment: value for the tweet_sentiment column
    :param img_hash: value for the image_hash column
    :param proc_txt: value for the processed_text column
    """
    timestamp = prt.convert_twitter_date_to_datetime(dupe_tweet['created_at'])
    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = 'INSERT INTO Duplicate_images ( ' \
                  'tweet_id, primary_tweet, username, text, processed_text, ' \
                  'image_url, tweet_sentiment, created_ts, image_hash, ' \
                  'unclear_sentiment) ' \
                  'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 0)'
            cursor.execute(sql, (int(dupe_tweet['id']),
                                 int(match_id),
                                 dupe_tweet['user']['screen_name'],
                                 dupe_tweet['text'],
                                 proc_txt,
                                 dupe_tweet['extended_entities']['media'][0]['media_url'],
                                 dupe_sentiment,
                                 timestamp,
                                 img_hash))
        connection.commit()
    finally:
        # Release the connection even if the insert or commit fails
        connection.close()
Example #17
0
def get_crowdflower(class_count=1000,
                    image_path='/Volumes/NeuralNet/crowdflower_images/'):
    """
    Loads Crowdflower images and their sentiment labels
    Pulls up to class_count records with sentiment 1 and up to class_count
    records with sentiment -1 (unclear_sentiment = 0 only), then converts
    each image file <image_path><image_id>.jpg with img_to_flat_matrix
    :param class_count: maximum number of examples of each class to keep
    :param image_path: location of image files
    :return: tuple (np.array of converted images, np.array of sentiments)
    """
    # LIMIT is spliced into the SQL text, so force it to be a plain integer
    limit = str(int(class_count))

    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            sql = '(SELECT image_id, sentiment FROM Crowdflower ' \
                  'WHERE unclear_sentiment = 0 AND SENTIMENT = 1 ' \
                  'LIMIT ' + limit + ') ' \
                  'UNION ALL ' \
                  '(SELECT image_id, sentiment FROM Crowdflower ' \
                  'WHERE unclear_sentiment = 0 AND SENTIMENT = -1 ' \
                  'LIMIT ' + limit + ') '
            cursor.execute(sql)
            results = pd.DataFrame(cursor.fetchall())
    finally:
        # Release the connection even if the query fails
        connection.close()

    data = np.array([img_to_flat_matrix(image_path + str(image) + '.jpg')
                     for image in results['image_id']])
    return data, np.array(results['sentiment'])
"""
Clean up database by removing records in duplicate_images with a null value
for unclear_sentiment

Also delete image files
"""

import os
from Python_code import sql_connect as mysql

connection = mysql.connect()
try:
    # Collect the ids of the records that are about to be removed
    with connection.cursor() as cursor:
        sql = 'SELECT tweet_id FROM Duplicate_images ' \
              'WHERE unclear_sentiment IS NULL'
        cursor.execute(sql)
        null_records = [x['tweet_id'] for x in cursor.fetchall()]

    dupe_path = '/Volumes/NeuralNet/dupe_images/'

    for tweet in null_records:
        file_name = dupe_path + str(tweet) + '.jpg'
        # Skip files that are already gone so the script can be re-run after
        # a partial failure instead of aborting on the first missing image
        if os.path.exists(file_name):
            os.remove(file_name)

    with connection.cursor() as cursor:
        sql = 'DELETE FROM Duplicate_images WHERE unclear_sentiment IS NULL'
        cursor.execute(sql)

    connection.commit()
finally:
    # Release the connection even if a query or file removal fails
    connection.close()
Example #19
0
def get_data(class_count=1000, image_path=IMAGE_DIR, rand=True):
    """
    put data into Numpy array
    put category data into list
    requires Database pull
    Only records with unclear_sentiment = 0 are used. With rand=True each
    record of a class is kept independently with probability
    class_count / (records in that class), using a fixed seed per class, so
    the number kept per class is approximately class_count. With rand=False
    the first class_count records of each class are taken via LIMIT.
    :param class_count: number of examples of each class to keep
    :param image_path: location of image files
    :param rand: Boolean whether to take random selection or just first 1st n
    :return: tuple (np.array of images converted with img_to_flat_matrix,
        np.array of tweet_sentiment values)
    """
    # (sentiment, seed) pairs in the order neutral, positive, negative
    class_seeds = ((0, 3112016), (1, 11032016), (-1, 1132016))

    connection = mysql.connect()
    try:
        with connection.cursor() as cursor:
            if rand:
                sql = 'SELECT tweet_id, tweet_sentiment FROM Original_tweets ' \
                      'WHERE unclear_sentiment = 0 AND tweet_sentiment = '

                samples = []
                for sentiment, seed in class_seeds:
                    cursor.execute(sql + str(sentiment))
                    sub_results = pd.DataFrame(cursor.fetchall())
                    if sub_results.empty:
                        # Nothing to sample; also avoids dividing by zero
                        continue
                    pct_keep = float(class_count) / len(sub_results)
                    np.random.seed(seed)
                    keep = np.random.uniform(0, 1, len(sub_results)) <= pct_keep
                    samples.append(sub_results[keep])

                if samples:
                    # DataFrame.append was removed in pandas 2.0
                    results = pd.concat(samples)
                else:
                    results = pd.DataFrame(
                        columns=['tweet_id', 'tweet_sentiment'])

            else:
                # LIMIT is spliced into the SQL text, so force a plain integer
                limit = str(int(class_count))
                sql = '(SELECT tweet_id, tweet_sentiment FROM Original_tweets ' \
                      'WHERE unclear_sentiment = 0 AND tweet_sentiment = 0 ' \
                      'LIMIT ' + limit + ') ' \
                      'UNION ALL ' \
                      '(SELECT tweet_id, tweet_sentiment FROM Original_tweets ' \
                      'WHERE unclear_sentiment = 0 AND tweet_sentiment = 1 ' \
                      'LIMIT ' + limit + ') ' \
                      'UNION ALL ' \
                      '(SELECT tweet_id, tweet_sentiment FROM Original_tweets ' \
                      'WHERE unclear_sentiment = 0 AND tweet_sentiment = -1 ' \
                      'LIMIT ' + limit + ')'
                cursor.execute(sql)
                results = pd.DataFrame(cursor.fetchall())
    finally:
        # Release the connection even if a query fails
        connection.close()

    data = np.array([img_to_flat_matrix(image_path + str(image) + '.jpg')
                     for image in results['tweet_id']])
    return data, np.array(results['tweet_sentiment'])