def delete_from_mysql(img_id):
    """Remove the Crowdflower row whose image_id matches *img_id*."""
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute('DELETE FROM Crowdflower WHERE image_id = %s', img_id)
    conn.commit()
    conn.close()
def load_image_data():
    """Fetch every row of Image_sizes and return it as a pandas DataFrame."""
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute('SELECT * from Image_sizes')
        rows = cur.fetchall()
    conn.close()
    return pd.DataFrame(rows)
def get_unique_repeats():
    """Return the distinct primary_tweet ids from Duplicate_images as a DataFrame."""
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute('SELECT DISTINCT primary_tweet FROM Duplicate_images')
        repeats = pd.DataFrame(cur.fetchall())
    conn.close()
    return repeats
def process_duplicate_image(match_id, dupe_id, dupe_hash):
    """
    Updates MySQL database for tweets with a duplicated image as follows:
    1. Adds duplicate tweet info to Duplicate_images table linked to matched id
    2. Updates any entries in Duplicate_images that point to moved tweet
    3. Deletes duplicate tweet record from Original_tweets table
       (only if step 1 succeeded -- see BUG FIX note below)
    4. Moves duplicate image to DUPE_IMAGE_PATH
    :param match_id: tweet_id of record to keep in Original_tweets
    :param dupe_id: tweet_id of record to move to Duplicate_images
    :param dupe_hash: hashcode for record being moved
    """
    connection = mysql.connect()
    with connection.cursor() as cursor:
        # Pull the full record that is being demoted to a duplicate.
        sql = "SELECT * FROM Original_tweets " "WHERE tweet_id = %s"
        cursor.execute(sql, int(dupe_id))
        dupe = cursor.fetchone()

        # Re-point any duplicates that referenced the tweet being moved.
        sql = "UPDATE Duplicate_images " \
              "SET primary_tweet = %s " \
              "WHERE primary_tweet = %s"
        cursor.execute(sql, (int(match_id), int(dupe_id)))

        inserted = False
        try:
            sql = (
                "INSERT INTO Duplicate_images ( "
                "tweet_id, primary_tweet, username, text, processed_text, "
                "image_url, tweet_sentiment, created_ts, image_hash, "
                "unclear_sentiment) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            )
            cursor.execute(
                sql,
                (
                    int(dupe["tweet_id"]),
                    int(match_id),
                    dupe["username"],
                    dupe["text"],
                    dupe["processed_text"],
                    dupe["image_url"],
                    int(dupe["tweet_sentiment"]),
                    dupe["created_ts"],
                    dupe_hash,
                    int(dupe["unclear_sentiment"]),
                ),
            )
            inserted = True
        except Exception as err:
            print(str(err) + " on record " + str(dupe_id))

        # BUG FIX: previously the original row was deleted even when the
        # insert above failed, permanently losing the tweet. Only delete
        # once the copy has landed in Duplicate_images.
        if inserted:
            sql = "DELETE FROM Original_tweets " "WHERE tweet_id = %s"
            cursor.execute(sql, int(dupe_id))
    connection.commit()
    connection.close()

    # Move dupe image file alongside the other duplicate images.
    file_name = str(dupe_id) + ".jpg"
    try:
        os.rename(IMAGE_PATH + file_name, DUPE_IMAGE_PATH + file_name)
    except Exception as err:
        print("error on " + file_name)
        print(err)
def add_to_db(image_id, sentiment, unclear_sentiment, image_url):
    """Insert one labelled image record into the Crowdflower table."""
    query = (
        "INSERT INTO Crowdflower ( "
        "image_id, sentiment, unclear_sentiment, image_url)"
        "VALUES (%s, %s, %s, %s)"
    )
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query, (image_id, sentiment, unclear_sentiment, image_url))
    conn.commit()
    conn.close()
def remove_bad_image(tweet_id):
    """
    Removes record from MySQL database
    :param tweet_id: id for tweet to remove
    """
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute('DELETE FROM Original_tweets WHERE tweet_id = %s', tweet_id)
    conn.commit()
    conn.close()
def remove_original_tweet(tweet_id):
    """
    Deletes record from Original_tweets table
    :param tweet_id: id of the tweet to delete (coerced to int)
    """
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute("DELETE FROM Original_tweets WHERE tweet_id = %s",
                    int(tweet_id))
    conn.commit()
    conn.close()
def add_hash_to_sql(tweet_id, hashcode):
    """
    Updates specific record in mySQL with image hashcode
    :param tweet_id: integer
    :param hashcode: string
    """
    query = ("UPDATE Original_tweets "
             "SET image_hash = %s "
             "WHERE tweet_id = %s")
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query, (hashcode, tweet_id))
    conn.commit()
    conn.close()
def get_tweet_list():
    """
    Returns a dataframe containing tweet_ids and image hashcodes
    for all records in Original_tweets table
    :return: pd.DataFrame object
    """
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute("SELECT tweet_id, image_hash FROM Original_tweets")
        frame = pd.DataFrame(cur.fetchall())
    conn.close()
    return frame
def load_tweet_list():
    """Return tweet_id and image_url for every downloaded tweet (capped when TESTING)."""
    query = "SELECT tweet_id, image_url " "FROM Original_tweets"
    if TESTING:
        # Keep test runs small and fast.
        query = query + ' LIMIT 100'
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query)
        tweets = cur.fetchall()
    conn.close()
    return tweets
def write_size(tweet_id, width, height):
    """
    Writes image size to MySQL
    :param tweet_id: id of the tweet the image belongs to
    :param width: image width in pixels
    :param height: image height in pixels
    :return: None
    """
    query = ('INSERT INTO Image_sizes (tweet_id, width, height, pixels) '
             'VALUES (%s, %s, %s, %s)')
    pixels = width * height  # total pixel count stored alongside dimensions
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query, (tweet_id, width, height, pixels))
    conn.commit()
    conn.close()
def get_all_same(tweet_id):
    """Return sentiment rows for a tweet plus all duplicates linked to it."""
    query = ('(SELECT unclear_sentiment, tweet_sentiment '
             'FROM Duplicate_images '
             'WHERE primary_tweet = %s) '
             'UNION ALL'
             '(SELECT unclear_sentiment, tweet_sentiment '
             'FROM Original_tweets '
             'WHERE tweet_id = %s)')
    conn = mysql.connect()
    with conn.cursor() as cur:
        # Same id is bound twice: once per side of the UNION.
        cur.execute(query, (tweet_id, tweet_id))
        frame = pd.DataFrame(cur.fetchall())
    conn.close()
    return frame
def add_new_record_to_db(tweet, sentiment, img_hash, proc_txt):
    """Insert a freshly downloaded tweet into Original_tweets.

    unclear_sentiment is hard-coded to 0 for new records.
    """
    created = prt.convert_twitter_date_to_datetime(tweet['created_at'])
    query = ('INSERT INTO Original_tweets ('
             'tweet_id, username, text, processed_text, image_url, '
             'tweet_sentiment, unclear_sentiment, created_ts, image_hash) '
             'VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s)')
    params = (int(tweet['id']),
              tweet['user']['screen_name'],
              tweet['text'],
              proc_txt,
              tweet['extended_entities']['media'][0]['media_url'],
              sentiment,
              created,
              img_hash)
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query, params)
    conn.commit()
    conn.close()
def find_matching_hash(hashcode, search_tweet_id):
    """
    Queries database to see if exactly matching hashcode exists for image
    If match exists, returns tweet_id of match else returns None
    :param hashcode: hashcode of image being searched on
    :param search_tweet_id: tweet_id of image being searched on
    :return: None or matching tweet_id
    """
    query = ("SELECT tweet_id from Original_tweets "
             "WHERE image_hash = %s "
             "AND tweet_id != %s")
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query, (hashcode, search_tweet_id))
        row = cur.fetchone()
    conn.close()
    # fetchone() gives a dict row or None; unwrap the id when present.
    return row["tweet_id"] if row else row
def load_tweet_list():
    """
    Loads all tweets from MySQL, returns as list of dictionaries
    :return tweet_list: list of dictionaries
    """
    query = "SELECT tweet_id, image_url FROM Original_tweets "
    if TESTING:
        # Small cap keeps test runs quick.
        query += ' LIMIT 50'
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query)
        tweets = cur.fetchall()
    conn.close()
    return tweets
def add_dupe_to_db(dupe_tweet, match_id, dupe_sentiment, img_hash, proc_txt):
    """Insert a duplicate-image tweet into Duplicate_images, linked to match_id.

    unclear_sentiment is hard-coded to 0 for duplicates.
    """
    created = prt.convert_twitter_date_to_datetime(dupe_tweet['created_at'])
    query = ('INSERT INTO Duplicate_images ( '
             'tweet_id, primary_tweet, username, text, processed_text, '
             'image_url, tweet_sentiment, created_ts, image_hash, '
             'unclear_sentiment) '
             'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 0)')
    params = (int(dupe_tweet['id']),
              int(match_id),
              dupe_tweet['user']['screen_name'],
              dupe_tweet['text'],
              proc_txt,
              dupe_tweet['extended_entities']['media'][0]['media_url'],
              dupe_sentiment,
              created,
              img_hash)
    conn = mysql.connect()
    with conn.cursor() as cur:
        cur.execute(query, params)
    conn.commit()
    conn.close()
def get_crowdflower(class_count=1000,
                    image_path='/Volumes/NeuralNet/crowdflower_images/'):
    """Load Crowdflower-labelled images as flattened pixel arrays.

    Pulls up to class_count positive (sentiment = 1) and class_count negative
    (sentiment = -1) clearly-labelled rows, reads each image file and flattens
    it via img_to_flat_matrix.

    :param class_count: max examples per sentiment class
    :param image_path: directory holding <image_id>.jpg files
    :return: (data, labels) -- np.ndarray of images and np.ndarray of sentiments
    """
    # Cast up front: LIMIT cannot be parameterized, and interpolating an
    # unvalidated value into SQL would be an injection vector.
    limit = int(class_count)
    data = []
    connection = mysql.connect()
    with connection.cursor() as cursor:
        sql = ('(SELECT image_id, sentiment FROM Crowdflower '
               'WHERE unclear_sentiment = 0 AND SENTIMENT = 1 '
               'LIMIT {0}) '
               'UNION ALL '
               '(SELECT image_id, sentiment FROM Crowdflower '
               'WHERE unclear_sentiment = 0 AND SENTIMENT = -1 '
               'LIMIT {0}) ').format(limit)
        cursor.execute(sql)
        results = pd.DataFrame(cursor.fetchall())
    connection.close()
    for image in results['image_id']:
        image = image_path + str(image) + '.jpg'
        img = img_to_flat_matrix(image)
        data.append(img)
    data = np.array(data)
    return data, np.array(results['sentiment'])
"""
Clean up database by removing records in duplicate_images with
a null value for unclear_sentiment
Also delete image files
"""
import os

from Python_code import sql_connect as mysql

dupe_path = '/Volumes/NeuralNet/dupe_images/'

connection = mysql.connect()

# Gather the ids of duplicate rows that never got a sentiment label.
with connection.cursor() as cursor:
    cursor.execute('SELECT tweet_id FROM Duplicate_images '
                   'WHERE unclear_sentiment IS NULL')
    null_records = [row['tweet_id'] for row in cursor.fetchall()]

# Delete the image file that belongs to each orphaned record.
for tweet in null_records:
    os.remove(dupe_path + str(tweet) + '.jpg')

# Now drop the rows themselves.
with connection.cursor() as cursor:
    cursor.execute('DELETE FROM Duplicate_images WHERE unclear_sentiment IS NULL')
connection.commit()
connection.close()
def get_data(class_count=1000, image_path=IMAGE_DIR, rand=True):
    """
    put data into Numpy array
    put category data into list
    requires Database pull
    returns equal number of examples per class
    :param class_count: number of examples of each class to keep
    :param image_path: location of image files
    :param rand: Boolean whether to take random selection or just first 1st n
    :return: (data, labels) as numpy arrays
    """
    data = []
    connection = mysql.connect()
    with connection.cursor() as cursor:
        if rand:
            base_sql = ('SELECT tweet_id, tweet_sentiment FROM Original_tweets '
                        'WHERE unclear_sentiment = 0 AND tweet_sentiment = ')
            samples = []
            # One (sentiment, seed) pair per class; the seeds match the
            # original code so the same random sample is reproduced.
            for sentiment, seed in (('0', 3112016),
                                    ('1', 11032016),
                                    ('-1', 1132016)):
                cursor.execute(base_sql + sentiment)
                sub_results = pd.DataFrame(cursor.fetchall())
                if sub_results.empty:
                    # BUG FIX: avoid ZeroDivisionError when a class is empty.
                    continue
                pct_keep = class_count / len(sub_results)
                np.random.seed(seed)
                keep = np.random.uniform(0, 1, len(sub_results)) <= pct_keep
                samples.append(sub_results[keep])
            # BUG FIX: DataFrame.append was removed in pandas 2.0;
            # pd.concat is the supported replacement.
            if samples:
                results = pd.concat(samples, ignore_index=True)
            else:
                results = pd.DataFrame(columns=['tweet_id', 'tweet_sentiment'])
        else:
            # int() guards the string-built LIMIT clauses against injection.
            limit = int(class_count)
            sql = ('(SELECT tweet_id, tweet_sentiment FROM Original_tweets '
                   'WHERE unclear_sentiment = 0 AND tweet_sentiment = 0 '
                   'LIMIT {0}) '
                   'UNION ALL '
                   '(SELECT tweet_id, tweet_sentiment FROM Original_tweets '
                   'WHERE unclear_sentiment = 0 AND tweet_sentiment = 1 '
                   'LIMIT {0}) '
                   'UNION ALL '
                   '(SELECT tweet_id, tweet_sentiment FROM Original_tweets '
                   'WHERE unclear_sentiment = 0 AND tweet_sentiment = -1 '
                   'LIMIT {0})').format(limit)
            cursor.execute(sql)
            results = pd.DataFrame(cursor.fetchall())
    connection.close()
    for image in results['tweet_id']:
        image = image_path + str(image) + '.jpg'
        img = img_to_flat_matrix(image)
        data.append(img)
    data = np.array(data)
    return data, np.array(results['tweet_sentiment'])