def get_samples(db, limit, offset=0):
    """
    Returns a combined list of negative and positive samples in a (text, label) format.

    Arguments:
    db (str) -- Name of the database to use.
    limit (int) -- Amount of samples to retrieve.

    Keyword Arguments:
    offset (int) -- Where to start getting samples from.
    """

    conn = db_init(db=db)
    cursor = conn.cursor()

    sql = "SELECT text, sentiment FROM item WHERE sentiment = ? LIMIT ? OFFSET ?"

    if limit < 2:
        limit = 2

    if limit > get_sample_limit(db):
        limit = get_sample_limit(db)

    if limit % 2 != 0:
        limit -= 1 #we want an even number

    #half of the samples come from each class, so split limit and offset
    limit = limit / 2
    offset = offset / 2

    cursor.execute(sql, ["negative", limit, offset])
    neg_samples = cursor.fetchall()

    cursor.execute(sql, ["positive", limit, offset])
    pos_samples = cursor.fetchall()

    return pos_samples + neg_samples
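#Usage sketch (hypothetical, not part of the original module). It assumes a
#samples database named 'samples.db' already exists, e.g. one built by the
#fetch() or collect() functions below.
def _example_get_samples():
    #grab a balanced batch of 1000 labeled samples, skipping the first 200
    samples = get_samples('samples.db', 1000, offset=200)
    texts, labels = zip(*samples)
    print("%d samples with labels: %s" % (len(texts), set(labels)))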
def get_sample_limit(db):
    """
    Returns the limit of samples so that both positive and negative samples
    will remain balanced.

    Arguments:
    db (str) -- Name of the database to use.
    """

    #this is an expensive operation in case of a large database
    #therefore we store the limit in redis and use that when we can
    m = RedisManager()
    if 'limit' in m.r.keys():
        return int(m.r.get('limit'))

    db = db_init(db=db)
    cursor = db.cursor()

    cursor.execute("SELECT COUNT(*) FROM item WHERE sentiment = 'positive'")
    pos_count = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM item WHERE sentiment = 'negative'")
    neg_count = cursor.fetchone()[0]

    #the smaller class determines how many samples of each we can use
    if neg_count > pos_count:
        limit = pos_count
    else:
        limit = neg_count

    m.r.set('limit', limit)

    return limit
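#Cache-invalidation sketch (an assumption, not in the original module): the
#limit cached above goes stale once new samples are collected, so dropping the
#Redis key forces a recount on the next get_sample_limit() call.
def _invalidate_sample_limit():
    m = RedisManager()
    m.r.delete('limit') #redis DEL; the next call recomputes the counts from sqlite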
def collect(db_name='', commit_every=1000, max_collect=400000, query_file=''):
    """
    Will continuously populate the sample database, creating a new one
    if it does not exist.

    Keyword Arguments:
    db_name (str) -- Custom name for database.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    query_file (str) -- If a query file is provided it should be an absolute path to a text file.
    """

    if not db_name:
        d = datetime.datetime.now()
        #if no db_name is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    cursor = db.cursor()

    queries = {}

    if query_file:
        if not os.path.exists(query_file):
            return "Query file path does not exist."

        with open(query_file) as f:
            words = [line.strip() for line in f.readlines()]

        label = words[0]
        for w in words:
            queries[w] = label
    else:
        queries[':)'] = 'positive'
        queries[':('] = 'negative'

    #collect on twitter with kral
    g = stream(query_list=queries.keys(), service_list="twitter")

    c = 0
    for item in g:
        text = unicode(item['text'])
        sentiment = queries.get(item['query'], None)

        if sentiment:
            try:
                cursor.execute('INSERT INTO item VALUES (NULL,?,?)', [text, sentiment])
                c += 1

                if c % commit_every == 0:
                    db.commit()
                    print("Committed {}".format(commit_every))

                if c == max_collect:
                    break
            except IntegrityError:
                #skip duplicates
                continue

    db.commit() #commit any remaining rows before closing
    db.close()
def twitter_feed(sentiment, last_id=None, new=False):
    """
    Will return positive and negative tweets from the twitter api.

    What is flagged as positive and negative is currently determined
    by happy/sad faces.
    """

    db = db_init()
    cursor = db.cursor()

    if sentiment == 'positive':
        query = ':)'
    elif sentiment == 'negative':
        query = ':('
    else:
        print('Sentiment must be either positive or negative.')
        return

    last_id_url = "http://search.twitter.com/search.json?lang=en&q=%s&since_id=%s"
    query_url = "http://search.twitter.com/search.json?lang=en&q=%s"

    if not (last_id or new):
        #resume from the newest tweet we already stored for this sentiment
        cursor.execute('SELECT item_id FROM item WHERE sentiment=? ORDER BY item_id DESC LIMIT 1', [sentiment])
        last_id = cursor.fetchone()[0]
        url = last_id_url % (query, last_id)
    elif last_id:
        url = last_id_url % (query, last_id)
    elif new:
        url = query_url % query

    data = []
    try:
        data = json.loads(urllib2.urlopen(url).read())
    except:
        raise

    if data:
        items = data['results']

        cursor.execute('SELECT COUNT(*) FROM item WHERE sentiment = ?', [sentiment])
        total_rows = cursor.fetchone()[0]

        for item in items:
            text = unicode(item['text']) #force unicode for db

            if text:
                item_id = item['id']
                try:
                    cursor.execute('INSERT INTO item VALUES (NULL,?,?,?)', [item_id, text, sentiment])
                    last_id = item_id
                    print sentiment, total_rows, text
                    total_rows += 1
                except sqlite3.IntegrityError:
                    pass #these are duplicates, we don't want duplicates

        db.commit() #persist the newly inserted rows
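#Usage sketch (hypothetical): backfill both labels from the Twitter search API.
#Passing new=True skips the stored last_id and starts from the newest results.
def _example_twitter_feed():
    for label in ('positive', 'negative'):
        twitter_feed(label, new=True)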
def fetch(verbose=True):
    """
    Pre-populates training database from public archive of ~2mil tweets.
    """

    if not verbose:
        logger.setLevel(0)

    response = urllib2.urlopen('https://github.com/downloads/Tawlk/synt/sample_data.bz2')

    total_bytes = int(response.info().getheader('Content-Length').strip())
    saved_bytes = 0
    start_time = time.time()
    last_seconds = 0
    last_seconds_start = 0
    data_buffer = StringIO()
    decompressor = bz2.BZ2Decompressor()

    if os.path.exists(settings.DB_FILE):
        os.remove(settings.DB_FILE)

    db = db_init(create=False)
    db.set_progress_handler(import_progress, 20)

    while True:
        seconds = (time.time() - start_time)
        chunk = response.read(8192)

        if not chunk:
            break

        saved_bytes += len(chunk)
        data_buffer.write(decompressor.decompress(chunk))

        if seconds > 1:
            percent = round((float(saved_bytes) / total_bytes) * 100, 2)

            #average speed based on the bytes downloaded so far
            speed = round((float(saved_bytes) / seconds) / 1024, 2)
            speed_type = 'KB/s'

            if speed > 1000:
                speed = round((float(saved_bytes) / seconds) / 1048576, 2)
                speed_type = 'MB/s'

            if last_seconds >= 0.5:
                last_seconds = 0
                last_seconds_start = time.time()
                logger.info("Downloaded %d of %d MB, %s%s (%0.2f%%)\r" % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type, percent))
            else:
                last_seconds = (time.time() - last_seconds_start)

        if saved_bytes == total_bytes:
            logger.info("Downloaded %d of %d MB, %s%s (100%%)\r" % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type))

            #import the decompressed sql dump into sqlite
            try:
                db.executescript(data_buffer.getvalue())
            except Exception, e:
                logger.error("Sqlite3 import failed with: %s" % e)

            break
def fetch(db_name="samples.db"):
    """
    Pre-populates training database from public archive of ~2mil tweets.
    Stores training database as db_name in ~/.synt/

    Keyword Arguments:
    db_name (str) -- Custom name for database.
    """

    response = urllib2.urlopen("https://github.com/downloads/Tawlk/synt/sample_data.bz2")

    total_bytes = int(response.info().getheader("Content-Length").strip())
    saved_bytes = 0
    start_time = time.time()
    last_seconds = 0
    last_seconds_start = 0
    data_buffer = StringIO()
    decompressor = bz2.BZ2Decompressor()

    fp = os.path.join(os.path.expanduser(config.SYNT_PATH), db_name)
    if os.path.exists(fp):
        os.remove(fp)

    db = db_init(db=db_name, create=False)
    db.set_progress_handler(import_progress, 20)

    while True:
        seconds = time.time() - start_time
        chunk = response.read(8192)

        if not chunk:
            break

        saved_bytes += len(chunk)
        data_buffer.write(decompressor.decompress(chunk))

        if seconds > 1:
            percent = round((float(saved_bytes) / total_bytes) * 100, 2)

            #average speed based on the bytes downloaded so far
            speed = round((float(saved_bytes) / seconds) / 1024, 2)
            speed_type = "KB/s"

            if speed > 1000:
                speed = round((float(saved_bytes) / seconds) / 1048576, 2)
                speed_type = "MB/s"

            if last_seconds >= 0.5:
                last_seconds = 0
                last_seconds_start = time.time()
                print(
                    "Downloaded %d of %d MB, %s%s (%0.2f%%)\r"
                    % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type, percent)
                )
            else:
                last_seconds = time.time() - last_seconds_start

        if saved_bytes == total_bytes:
            print(
                "Downloaded %d of %d MB, %s%s (100%%)\r"
                % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type)
            )

            #import the decompressed sql dump into sqlite
            try:
                db.executescript(data_buffer.getvalue())
            except Exception, e:
                print("Sqlite3 import failed with: %s" % e)

            break
def collect(db_name="", commit_every=1000, max_collect=400000, queries_file=""):
    """
    Will continuously populate the sample database, creating a new one
    if it does not exist.

    Keyword Arguments:
    db_name (str) -- Custom name for database.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    queries_file (str) -- If a queries file is provided it should be a path to a
                          text file containing the queries in the format:

                          label
                          query1
                          queryN
    """

    if not db_name:
        d = datetime.datetime.now()
        #if no db_name is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    cursor = db.cursor()

    queries = {}

    if queries_file:
        try:
            with open(queries_file) as f:
                words = [line.strip() for line in f.readlines()]

            #the first line is the label, the remaining lines are the queries
            label = words[0]
            for w in words[1:]:
                queries[w] = label
        except IOError:
            pass
    else:
        queries[":)"] = "positive"
        queries[":("] = "negative"

    #collect on twitter with kral
    g = stream(query_list=queries.keys(), service_list="twitter")

    c = 0
    for item in g:
        text = unicode(item["text"])
        sentiment = queries.get(item["query"], None)

        if sentiment:
            try:
                cursor.execute("INSERT INTO item VALUES (NULL,?,?)", [text, sentiment])
                c += 1

                if c % commit_every == 0:
                    db.commit()
                    print("Committed {}".format(commit_every))

                if c == max_collect:
                    break
            except IntegrityError:
                #skip duplicates
                continue

    db.commit() #commit any remaining rows before closing
    db.close()
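#Illustration only (not a file shipped with the project): a queries_file in the
#format documented above, where the first line is the label and every remaining
#line is a search query stored under that label:
#
#    positive
#    love
#    awesome
#    :)
#
#A collect() call using such a file might look like the following; the path and
#database name are hypothetical.
def _example_collect_from_file():
    collect(db_name="samples-custom.db", commit_every=500, queries_file="/path/to/positive_queries.txt")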
def fetch(db_name='samples.db'):
    """
    Pre-populates training database from public archive of ~2mil tweets.
    Stores training database as db_name in ~/.synt/

    Keyword Arguments:
    db_name (str) -- Custom name for database.
    """

    response = urllib2.urlopen('https://github.com/downloads/Tawlk/synt/sample_data.bz2')

    total_bytes = int(response.info().getheader('Content-Length').strip())
    saved_bytes = 0
    start_time = time.time()
    last_seconds = 0
    last_seconds_start = 0
    data_buffer = StringIO()
    decompressor = bz2.BZ2Decompressor()

    fp = os.path.join(os.path.expanduser(config.SYNT_PATH), db_name)
    if os.path.exists(fp):
        os.remove(fp)

    db = db_init(db=db_name, create=False)
    db.set_progress_handler(import_progress, 20)

    while True:
        seconds = (time.time() - start_time)
        chunk = response.read(8192)

        if not chunk:
            break

        saved_bytes += len(chunk)
        data_buffer.write(decompressor.decompress(chunk))

        if seconds > 1:
            percent = round((float(saved_bytes) / total_bytes) * 100, 2)

            #average speed based on the bytes downloaded so far
            speed = round((float(saved_bytes) / seconds) / 1024, 2)
            speed_type = 'KB/s'

            if speed > 1000:
                speed = round((float(saved_bytes) / seconds) / 1048576, 2)
                speed_type = 'MB/s'

            if last_seconds >= 0.5:
                last_seconds = 0
                last_seconds_start = time.time()
                print("Downloaded %d of %d MB, %s%s (%0.2f%%)\r" % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type, percent))
            else:
                last_seconds = (time.time() - last_seconds_start)

        if saved_bytes == total_bytes:
            print("Downloaded %d of %d MB, %s%s (100%%)\r" % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type))

            #import the decompressed sql dump into sqlite
            try:
                db.executescript(data_buffer.getvalue())
            except Exception, e:
                print("Sqlite3 import failed with: %s" % e)

            break
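#Usage sketch (hypothetical): pre-populate the default training database under
#~/.synt/ and then read a balanced batch from it with get_samples() above.
def _example_fetch_then_sample():
    fetch(db_name='samples.db')
    samples = get_samples('samples.db', 2000)
    print("fetched %d labeled samples" % len(samples))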
def collect(db_name='', commit_every=1000, max_collect=400000, query_file=''):
    """
    Will continuously populate the sample database, creating a new one
    if it does not exist.

    Keyword Arguments:
    db_name (str) -- Custom name for database.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    query_file (str) -- If a query file is provided it should be an absolute path to a text file.
    """

    #collect requires kral
    try:
        from kral import stream
    except ImportError:
        raise ImportError("Requires the kral package in order to collect.")

    if not db_name:
        d = datetime.datetime.now()
        #if no db_name is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    cursor = db.cursor()

    queries = {}

    if query_file:
        if not os.path.exists(query_file):
            return "Query file path does not exist."

        with open(query_file) as f:
            words = [line.strip() for line in f.readlines()]

        label = words[0]
        for w in words:
            queries[w] = label
    else:
        queries[':)'] = 'positive'
        queries[':('] = 'negative'

    #collect on twitter with kral
    g = stream(query_list=queries.keys(), service_list="twitter")

    c = 0
    for item in g:
        text = unicode(item['text'])
        sentiment = queries.get(item['query'], None)

        if sentiment:
            try:
                cursor.execute('INSERT INTO item VALUES (NULL,?,?)', [text, sentiment])
                c += 1

                if c % commit_every == 0:
                    db.commit()
                    print("Committed {}".format(commit_every))

                if c == max_collect:
                    break
            except IntegrityError:
                #skip duplicates
                continue

    db.commit() #commit any remaining rows before closing
    db.close()