def user_frequency_tables(db_source, sample=True, tags_too=False):
    """Fill the popularity table(s) from each user's ``most_used`` value.

    Connects through ``deriv_db.connect_databases(db_source)`` and works on
    the derived database only.  For ``user_genres`` (and ``user_tags`` when
    ``tags_too`` is true) the ``user, most_used`` rows are read, the
    ``most_used`` values are passed to ``add_ranks(values, 1)`` and the
    result is inserted as ``(string, frequency, rank)`` rows into
    ``genre_popularity`` / ``tag_popularity``.

    Args:
        db_source: forwarded unchanged to ``deriv_db.connect_databases``.
        sample: when true, only rows whose ``user`` appears in the ``id``
            column of the ``sample`` table are counted.
        tags_too: when true, the tag tables are processed as well.

    Returns:
        False if ``check_tables`` reports ``user_genres`` or ``user_tags``
        as missing (both are required regardless of ``tags_too``).
        Otherwise the function falls off the end and returns None.
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    cursderiv = connderiv.cursor()
    required = ['user_genres', 'user_tags']
    ct = check_tables(cursderiv, required)
    if not ct[0] or not ct[1]:
        for n, r in enumerate(ct):
            if not r:
                print('Could not find {} table.'.format(required[n]))
        print('Before calling this function, call user_gt_tables with '
              'path of source database to create necessary tables.')
        return False

    # The id set lives in its own name.  Rebinding ``sample`` to the set (as
    # the code used to) made an empty sample table look like "no sampling
    # requested", so every user was counted instead of none.
    sample_ids = None
    if sample:
        cursderiv.execute('SELECT id FROM sample')
        sample_ids = {c[0] for c in cursderiv.fetchall()}

    to_do = [('user_genres', 'genre_popularity')]
    if tags_too:
        to_do.append(('user_tags', 'tag_popularity'))

    for usertab, poptab in to_do:
        cursderiv.execute('SELECT user,most_used FROM {}'.format(usertab))
        rows = cursderiv.fetchall()
        if sample_ids is not None:
            strings = [s[1] for s in rows if s[0] in sample_ids]
        else:
            strings = [s[1] for s in rows]
        add_data.create_table(cursderiv, poptab)
        sql = ('INSERT INTO {} (string,frequency,rank) '
               'VALUES(?,?,?)'.format(poptab))
        # NOTE(review): the literal 1 is presumably a minimum-frequency
        # threshold for add_ranks -- confirm against its definition.
        cursderiv.executemany(sql, add_ranks(strings, 1))
        connderiv.commit()
def user_gt_tables(db_source):
    """Populate the ``user_genres`` and ``user_tags`` derived tables.

    Connections come from ``deriv_db.connect_databases(db_source)``.  The
    distinct ``user_id`` rows of the source ``tracks`` table are handed to
    ``deriv_user_data`` once per (source column, derived table, rank table)
    triple, and whatever it returns is stored with
    ``add_data.insert_deriv_data``.

    Returns False (after printing which table is missing) when
    ``check_tables`` does not find both ``genres`` and ``tags`` in the
    derived database; returns True otherwise.
    """
    conn_src, conn_drv = deriv_db.connect_databases(db_source)
    curssourc = conn_src.cursor()
    cursderiv = conn_drv.cursor()

    needed = ['genres', 'tags']
    found = check_tables(cursderiv, needed)
    if not (found[0] and found[1]):
        for idx, present in enumerate(found):
            if present:
                continue
            print('Could not find {} table.'.format(needed[idx]))
        print('Before calling this function, call gt_tables with '
              'path of source database to create necessary tables.')
        return False

    curssourc.execute('SELECT user_id FROM tracks')
    # fetchall() rows are kept as-is (not unpacked); the set removes repeats.
    users = set(curssourc.fetchall())

    jobs = [
        ('genre', 'user_genres', 'genres'),
        ('tag_list', 'user_tags', 'tags'),
    ]
    for colsourc, tabderiv, ranktable in jobs:
        print('Now working with: ' + ranktable)
        add_data.create_table(cursderiv, tabderiv)
        derived_rows = deriv_user_data(curssourc, cursderiv, users,
                                       colsourc, ranktable)
        add_data.insert_deriv_data(cursderiv, tabderiv, derived_rows)
        conn_drv.commit()
    return True
def create_uploaders_table(cursderiv):
    """Fill ``uploaders`` with the distinct ``user_id`` rows of ``tracks``.

    ``add_data.create_table`` is called for ``uploaders`` first; both the
    read and the insert go through the same cursor.  Nothing is committed
    here.
    """
    add_data.create_table(cursderiv, 'uploaders')
    cursderiv.execute('SELECT user_id FROM tracks')
    # A set of the fetched rows drops users who uploaded more than one track.
    uploader_rows = set(cursderiv.fetchall())
    cursderiv.executemany('INSERT INTO uploaders VALUES(?)', uploader_rows)
def user_frequency_tables(db_source, sample=True, tags_too=False):
    """Fill the popularity table(s) from each user's ``most_used`` value.

    Connects through ``deriv_db.connect_databases(db_source)`` and works on
    the derived database only.  For ``user_genres`` (and ``user_tags`` when
    ``tags_too`` is true) the ``user, most_used`` rows are read, the
    ``most_used`` values are passed to ``add_ranks(values, 1)`` and the
    result is inserted as ``(string, frequency, rank)`` rows into
    ``genre_popularity`` / ``tag_popularity``.

    Args:
        db_source: forwarded unchanged to ``deriv_db.connect_databases``.
        sample: when true, only rows whose ``user`` appears in the ``id``
            column of the ``sample`` table are counted.
        tags_too: when true, the tag tables are processed as well.

    Returns:
        False if ``check_tables`` reports ``user_genres`` or ``user_tags``
        as missing (both are required regardless of ``tags_too``).
        Otherwise the function falls off the end and returns None.
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    cursderiv = connderiv.cursor()
    required = ['user_genres', 'user_tags']
    ct = check_tables(cursderiv, required)
    if not ct[0] or not ct[1]:
        for n, r in enumerate(ct):
            if not r:
                print('Could not find {} table.'.format(required[n]))
        print(
            'Before calling this function, call user_gt_tables with '
            'path of source database to create necessary tables.')
        return False

    # Keep the boolean flag and the id set apart: previously ``sample`` was
    # rebound to the set, so an empty sample table evaluated as false and
    # the filter below was skipped, counting every user instead of none.
    sample_ids = None
    if sample:
        cursderiv.execute('SELECT id FROM sample')
        sample_ids = {c[0] for c in cursderiv.fetchall()}

    to_do = [('user_genres', 'genre_popularity')]
    if tags_too:
        to_do.append(('user_tags', 'tag_popularity'))

    for usertab, poptab in to_do:
        cursderiv.execute('SELECT user,most_used FROM {}'.format(usertab))
        rows = cursderiv.fetchall()
        if sample_ids is not None:
            strings = [s[1] for s in rows if s[0] in sample_ids]
        else:
            strings = [s[1] for s in rows]
        add_data.create_table(cursderiv, poptab)
        sql = ('INSERT INTO {} (string,frequency,rank) '
               'VALUES(?,?,?)'.format(poptab))
        # NOTE(review): the literal 1 is presumably a minimum-frequency
        # threshold for add_ranks -- confirm against its definition.
        cursderiv.executemany(sql, add_ranks(strings, 1))
        connderiv.commit()
def create_uploaders_table(cursderiv):
    """Copy every distinct ``user_id`` row from ``tracks`` into ``uploaders``.

    The ``uploaders`` table is set up with ``add_data.create_table`` before
    the copy.  The caller is responsible for committing.
    """
    select_uploaders = 'SELECT user_id FROM tracks'
    insert_uploader = 'INSERT INTO uploaders VALUES(?)'

    add_data.create_table(cursderiv, 'uploaders')
    cursderiv.execute(select_uploaders)
    # Duplicate rows (one per track) collapse to one entry per uploader.
    distinct_rows = set(cursderiv.fetchall())
    cursderiv.executemany(insert_uploader, distinct_rows)
def copy_and_process_tracks_table(curssourc, cursderiv):
    """Copy ``tracks`` from the source cursor to the derived cursor.

    Each source row is passed through ``process_track_datum`` before being
    inserted.  The INSERT statement carries exactly 40 placeholders, so the
    processed rows must have 40 values.  Nothing is committed here.
    """
    add_data.create_table(cursderiv, 'tracks')
    placeholders = ','.join(['?'] * 40)
    insert_sql = 'INSERT INTO tracks VALUES({})'.format(placeholders)
    source_rows = curssourc.execute('SELECT * FROM tracks')
    # Generator keeps the copy streaming instead of loading every track.
    cursderiv.executemany(
        insert_sql,
        (process_track_datum(row) for row in source_rows))
def create_sample_uploaders_table(cursderiv):
    """Fill ``sample_uploaders`` with ids found in both ``sample`` and
    ``uploaders``.

    ``add_data.create_table`` is called for ``sample_uploaders`` first.  The
    intersection is computed in Python on the fetched rows.  Nothing is
    committed here.
    """
    add_data.create_table(cursderiv, 'sample_uploaders')

    cursderiv.execute('SELECT id FROM sample')
    sampled = set(cursderiv.fetchall())

    cursderiv.execute('SELECT id FROM uploaders')
    uploaders = set(cursderiv.fetchall())

    in_both = sampled & uploaders
    cursderiv.executemany('INSERT INTO sample_uploaders VALUES(?)', in_both)
def create_sample_uploaders_table(cursderiv):
    """Store the sampled users who are also uploaders.

    Reads the ``id`` rows of ``sample`` and of ``uploaders`` through the
    given cursor, intersects them, and inserts the common rows into
    ``sample_uploaders`` (set up beforehand via ``add_data.create_table``).
    The caller is responsible for committing.
    """
    select_sample = 'SELECT id FROM sample'
    select_uploaders = 'SELECT id FROM uploaders'
    insert_row = 'INSERT INTO sample_uploaders VALUES(?)'

    add_data.create_table(cursderiv, 'sample_uploaders')
    cursderiv.execute(select_sample)
    sample_rows = set(cursderiv.fetchall())
    cursderiv.execute(select_uploaders)
    uploader_rows = set(cursderiv.fetchall())
    cursderiv.executemany(insert_row, sample_rows & uploader_rows)
def user_gt_tables(db_source, sample_only=False, tags_too=False):
    """Fill ``user_genres`` (and ``user_tags`` when ``tags_too`` is true).

    Connections come from ``deriv_db.connect_databases(db_source)``; all
    reads and writes below use the derived database.  The users processed
    are the ``id`` rows of ``uploaders``, intersected with the ``id`` rows
    of ``sample`` when ``sample_only`` is true.  For each table, the
    relevant ``tracks`` column is loaded into a dict keyed by ``user_id``
    and the users are handed to ``deriv_user_data`` in slices of at most
    ``user_batch``, with a commit after every slice.

    Returns False if ``check_tables`` reports any of 'sample', 'tracks',
    'uploaders', 'genres' or 'tags' as missing; True otherwise.
    """
    conn_src, conn_drv = deriv_db.connect_databases(db_source)
    curssourc = conn_src.cursor()  # opened but not used further down
    cursderiv = conn_drv.cursor()

    needed = ['sample', 'tracks', 'uploaders', 'genres', 'tags']
    if False in check_tables(cursderiv, needed):
        return False

    if not sample_only:
        cursderiv.execute('SELECT id FROM uploaders')
        users = cursderiv.fetchall()
    else:
        cursderiv.execute('SELECT id FROM sample')
        sampled = set(cursderiv.fetchall())
        cursderiv.execute('SELECT id FROM uploaders')
        users = list(sampled.intersection(set(cursderiv.fetchall())))
    total = len(users)
    print('{} users to process'.format(total))

    jobs = [('genre', 'user_genres', 'genres')]
    if tags_too:
        jobs.append(('tag_list', 'user_tags', 'tags'))

    for colsourc, tabderiv, ranktable in jobs:
        print('Now working with: ' + ranktable)
        add_data.create_table(cursderiv, tabderiv)
        print('Fresh {} table created.'.format(colsourc))
        print('Getting track data.')

        # user_id -> every string split_gt_string produced for that user.
        tracks = {}
        query = 'SELECT user_id,{} FROM tracks'.format(colsourc)
        for row in cursderiv.execute(query):
            pieces = split_gt_string(row[1])
            if not pieces[0]:
                continue
            if row[0] in tracks:
                tracks[row[0]].extend(pieces)
            else:
                tracks[row[0]] = pieces
        print('Data loaded in memory.')

        done = 0
        while done < total:
            # The last slice may be shorter than user_batch.
            to_collect = min(user_batch, total - done)
            this_batch = users[done:done + to_collect]
            print('Starting on a batch of {} users.'.format(to_collect))
            derived_rows = deriv_user_data(tracks, cursderiv, this_batch,
                                           colsourc, ranktable)
            add_data.insert_deriv_data(cursderiv, tabderiv, derived_rows)
            conn_drv.commit()
            done += to_collect
            print('{} done. {} remain.'.format(done, total - done))
    return True
def create_gt_table(curssourc, cursderiv, colsourc, tabderiv):
    """Build the ``genres`` or ``tags`` ranking table in the derived database.

    Rows come from ``all_genres(curssourc)`` when ``tabderiv`` is 'genres'
    and from ``all_tags(curssourc)`` for any other value.  The first field
    of every row, when truthy, is expanded with
    ``strings_from_string(field, colsourc)``.  The collected strings are
    passed to ``add_ranks`` together with ``genre_threshold`` or
    ``tag_threshold`` and the result is inserted as
    ``(string, frequency, rank)`` rows.  Nothing is committed here.
    """
    add_data.create_table(cursderiv, tabderiv)
    is_genres = tabderiv == 'genres'
    if is_genres:
        entries = all_genres(curssourc)
    else:
        entries = all_tags(curssourc)

    # Rows whose first field is empty/NULL contribute nothing.
    collected = [
        piece
        for entry in entries if entry[0]
        for piece in strings_from_string(entry[0], colsourc)
    ]

    insert_sql = ('INSERT INTO {} (string,frequency,rank) '
                  'VALUES(?,?,?)'.format(tabderiv))
    if is_genres:
        thresh = genre_threshold
    else:
        thresh = tag_threshold
    cursderiv.executemany(insert_sql, add_ranks(collected, thresh))
def user_gt_tables(db_source, sample_only=False, tags_too=False):
    """Fill ``user_genres`` (and ``user_tags`` when ``tags_too`` is true).

    Connections come from ``deriv_db.connect_databases(db_source)``; all
    reads and writes below use the derived database.  The users processed
    are the ``id`` rows of ``uploaders``, intersected with the ``id`` rows
    of ``sample`` when ``sample_only`` is true.  For each table, the
    relevant ``tracks`` column is loaded into a dict keyed by ``user_id``
    and the users are handed to ``deriv_user_data`` in slices of at most
    ``user_batch``, with a commit after every slice.

    Returns False if ``check_tables`` reports any of 'sample', 'tracks',
    'uploaders', 'genres' or 'tags' as missing; True otherwise.
    """
    conn_src, conn_drv = deriv_db.connect_databases(db_source)
    curssourc = conn_src.cursor()  # opened but not used further down
    cursderiv = conn_drv.cursor()

    needed = ['sample', 'tracks', 'uploaders', 'genres', 'tags']
    if False in check_tables(cursderiv, needed):
        return False

    if not sample_only:
        cursderiv.execute('SELECT id FROM uploaders')
        users = cursderiv.fetchall()
    else:
        cursderiv.execute('SELECT id FROM sample')
        sampled = set(cursderiv.fetchall())
        cursderiv.execute('SELECT id FROM uploaders')
        users = list(sampled.intersection(set(cursderiv.fetchall())))
    total = len(users)
    print('{} users to process'.format(total))

    jobs = [('genre', 'user_genres', 'genres')]
    if tags_too:
        jobs.append(('tag_list', 'user_tags', 'tags'))

    for colsourc, tabderiv, ranktable in jobs:
        print('Now working with: ' + ranktable)
        add_data.create_table(cursderiv, tabderiv)
        print('Fresh {} table created.'.format(colsourc))
        print('Getting track data.')

        # user_id -> every string split_gt_string produced for that user.
        tracks = {}
        query = 'SELECT user_id,{} FROM tracks'.format(colsourc)
        for row in cursderiv.execute(query):
            pieces = split_gt_string(row[1])
            if not pieces[0]:
                continue
            if row[0] in tracks:
                tracks[row[0]].extend(pieces)
            else:
                tracks[row[0]] = pieces
        print('Data loaded in memory.')

        done = 0
        while done < total:
            # The last slice may be shorter than user_batch.
            to_collect = min(user_batch, total - done)
            this_batch = users[done:done + to_collect]
            print('Starting on a batch of {} users.'.format(to_collect))
            derived_rows = deriv_user_data(tracks, cursderiv, this_batch,
                                           colsourc, ranktable)
            add_data.insert_deriv_data(cursderiv, tabderiv, derived_rows)
            conn_drv.commit()
            done += to_collect
            print('{} done. {} remain.'.format(done, total - done))
    return True
def create_gt_table(cursderiv, colsourc, tabderiv, users):
    """Build the ``genres`` or ``tags`` ranking table.

    Rows come from ``all_genres(cursderiv)`` when ``tabderiv`` is 'genres'
    and from ``all_tags(cursderiv)`` for any other value.  When ``users`` is
    non-empty, rows whose first field is not in ``users`` are skipped.  The
    second field of each remaining row, when truthy, is expanded with
    ``split_gt_string``.  The collected strings go to ``add_ranks`` with
    ``genre_threshold`` or ``tag_threshold`` and the result is inserted as
    ``(string, frequency, rank)`` rows.  ``colsourc`` is accepted but not
    used.  Nothing is committed here.
    """
    add_data.create_table(cursderiv, tabderiv)
    is_genres = tabderiv == 'genres'
    if is_genres:
        entries = all_genres(cursderiv)
    else:
        entries = all_tags(cursderiv)

    collected = []
    for entry in entries:
        # presumably entry is (user id, genre/tag string) -- confirm against
        # all_genres / all_tags.
        if users and entry[0] not in users:
            continue
        if entry[1]:
            collected.extend(split_gt_string(entry[1]))

    insert_sql = ('INSERT INTO {} (string,frequency,rank) '
                  'VALUES(?,?,?)'.format(tabderiv))
    if is_genres:
        thresh = genre_threshold
    else:
        thresh = tag_threshold
    cursderiv.executemany(insert_sql, add_ranks(collected, thresh))
def create_gt_table(cursderiv, colsourc, tabderiv, users):
    """Build the ``genres`` or ``tags`` ranking table.

    Rows are taken from ``all_genres(cursderiv)`` for 'genres' and from
    ``all_tags(cursderiv)`` otherwise.  A row is used when ``users`` is
    empty or contains the row's first field, and the row's second field is
    truthy; that field is then expanded with ``split_gt_string``.  The
    resulting strings are ranked by ``add_ranks`` (with ``genre_threshold``
    or ``tag_threshold``) and inserted as ``(string, frequency, rank)``.
    ``colsourc`` is accepted but not used.  The caller must commit.
    """
    add_data.create_table(cursderiv, tabderiv)
    wants_genres = tabderiv == 'genres'
    entries = all_genres(cursderiv) if wants_genres else all_tags(cursderiv)

    # presumably each row is (user id, genre/tag string) -- confirm against
    # all_genres / all_tags.
    collected = [
        piece
        for row in entries
        if (not users or row[0] in users) and row[1]
        for piece in split_gt_string(row[1])
    ]

    insert_sql = ('INSERT INTO {} (string,frequency,rank) '
                  'VALUES(?,?,?)'.format(tabderiv))
    thresh = genre_threshold if wants_genres else tag_threshold
    cursderiv.executemany(insert_sql, add_ranks(collected, thresh))
def go_for_it(to_process_filepath, db_path, to_backup=True):
    """Collect tracks for every user id listed in a file, resumably.

    User ids are read one per line from ``to_process_filepath``.  Ids that
    are already recorded in the progress file (whose path comes from
    ``get_processed_filepath(db_path)``) are skipped, so an interrupted run
    can be restarted with the same arguments.  For each remaining user
    ``get_tracks(curs, user)`` is called, the id is appended to the progress
    file and the database is committed.

    Args:
        to_process_filepath: text file with one integer user id per line;
            blank lines are ignored.
        db_path: path of the SQLite database; a ``tracks`` table is created
            via ``ad.create_table`` if ``grs.check_tables`` does not find it.
        to_backup: when truthy, ``backup(db_path)`` is called before user
            0, 10000, 20000, ... and once more at the end.

    Returns:
        True once every user has been handled.
    """
    conn = sqlite3.connect(db_path)
    curs = conn.cursor()
    if not grs.check_tables(curs, ['tracks'])[0]:
        print('Creating tracks table.')
        ad.create_table(curs, 'tracks')

    with open(to_process_filepath, 'r') as to_process:
        users = [int(u) for u in to_process if u.strip()]

    processed_filepath = get_processed_filepath(db_path)
    if not os.path.exists(processed_filepath):
        open(processed_filepath, 'w').close()
    with open(processed_filepath, 'r') as processed:
        already_done = {int(u) for u in processed if u.strip()}
    # One pass with a set lookup instead of list.remove() per id: linear
    # rather than quadratic, and an id that is in the progress file but not
    # in the to-do list no longer raises ValueError.
    users = [u for u in users if u not in already_done]

    print('There are {} users to munch through. Here we go!'.format(
        len(users)))
    processed = open(processed_filepath, 'a')
    try:
        for n, user in enumerate(users):
            if to_backup and n % 10000 == 0:
                print('Backing up...')
                # Closed around the backup and reopened afterwards.
                processed.close()
                backup(db_path)
                processed = open(processed_filepath, 'a')
            get_tracks(curs, user)
            processed.write('{}\n'.format(user))
            processed.flush()  # Otherwise, restart becomes unreliable
            conn.commit()
            # Parentheses matter: ``n + 1 % 100`` is ``n + (1 % 100)``,
            # which is never 0, so the hundreds were never reported.
            if n < 10 or (n + 1) % 100 == 0:
                print('{} done.'.format(n + 1))
            if n == 10:
                print('Only reporting hundreds from now on.')
    finally:
        # Always release the progress file, including on errors and when
        # to_backup is false.
        processed.close()

    if to_backup:
        print('Backing up...')
        backup(db_path)
    print("And we're done.")
    return True
def create_new_tracks_table(curs):
    """Ask ``add_data.create_table`` to set up the ``tracks`` table.

    ``curs`` is passed through unchanged; nothing is returned.
    """
    table_name = 'tracks'
    add_data.create_table(curs, table_name)
def export_data_table(cursor, table_data, table_name):
    """Create ``table_name`` and load ``table_data`` into it.

    Logs an INFO message, calls ``ad.create_table`` and then hands the data
    to ``ad.insert_tuple_data_set_into_DB``.  Nothing is committed here.
    """
    message = 'Creating ' + table_name + ' table in DB.'
    logging.info(message)
    # Per the original author's note, create_table only creates the table
    # when it does not exist yet.
    ad.create_table(cursor, table_name)
    ad.insert_tuple_data_set_into_DB(cursor, table_name, table_data)
def create_tables(curs):
    """Call ``ad.create_table`` once for each table this module works with.

    The tables are handled in the order listed below.
    """
    table_names = (
        'users',
        'tracks',
        'comments',
        'x_follows_y',
        'ids_tried',
        'track_ids_tried',
        'comment_ids_tried',
        'sample',
    )
    for name in table_names:
        ad.create_table(curs, name)
def corpus_table(cursderiv):
    """Ask ``add_data.create_table`` to set up the ``comments_corp`` table.

    ``cursderiv`` is passed through unchanged; nothing is returned.
    """
    table_name = 'comments_corp'
    add_data.create_table(cursderiv, table_name)
def corpus_table(cursderiv):
    """Set up the ``comments_corp`` table via ``add_data.create_table``.

    The cursor is forwarded as given; the function returns None.
    """
    corpus_table_name = 'comments_corp'
    add_data.create_table(cursderiv, corpus_table_name)
def copy_and_process_tracks_table(curssourc, cursderiv):
    """Stream ``tracks`` rows from the source into the derived database.

    Every row selected through ``curssourc`` is transformed by
    ``process_track_datum`` and inserted through ``cursderiv``.  The INSERT
    statement is built with 40 ``?`` placeholders, so the processed rows
    must carry 40 values.  The caller is responsible for committing.
    """
    add_data.create_table(cursderiv, 'tracks')
    column_count = 40
    placeholders = ','.join('?' for _ in range(column_count))
    insert_sql = 'INSERT INTO tracks VALUES({})'.format(placeholders)
    source_rows = curssourc.execute('SELECT * FROM tracks')
    # Lazily processed so the whole table never sits in memory at once.
    processed_rows = (process_track_datum(row) for row in source_rows)
    cursderiv.executemany(insert_sql, processed_rows)
def create_new_tracks_table(curs):
    """Set up the ``tracks`` table via ``add_data.create_table``.

    The cursor is forwarded as given; the function returns None.
    """
    tracks_table = 'tracks'
    add_data.create_table(curs, tracks_table)
def create_table(self, table_name):
    """Create ``table_name`` via ``ad.create_table`` on this object's cursor.

    Uses ``self.curs``; nothing is returned.
    """
    cursor = self.curs
    ad.create_table(cursor, table_name)
def faves_table(cursderiv):
    """Ask ``add_data.create_table`` to set up ``x_faves_work_of_y``.

    ``cursderiv`` is passed through unchanged; nothing is returned.
    """
    table_name = 'x_faves_work_of_y'
    add_data.create_table(cursderiv, table_name)
def copy_sample_table(curssourc, cursderiv):
    """Copy the ``id`` column of ``sample`` from source to derived database.

    ``add_data.create_table`` is called for ``sample`` on the derived cursor
    first.  The rows selected through ``curssourc`` are fed straight into
    ``executemany`` on ``cursderiv``.  Nothing is committed here.
    """
    add_data.create_table(cursderiv, 'sample')
    source_ids = curssourc.execute('SELECT id FROM sample')
    cursderiv.executemany('INSERT INTO sample VALUES(?)', source_ids)
def create_tables(curs):
    """Call ``ad.create_table`` for ``users``, ``tracks`` and ``comments``.

    The tables are handled in that order, all through the given cursor.
    """
    table_names = ('users', 'tracks', 'comments')
    for name in table_names:
        ad.create_table(curs, name)
def faves_table(cursderiv):
    """Set up the ``x_faves_work_of_y`` table via ``add_data.create_table``.

    The cursor is forwarded as given; the function returns None.
    """
    faves_table_name = 'x_faves_work_of_y'
    add_data.create_table(cursderiv, faves_table_name)
def create_tables(curs):
    """Set up the ``users``, ``tracks`` and ``comments`` tables.

    Each name is passed, in that order, to ``ad.create_table`` together
    with the given cursor.
    """
    wanted = ('users', 'tracks', 'comments')
    for table_name in wanted:
        ad.create_table(curs, table_name)
def copy_sample_table(curssourc, cursderiv):
    """Duplicate the ``sample`` ids from the source cursor's database.

    The derived ``sample`` table is set up with ``add_data.create_table``.
    Each ``id`` row read through ``curssourc`` is then inserted through
    ``cursderiv``.  The caller is responsible for committing.
    """
    select_ids = 'SELECT id FROM sample'
    insert_id = 'INSERT INTO sample VALUES(?)'

    add_data.create_table(cursderiv, 'sample')
    rows = curssourc.execute(select_ids)
    cursderiv.executemany(insert_id, rows)