def user_frequency_tables(db_source, sample=True, tags_too=False):
    """Build ranked popularity tables from per-user most_used strings.

    Counts each string in the most_used column of user_genres (and, when
    tags_too is set, user_tags) and writes ranked frequency rows into
    genre_popularity / tag_popularity via add_ranks.

    db_source -- path of the source database (handed to deriv_db)
    sample    -- if truthy, restrict counting to user ids in the sample table
    tags_too  -- also build tag_popularity from user_tags

    Returns False if prerequisite tables are missing; otherwise returns None.
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    cursderiv = connderiv.cursor()
    # Both derived tables must exist before we can aggregate them.
    required = ['user_genres', 'user_tags']
    ct = check_tables(cursderiv, required)
    if not ct[0] or not ct[1]:
        for n, r in enumerate(ct):
            if not r:
                print 'Could not find {} table.'.format(required[n])
        print( 'Before calling this function, call user_gt_tables with '
               'path of source database to create necessary tables.')
        return False
    if sample:
        # NOTE(review): the boolean flag is deliberately shadowed here by the
        # set of sampled ids; if the sample table is empty the set is falsy
        # and the unfiltered branch below is taken.
        cursderiv.execute('SELECT id FROM sample')
        sample = {c[0] for c in cursderiv.fetchall()}
    to_do = [('user_genres', 'genre_popularity')]
    if tags_too:
        to_do.append(('user_tags', 'tag_popularity'))
    for usertab, poptab in to_do:
        cursderiv.execute('SELECT user,most_used FROM {}'.format(usertab))
        if sample:
            # Keep only strings belonging to sampled users.
            strings = [s[1] for s in cursderiv.fetchall() if s[0] in sample]
        else:
            strings = [s[1] for s in cursderiv.fetchall()]
        add_data.create_table(cursderiv, poptab)
        sql = ('INSERT INTO {} (string,frequency,rank) '
               'VALUES(?,?,?)'.format(poptab))
        # add_ranks presumably yields (string, frequency, rank) tuples
        # starting at rank 1 -- confirm against its definition.
        cursderiv.executemany(sql, add_ranks(strings, 1))
        connderiv.commit()
def user_gt_tables(db_source): connsourc,connderiv = deriv_db.connect_databases(db_source) curssourc = connsourc.cursor() cursderiv = connderiv.cursor() required=['genres','tags'] ct = check_tables(cursderiv,required) if not ct[0] or not ct[1]: for n,r in enumerate(ct): if not r: print 'Could not find {} table.'.format(required[n]) print ('Before calling this function, call gt_tables with ' 'path of source database to create necessary tables.') return False curssourc.execute('SELECT user_id FROM tracks') users=set(curssourc.fetchall()) for colsourc,tabderiv,ranktable in [('genre','user_genres','genres'), ('tag_list','user_tags','tags')]: print 'Now working with: '+ranktable add_data.create_table(cursderiv,tabderiv) add_data.insert_deriv_data(cursderiv,tabderiv, deriv_user_data(curssourc,cursderiv, users,colsourc,ranktable)) connderiv.commit() return True
def user_frequency_tables(db_source, sample=True, tags_too=False): connsourc,connderiv = deriv_db.connect_databases(db_source) cursderiv = connderiv.cursor() required=['user_genres','user_tags'] ct = check_tables(cursderiv,required) if not ct[0] or not ct[1]: for n,r in enumerate(ct): if not r: print 'Could not find {} table.'.format(required[n]) print ('Before calling this function, call user_gt_tables with ' 'path of source database to create necessary tables.') return False if sample: cursderiv.execute('SELECT id FROM sample') sample={c[0] for c in cursderiv.fetchall()} to_do = [('user_genres','genre_popularity')] if tags_too: to_do.append(('user_tags','tag_popularity')) for usertab,poptab in to_do: cursderiv.execute('SELECT user,most_used FROM {}'.format(usertab)) if sample: strings=[s[1] for s in cursderiv.fetchall() if s[0] in sample] else: strings=[s[1] for s in cursderiv.fetchall()] add_data.create_table(cursderiv,poptab) sql=('INSERT INTO {} (string,frequency,rank) ' 'VALUES(?,?,?)'.format(poptab)) cursderiv.executemany(sql,add_ranks(strings,1)) connderiv.commit()
def gt_tables(db_source):
    """Create the genres and tags frequency tables in the derived database.

    Builds one derived table per source column (genre -> genres,
    tag_list -> tags) via create_gt_table, then commits once.
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    pairs = (('genre', 'genres'), ('tag_list', 'tags'))
    for source_column, target_table in pairs:
        create_gt_table(curssourc, cursderiv, source_column, target_table)
    connderiv.commit()
def copy_tables_across(db_source):
    """Seed the derived database with the core tables from the source.

    Copies the sample table and a processed tracks table across, then
    builds the uploaders and sample-uploaders tables (derived-side only),
    committing everything at the end.

    db_source -- path of the source database (handed to deriv_db).
    """
    source_conn, deriv_conn = deriv_db.connect_databases(db_source)
    source_cursor = source_conn.cursor()
    deriv_cursor = deriv_conn.cursor()
    # Straight copies from source to derived.
    copy_sample_table(source_cursor, deriv_cursor)
    copy_and_process_tracks_table(source_cursor, deriv_cursor)
    # Derived-only tables -- note these take no source cursor.
    create_uploaders_table(deriv_cursor)
    create_sample_uploaders_table(deriv_cursor)
    deriv_conn.commit()
def copy_tables_across(db_source):
    """Copy the core tables from the source database into the derived one.

    Copies the sample table and a processed version of the tracks table,
    then creates the uploaders and sample-uploaders tables on the derived
    side, and commits in one transaction.

    db_source -- path of the source database (handed to deriv_db).
    """
    connsourc,connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    # Straight copies: source -> derived.
    copy_sample_table(curssourc,cursderiv)
    copy_and_process_tracks_table(curssourc,cursderiv)
    # Derived-only tables; presumably built from the freshly copied
    # tracks/sample data -- they take no source cursor.
    create_uploaders_table(cursderiv)
    create_sample_uploaders_table(cursderiv)
    connderiv.commit()
def go( data, folls, db_path='/Users/danielallington/Documents/Research/Electronic_value/data/rand_samp_150k_tracks_boost2.sqlite', output_path='/Users/danielallington/Documents/Research/Electronic_value/data/rs_150_stats/cleaned' ): connsourc, connderiv = deriv_db.connect_databases(db_path) curssourc = connsourc.cursor() cursderiv = connderiv.cursor() print 'Getting sample...' samp = sample(curssourc) print 'Got sample. Separating out users...' # data=user_data(curssourc) uplo = uploaders(data) samp_uplo = samp & uplo print 'Separated out. Figuring out who\'s following who...' foll_samp = {f[1] for f in folls if f[0] in samp} foll_samp_uplo = {f[1] for f in folls if f[0] in samp_uplo} print 'Figured. Ready to go...' print 'Doing overall stats...' output_stats(vital_stats(data, samp, samp_uplo, foll_samp, foll_samp_uplo), output_path, 'basic') print 'Losing unnecessary data...' del (uplo) del (folls) del (foll_samp) del (foll_samp_uplo) data = {d for d in data if d[0] in samp} print 'Lost. Now doing breakdown by city, country, and genre...' for str1, func in [('cities', cities), ('countries', countries)]: for str2, restrict in [('sample', samp), ('sample_uploaders', samp_uplo)]: filtered_data = func(data) fn = '{}_{}'.format(str2, str1) save_data(filtered_data, output_path, fn) output_stats(choice_stats(filtered_data, restrict), output_path, fn) filtered_data = genres(data, cursderiv, samp_uplo) save_data(filtered_data, output_path, 'sample_uploaders_genres') output_stats(choice_stats(filtered_data, samp_uplo), output_path, 'sample_uploaders_genres') print 'Done.'
def gt_tables(db_source, sample_only=False):
    """Create the genres and tags frequency tables in the derived database.

    db_source   -- path of the source database (handed to deriv_db)
    sample_only -- if True, restrict counting to user ids in the derived
                   sample table; otherwise users is None (all users)

    FIX: the sample query runs on cursderiv, but the original fetched the
    results from curssourc -- a cursor that never executed this statement --
    so the users set was wrong. Results are now fetched from cursderiv,
    matching the pattern used elsewhere in this module.
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    if sample_only:
        cursderiv.execute('SELECT id FROM sample')
        users = set(cursderiv.fetchall())
    else:
        users = None
    for colsourc, table in [('genre', 'genres'), ('tag_list', 'tags')]:
        create_gt_table(cursderiv, colsourc, table, users)
    connderiv.commit()
def user_gt_tables(db_source, sample_only=False, tags_too=False):
    """Build per-user genre (and optionally tag) tables in batches.

    Loads all track genre/tag strings into memory keyed by user id, then
    processes users in batches of user_batch (a module-level constant not
    visible in this chunk -- confirm its definition), inserting derived
    per-user rows and committing after every batch.

    db_source   -- path of the source database (handed to deriv_db)
    sample_only -- only process uploaders that are also in the sample table
    tags_too    -- additionally process tag_list into user_tags

    Returns False if any prerequisite table is missing, True on completion.
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()  # NOTE(review): created but unused here
    cursderiv = connderiv.cursor()
    # All prerequisites must already exist in the derived database.
    required = ['sample', 'tracks', 'uploaders', 'genres', 'tags']
    if False in check_tables(cursderiv, required):
        return False
    if sample_only:
        # Uploaders who are also in the sample (set intersection of ids).
        cursderiv.execute('SELECT id FROM sample')
        users = cursderiv.fetchall()
        cursderiv.execute('SELECT id FROM uploaders')
        users = list(set(users).intersection(set(cursderiv.fetchall())))
    else:
        cursderiv.execute('SELECT id FROM uploaders')
        users = cursderiv.fetchall()
    print '{} users to process'.format(len(users))
    to_do = [('genre', 'user_genres', 'genres')]
    if tags_too:
        to_do.append(('tag_list', 'user_tags', 'tags'))
    for colsourc, tabderiv, ranktable in to_do:
        print 'Now working with: ' + ranktable
        add_data.create_table(cursderiv, tabderiv)
        print 'Fresh {} table created.'.format(colsourc)
        print 'Getting track data.'
        # Map user_id -> flat list of genre/tag strings across all tracks.
        tracks = {}
        sql = 'SELECT user_id,{} FROM tracks'.format(colsourc)
        for t in cursderiv.execute(sql):
            l = split_gt_string(t[1])
            # split_gt_string appears to return a list; a falsy first
            # element presumably marks "no data" -- confirm its contract.
            if l[0]:
                try:
                    tracks[t[0]].extend(l)
                except KeyError:
                    tracks[t[0]] = l
        print 'Data loaded in memory.'
        done = 0
        # Batched processing with a commit after each batch, so progress
        # survives an interruption part-way through.
        while done < len(users):
            to_collect = (user_batch if done + user_batch <= len(users)
                          else len(users) - done)
            this_batch = users[done:done + to_collect]
            print 'Starting on a batch of {} users.'.format(to_collect)
            add_data.insert_deriv_data(
                cursderiv, tabderiv,
                deriv_user_data(tracks, cursderiv, this_batch, colsourc,
                                ranktable))
            connderiv.commit()
            done += to_collect
            print '{} done. {} remain.'.format(done, len(users) - done)
    return True
def gt_tables(db_source, sample_only=False):
    """Create the genres and tags frequency tables in the derived database.

    db_source   -- path of the source database (handed to deriv_db)
    sample_only -- if True, restrict counting to user ids in the derived
                   sample table; otherwise users is None (all users)

    FIX: the sample query runs on cursderiv, but the original fetched the
    results from curssourc -- a cursor that never executed this statement --
    so the users set was wrong. Results are now fetched from cursderiv,
    matching the pattern used elsewhere in this module.
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    if sample_only:
        cursderiv.execute('SELECT id FROM sample')
        users = set(cursderiv.fetchall())
    else:
        users = None
    for colsourc, table in [('genre', 'genres'), ('tag_list', 'tags')]:
        create_gt_table(cursderiv, colsourc, table, users)
    connderiv.commit()
def user_gt_tables(db_source, sample_only=False,tags_too=False):
    """Build per-user genre (and optionally tag) tables in batches.

    Loads all track genre/tag strings into memory keyed by user id, then
    processes users in batches of user_batch (a module-level constant not
    visible in this chunk -- confirm its definition), inserting derived
    per-user rows and committing after every batch.

    db_source   -- path of the source database (handed to deriv_db)
    sample_only -- only process uploaders that are also in the sample table
    tags_too    -- additionally process tag_list into user_tags

    Returns False if any prerequisite table is missing, True on completion.
    """
    connsourc,connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()  # NOTE(review): created but unused here
    cursderiv = connderiv.cursor()
    # All prerequisites must already exist in the derived database.
    required=['sample','tracks','uploaders','genres','tags']
    if False in check_tables(cursderiv,required):
        return False
    if sample_only:
        # Uploaders who are also in the sample (set intersection of ids).
        cursderiv.execute('SELECT id FROM sample')
        users=cursderiv.fetchall()
        cursderiv.execute('SELECT id FROM uploaders')
        users=list(set(users).intersection(set(cursderiv.fetchall())))
    else:
        cursderiv.execute('SELECT id FROM uploaders')
        users=cursderiv.fetchall()
    print '{} users to process'.format(len(users))
    to_do = [('genre','user_genres','genres')]
    if tags_too:
        to_do.append(('tag_list','user_tags','tags'))
    for colsourc,tabderiv,ranktable in to_do:
        print 'Now working with: '+ranktable
        add_data.create_table(cursderiv,tabderiv)
        print 'Fresh {} table created.'.format(colsourc)
        print 'Getting track data.'
        # Map user_id -> flat list of genre/tag strings across all tracks.
        tracks={}
        sql='SELECT user_id,{} FROM tracks'.format(colsourc)
        for t in cursderiv.execute(sql):
            l=split_gt_string(t[1])
            # split_gt_string appears to return a list; a falsy first
            # element presumably marks "no data" -- confirm its contract.
            if l[0]:
                try:
                    tracks[t[0]].extend(l)
                except KeyError:
                    tracks[t[0]]=l
        print 'Data loaded in memory.'
        done=0
        # Batched processing with a commit after each batch, so progress
        # survives an interruption part-way through.
        while done < len(users):
            to_collect = (user_batch if done+user_batch <= len(users)
                          else len(users)-done)
            this_batch=users[done:done+to_collect]
            print 'Starting on a batch of {} users.'.format(to_collect)
            add_data.insert_deriv_data(cursderiv,tabderiv,
                                       deriv_user_data(tracks,
                                                       cursderiv,this_batch,
                                                       colsourc,ranktable))
            connderiv.commit()
            done+=to_collect
            print '{} done. {} remain.'.format(done,len(users)-done)
    return True
def add_comment_data(db_source): connsourc,connderiv = deriv_db.connect_databases(db_source) curssourc=connsourc.cursor() cursderiv=connderiv.cursor() corpus_table(cursderiv) ids = get_comment_ids(curssourc) print 'Getting to work on {} comments...'.format(len(ids)) for n,comment in enumerate(comment_data(curssourc,ids)): add_comment_datum(cursderiv,comment) if n == 100: print 'Done the first hundred!' if n % 1000 == 0: connderiv.commit() print 'Committed to db: '+str(n) connderiv.commit() print 'Committed to db: '+str(len(ids))
def add_comment_data(db_source):
    """Copy comment data from the source database into the corpus table.

    Inserts one row per comment id returned by get_comment_ids, committing
    every thousandth row and once more at the end.

    db_source -- path of the source database (handed to deriv_db).
    """
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    corpus_table(cursderiv)
    ids = get_comment_ids(curssourc)
    print 'Getting to work on {} comments...'.format(len(ids))
    for n, comment in enumerate(comment_data(curssourc, ids)):
        add_comment_datum(cursderiv, comment)
        if n == 100:
            # NOTE(review): n is 0-based, so this fires on the 101st comment.
            print 'Done the first hundred!'
        if n % 1000 == 0:
            # Periodic commit (also fires once at n == 0).
            connderiv.commit()
            print 'Committed to db: ' + str(n)
    connderiv.commit()
    print 'Committed to db: ' + str(len(ids))
def add_fave_relations(db_source): connsourc, connderiv = deriv_db.connect_databases(db_source) curssourc = connsourc.cursor() cursderiv = connderiv.cursor() faves_table(cursderiv) unaccounted = 0 done = 0 for fave in fave_relations(curssourc, get_faves(curssourc)): if fave[1]: update_fave_table(cursderiv, fave) done += 1 if done % 1000 == 0: connderiv.commit() print 'Done: ' + str(done) connderiv.commit() print 'Done: ' + str(done)
def add_fave_relations(db_source): connsourc,connderiv = deriv_db.connect_databases(db_source) curssourc=connsourc.cursor() cursderiv=connderiv.cursor() faves_table(cursderiv) unaccounted=0 done=0 for fave in fave_relations(curssourc,get_faves(curssourc)): if fave[1]: update_fave_table(cursderiv,fave) done+=1 if done%1000==0: connderiv.commit() print 'Done: '+str(done) connderiv.commit() print 'Done: '+str(done)
def prepare( db_path='/Users/danielallington/Documents/Research/Electronic_value/data/ten_users.sqlite', output_path='/Users/danielallington/Documents/Research/Electronic_value/data/rs_stats_test' ): connsourc, connderiv = deriv_db.connect_databases(db_path) curssourc = connsourc.cursor() cursderiv = connderiv.cursor() print 'Getting data...' data = sample(curssourc) print 'Got sample.' save_data(data, output_path, 'sample') print 'Getting user data...' data = user_data(curssourc) print 'Got user data.' save_data(data, output_path, 'user_data') print 'Getting follows...' data = follows(curssourc) print 'Got follows.' save_data(data, output_path, 'follow_data') print 'Okay, we\'re done.'