def user_frequency_tables(db_source, sample=True, tags_too=False):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    cursderiv = connderiv.cursor()

    required = ['user_genres', 'user_tags']
    ct = check_tables(cursderiv, required)
    if not ct[0] or not ct[1]:
        for n, r in enumerate(ct):
            if not r: print 'Could not find {} table.'.format(required[n])
        print(
            'Before calling this function, call user_gt_tables with '
            'path of source database to create necessary tables.')
        return False

    if sample:
        cursderiv.execute('SELECT id FROM sample')
        sample = {c[0] for c in cursderiv.fetchall()}

    to_do = [('user_genres', 'genre_popularity')]
    if tags_too: to_do.append(('user_tags', 'tag_popularity'))

    for usertab, poptab in to_do:
        cursderiv.execute('SELECT user,most_used FROM {}'.format(usertab))
        if sample:
            strings = [s[1] for s in cursderiv.fetchall() if s[0] in sample]
        else:
            strings = [s[1] for s in cursderiv.fetchall()]

        add_data.create_table(cursderiv, poptab)
        sql = ('INSERT INTO {} (string,frequency,rank) '
               'VALUES(?,?,?)'.format(poptab))
        cursderiv.executemany(sql, add_ranks(strings, 1))
        connderiv.commit()
def user_gt_tables(db_source):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()

    required = ['genres', 'tags']
    ct = check_tables(cursderiv, required)
    if not ct[0] or not ct[1]:
        for n, r in enumerate(ct):
            if not r: print 'Could not find {} table.'.format(required[n])
        print ('Before calling this function, call gt_tables with '
               'path of source database to create necessary tables.')
        return False

    curssourc.execute('SELECT user_id FROM tracks')
    users = set(curssourc.fetchall())

    for colsourc, tabderiv, ranktable in [('genre', 'user_genres', 'genres'),
                                          ('tag_list', 'user_tags', 'tags')]:
        print 'Now working with: ' + ranktable
        add_data.create_table(cursderiv, tabderiv)
        add_data.insert_deriv_data(cursderiv, tabderiv,
                                   deriv_user_data(curssourc, cursderiv,
                                                   users, colsourc, ranktable))
        connderiv.commit()

    return True
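The error message in user_frequency_tables spells out the intended order: build the per-user tables with user_gt_tables first, then derive the popularity tables. A minimal sketch of that call order, run from the same module as the functions above; the database path is hypothetical.

db_path = '/path/to/source_database.sqlite'  # hypothetical path
if user_gt_tables(db_path):  # creates the user_genres and user_tags tables
    user_frequency_tables(db_path, sample=True, tags_too=True)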
def gt_tables(db_source):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    for colsourc, table in [('genre', 'genres'), ('tag_list', 'tags')]:
        create_gt_table(curssourc, cursderiv, colsourc, table)
        connderiv.commit()
def copy_tables_across(db_source):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    copy_sample_table(curssourc, cursderiv)
    copy_and_process_tracks_table(curssourc, cursderiv)
    create_uploaders_table(cursderiv)
    create_sample_uploaders_table(cursderiv)
    connderiv.commit()
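copy_tables_across and gt_tables each take only the source-database path; between them they populate the derived database with the sample, tracks, uploader, genres, and tags tables that the later functions check for. A minimal sketch, again with a hypothetical path:

db_path = '/path/to/source_database.sqlite'  # hypothetical path
copy_tables_across(db_path)  # sample, tracks and uploader tables
gt_tables(db_path)           # genres and tags tables derived from tracks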
Example #7
def go(
    data,
    folls,
    db_path='/Users/danielallington/Documents/Research/Electronic_value/data/rand_samp_150k_tracks_boost2.sqlite',
    output_path='/Users/danielallington/Documents/Research/Electronic_value/data/rs_150_stats/cleaned'
):
    connsourc, connderiv = deriv_db.connect_databases(db_path)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()

    print 'Getting sample...'

    samp = sample(curssourc)
    print 'Got sample. Separating out users...'
    #    data=user_data(curssourc)

    uplo = uploaders(data)
    samp_uplo = samp & uplo

    print 'Separated out. Figuring out who\'s following who...'
    foll_samp = {f[1] for f in folls if f[0] in samp}
    foll_samp_uplo = {f[1] for f in folls if f[0] in samp_uplo}

    print 'Figured. Ready to go...'

    print 'Doing overall stats...'

    output_stats(vital_stats(data, samp, samp_uplo, foll_samp, foll_samp_uplo),
                 output_path, 'basic')

    print 'Losing unnecessary data...'

    del (uplo)
    del (folls)
    del (foll_samp)
    del (foll_samp_uplo)
    data = {d for d in data if d[0] in samp}

    print 'Lost. Now doing breakdown by city, country, and genre...'

    for str1, func in [('cities', cities), ('countries', countries)]:
        for str2, restrict in [('sample', samp),
                               ('sample_uploaders', samp_uplo)]:
            filtered_data = func(data)
            fn = '{}_{}'.format(str2, str1)
            save_data(filtered_data, output_path, fn)
            output_stats(choice_stats(filtered_data, restrict), output_path,
                         fn)

    filtered_data = genres(data, cursderiv, samp_uplo)
    save_data(filtered_data, output_path, 'sample_uploaders_genres')
    output_stats(choice_stats(filtered_data, samp_uplo), output_path,
                 'sample_uploaders_genres')

    print 'Done.'
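go() expects the user data and follow relations to be passed in rather than queried inside the function (note the commented-out user_data call). One way those arguments could be built, judging from that comment and from prepare() further down; the cursor setup mirrors the body of go() and the path is hypothetical:

db_path = '/path/to/source_database.sqlite'  # hypothetical path
connsourc, connderiv = deriv_db.connect_databases(db_path)
curssourc = connsourc.cursor()
data = user_data(curssourc)   # same call that is commented out inside go()
folls = follows(curssourc)    # follow relations, as gathered in prepare()
go(data, folls, db_path=db_path)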
def gt_tables(db_source, sample_only=False):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    if sample_only:
        cursderiv.execute('SELECT id FROM sample')
        # fetch from the cursor the query was executed on
        users = set(cursderiv.fetchall())
    else:
        users = None
    for colsourc, table in [('genre', 'genres'), ('tag_list', 'tags')]:
        create_gt_table(cursderiv, colsourc, table, users)
        connderiv.commit()
def user_gt_tables(db_source, sample_only=False, tags_too=False):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()

    required = ['sample', 'tracks', 'uploaders', 'genres', 'tags']
    if False in check_tables(cursderiv, required):
        return False

    if sample_only:
        cursderiv.execute('SELECT id FROM sample')
        users = cursderiv.fetchall()
        cursderiv.execute('SELECT id FROM uploaders')
        users = list(set(users).intersection(set(cursderiv.fetchall())))
    else:
        cursderiv.execute('SELECT id FROM uploaders')
        users = cursderiv.fetchall()

    print '{} users to process'.format(len(users))

    to_do = [('genre', 'user_genres', 'genres')]
    if tags_too: to_do.append(('tag_list', 'user_tags', 'tags'))

    for colsourc, tabderiv, ranktable in to_do:
        print 'Now working with: ' + ranktable
        add_data.create_table(cursderiv, tabderiv)
        print 'Fresh {} table created.'.format(colsourc)
        print 'Getting track data.'
        tracks = {}
        sql = 'SELECT user_id,{} FROM tracks'.format(colsourc)
        for t in cursderiv.execute(sql):
            l = split_gt_string(t[1])
            if l[0]:
                try:
                    tracks[t[0]].extend(l)
                except KeyError:
                    tracks[t[0]] = l
        print 'Data loaded in memory.'
        done = 0
        while done < len(users):
            to_collect = (user_batch if done + user_batch <= len(users) else
                          len(users) - done)
            this_batch = users[done:done + to_collect]
            print 'Starting on a batch of {} users.'.format(to_collect)
            add_data.insert_deriv_data(
                cursderiv, tabderiv,
                deriv_user_data(tracks, cursderiv, this_batch, colsourc,
                                ranktable))
            connderiv.commit()
            done += to_collect
            print '{} done. {} remain.'.format(done, len(users) - done)

    return True
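This version checks up front for the sample, tracks, uploaders, genres and tags tables, which implies the preparation order sketched below (user_batch is a module-level batch size that is not shown in this excerpt). The path is hypothetical:

db_path = '/path/to/source_database.sqlite'  # hypothetical path
copy_tables_across(db_path)            # sample, tracks and uploader tables
gt_tables(db_path, sample_only=False)  # genres and tags tables
user_gt_tables(db_path, sample_only=True, tags_too=True)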
Example #13
def add_comment_data(db_source):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    corpus_table(cursderiv)
    ids = get_comment_ids(curssourc)
    print 'Getting to work on {} comments...'.format(len(ids))
    for n, comment in enumerate(comment_data(curssourc, ids)):
        add_comment_datum(cursderiv, comment)
        if n == 100: print 'Done the first hundred!'
        if n % 1000 == 0:
            connderiv.commit()
            print 'Committed to db: ' + str(n)
    connderiv.commit()
    print 'Committed to db: ' + str(len(ids))
Example #14
def add_fave_relations(db_source):
    connsourc, connderiv = deriv_db.connect_databases(db_source)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()
    faves_table(cursderiv)
    unaccounted = 0
    done = 0
    for fave in fave_relations(curssourc, get_faves(curssourc)):
        if fave[1]:
            update_fave_table(cursderiv, fave)
            done += 1
            if done % 1000 == 0:
                connderiv.commit()
                print 'Done: ' + str(done)
    connderiv.commit()
    print 'Done: ' + str(done)
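Both the comment and the favourite helpers take only the source-database path and commit in batches of a thousand as they go. A minimal sketch with a hypothetical path:

db_path = '/path/to/source_database.sqlite'  # hypothetical path
add_comment_data(db_path)    # builds the comment corpus table
add_fave_relations(db_path)  # builds the faves table from the fave relations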
Example #16
def prepare(
    db_path='/Users/danielallington/Documents/Research/Electronic_value/data/ten_users.sqlite',
    output_path='/Users/danielallington/Documents/Research/Electronic_value/data/rs_stats_test'
):
    connsourc, connderiv = deriv_db.connect_databases(db_path)
    curssourc = connsourc.cursor()
    cursderiv = connderiv.cursor()

    print 'Getting data...'
    data = sample(curssourc)
    print 'Got sample.'
    save_data(data, output_path, 'sample')

    print 'Getting user data...'
    data = user_data(curssourc)
    print 'Got user data.'
    save_data(data, output_path, 'user_data')

    print 'Getting follows...'
    data = follows(curssourc)
    print 'Got follows.'
    save_data(data, output_path, 'follow_data')
    print 'Okay, we\'re done.'