def constructGraph( conn, p, song_features_table_name, artist_similarities_table_name, graph_table_name, debug_print=False ):
    """Rebuild the neighbor-graph table and collect node/edge statistics.

    For every song in `song_features_table_name` (the "focal" song) this
    gathers candidate neighbors from two sources, measures the distance of
    each candidate to the focal song, and inserts the closest ones into
    `graph_table_name` as edges in both directions.

    Candidate sources:
      * 'topo'     - songs of the focal song's artist plus songs of every
                     artist listed as `similaro` for that artist in
                     `artist_similarities_table_name`;
      * 'features' - songs whose value for at least one column in
                     p['relevant_features'] lies within a window centred on
                     the focal song's value.

    Keys of `p` read here: 'invkey' (column name -> index into a feature
    row), 'relevant_features', 'unit_vec',
    'construct_neighborhood_metric_diameter', 'distance_type_construct',
    'maximal_overall_distance', 'neighborhood_types', 'max_MXG_neighbors'.

    Args:
        conn: open DB-API connection; a cursor is taken from it.
        p: parameter dictionary (see keys above).
        song_features_table_name: table holding one feature row per song.
        artist_similarities_table_name: table with `targeto`/`similaro`
            artist-id columns.
        graph_table_name: table that is reset and then filled with
            (targeto, similaro, dist) rows.
        debug_print: when true, print the focal song and its topological
            candidates.

    Returns:
        (node_stats, edge_stats): dictionaries of lists. node_stats gets one
        entry per focal song under each of its keys; edge_stats gets one
        'dist' and one 'vdist' entry per selected neighbor.

    NOTE(review): the original tail of this function referenced the
    undefined names `neighboorhoodtypes` and `max_MXG_neighbors` and indexed
    a flat list by neighborhood type. It is reconstructed here using
    p['neighborhood_types'] and p['max_MXG_neighbors'], matching the other
    `constructGraph` in this file - confirm that this is the intended
    configuration.
    NOTE(review): a second `constructGraph` with a different signature is
    defined later in this file; the later definition replaces this one when
    the module is loaded.
    NOTE(review): SQL text is assembled by string concatenation, so ids
    containing a quote character will break the statements.
    """
    cur = conn.cursor()

    def fetch_rows(query):
        # Execute one statement and return every row it produced.
        cur.execute(query)
        return cur.fetchall()

    def song_ids_of_artist(artist_id):
        # Plain list of song ids (not row tuples) for one artist.
        rows = fetch_rows(
            "SELECT song_id FROM " + song_features_table_name
            + " WHERE artist_id='" + artist_id + "';")
        return [row[0] for row in rows]

    def features_of_song(song_id):
        # All feature rows stored for one song id.
        return fetch_rows(
            "SELECT * FROM " + song_features_table_name
            + " WHERE song_id='" + song_id + "';")

    MX_common.resetTable( conn, graph_table_name, ['targeto text', 'similaro text, dist real'] )

    # read songs_list
    songs_list = fetch_rows("SELECT song_id FROM " + song_features_table_name + ";")

    node_stats = {
        'neighbors_cur_artist': [],
        'neighbors_topo': [],
        'neighbors_metric': [],
        'mean_dist': [],
        'similar_artists': [],
    }
    edge_stats = {
        'dist': [],
        'vdist': [],
    }

    # for each song (the focal song), find its neighbors
    for song_row in songs_list:
        focal_song_id = song_row[0]
        if debug_print:
            print("focal_song: " + focal_song_id)

        # fetch all focal song's features
        focal_song_f = features_of_song(focal_song_id)
        focal_features = focal_song_f[0]
        focal_artist_id = focal_features[p['invkey']['artist_id']]

        # TOPOLOGICALLY close songs, part 1: songs of the focal artist
        song_ids_cur_artist = song_ids_of_artist(focal_artist_id)
        node_stats['neighbors_cur_artist'].append(len(song_ids_cur_artist))
        if debug_print:
            for song_id in song_ids_cur_artist:
                print("same artist songs: " + song_id)

        # TOPOLOGICALLY close songs, part 2: songs of similar artists
        artist_ids_topo = fetch_rows(
            "SELECT similaro FROM " + artist_similarities_table_name
            + " WHERE targeto='" + focal_artist_id + "';")
        node_stats['similar_artists'].append(len(artist_ids_topo))

        song_ids_topo = []
        for artist_row in artist_ids_topo:
            song_ids_topo.extend(song_ids_of_artist(artist_row[0]))
        # stage 2 - go deeper into the tree of artist similarities. This
        # requires holding a fifo of nodes / some sort of recursion
        node_stats['neighbors_topo'].append(len(song_ids_topo))
        if debug_print:
            for song_id in song_ids_topo:
                print("similar artists songs: " + song_id)

        # METRICALLY close songs: one range query per relevant feature
        song_ids_metric = []
        for key in p['relevant_features']:
            center = focal_features[p['invkey'][key]]
            # NOTE(review): p['unit_vec'] is indexed by feature name exactly
            # as in the original; abs() keeps the window well-formed if the
            # stored value is negative - confirm the layout of 'unit_vec'.
            radius = abs(p['unit_vec'][key]) * p['construct_neighborhood_metric_diameter']
            rows = fetch_rows(
                "SELECT song_id FROM " + song_features_table_name
                + " WHERE " + key + ">=" + str(center - radius)
                + " AND " + key + "<=" + str(center + radius) + ";")
            song_ids_metric.extend(row[0] for row in rows)
        node_stats['neighbors_metric'].append(len(song_ids_metric))

        # distinct candidate ids per neighborhood type
        candidates = {
            'topo': set(song_ids_cur_artist) | set(song_ids_topo),
            'features': set(song_ids_metric),
        }

        # measure every distinct candidate once; keep only those that are
        # close enough overall
        measured = {}
        for neighbor_song_id in candidates['topo'] | candidates['features']:
            neighbor_song_f = features_of_song(neighbor_song_id)
            # calculate distance to current song
            sdist = MX_traverse.calcDistance(focal_features, neighbor_song_f[0], p['distance_type_construct'])
            # filter songs that are too distant
            if sdist[0] < p['maximal_overall_distance']:
                measured[neighbor_song_id] = (neighbor_song_id, sdist[0], sdist[1])

        if measured:
            node_stats['mean_dist'].append( np.mean([entry[1] for entry in measured.values()]) )
        else:
            # -1 marks a focal song without any accepted candidate
            node_stats['mean_dist'].append(-1)

        # add potential neighbors and truncate, if there are too many, based on distance
        for neighborhood_type in p['neighborhood_types']:
            ranked = sorted(
                (measured[song_id] for song_id in candidates[neighborhood_type] if song_id in measured),
                key=lambda entry: entry[1])
            for neighbor in ranked[:p['max_MXG_neighbors'][neighborhood_type]]:
                # store the edge in both directions
                cur.execute(
                    "INSERT INTO " + graph_table_name
                    + " (targeto, similaro, dist) VALUES ('"
                    + focal_song_id + "', '" + neighbor[0] + "', '" + str(neighbor[1]) + "');")
                cur.execute(
                    "INSERT INTO " + graph_table_name
                    + " (targeto, similaro, dist) VALUES ('"
                    + neighbor[0] + "', '" + focal_song_id + "', '" + str(neighbor[1]) + "');")
                edge_stats['dist'].append( neighbor[1] )
                edge_stats['vdist'].append( neighbor[2] )

    return node_stats, edge_stats
def constructGraph( song_features_table_name, artist_similarities_table_name, graph_table_name, debug_print=False ):
    """Rebuild the neighbor-graph table and collect node/edge statistics.

    Opens its own database connection (via `psy.connect` with
    `MX_common.g_db_conn_command`, autocommit enabled), resets
    `graph_table_name`, and for every song in `song_features_table_name`
    (the "focal" song):

      1. collects candidate neighbors for each enabled neighborhood type:
           'topo'     - songs of the focal artist plus songs of every artist
                        listed as `similaro` for that artist in
                        `artist_similarities_table_name`;
           'features' - for each column in p['relevant_features'], up to
                        p['max_feature_neighborhood_size'] randomly chosen
                        songs whose value lies within a window around the
                        focal song's value;
      2. measures each candidate with `MX_common.calcDistance` and sorts the
         candidates of each type by distance;
      3. inserts the closest p['max_MXG_neighbors'][type] candidates of each
         type as edges in both directions.

    Reads the module-level parameter dictionary `p` (keys:
    'neighborhood_types', 'invkey', 'relevant_features', 'unit_vec_inv',
    'construct_neighborhood_metric_diameter',
    'max_feature_neighborhood_size', 'distance_type_construct',
    'max_MXG_neighbors').

    Args:
        song_features_table_name: table holding one feature row per song.
        artist_similarities_table_name: table with `targeto`/`similaro`
            artist-id columns.
        graph_table_name: table that is reset and then filled with
            (targeto, similaro) rows.
        debug_print: when true, print the focal song and its topological
            candidates.

    Returns:
        (SFstats, node_stats, edge_stats): SFstats is whatever
        `MX_common.initUnitVec` returned; node_stats and edge_stats are
        dictionaries of lists filled while building the graph.

    NOTE(review): SQL text is assembled by string concatenation, so ids
    containing a quote character will break the statements. Switching the
    values to driver placeholders needs the paramstyle of `psy` confirmed.
    """
    global p

    SFstats = MX_common.initUnitVec( song_features_table_name )

    g_conn = psy.connect( MX_common.g_db_conn_command )
    try:
        g_conn.autocommit = True
        g_cur = g_conn.cursor()

        def fetch_rows(query):
            # Execute one statement and return every row it produced.
            g_cur.execute(query)
            return g_cur.fetchall()

        def song_ids_of_artist(artist_id):
            # Plain list of song ids (not row tuples) for one artist.
            rows = fetch_rows(
                "SELECT song_id FROM " + song_features_table_name
                + " WHERE artist_id='" + artist_id + "';")
            return [row[0] for row in rows]

        def features_of_song(song_id):
            # All feature rows stored for one song id.
            return fetch_rows(
                "SELECT * FROM " + song_features_table_name
                + " WHERE song_id='" + song_id + "';")

        # read song_id list
        songs_list = fetch_rows("SELECT song_id FROM " + song_features_table_name + ";")

        node_stats = {
            'neighbors_cur_artist': [],
            'neighbors_topo': [],
            'neighbors_metric': [],
            'mean_dist': [],
            'similar_artists': [],
        }
        edge_stats = {
            'dist': [],
            'vdist': [],
        }

        MX_common.resetTable( g_cur, graph_table_name, ['targeto text', 'similaro text'] )

        # for each song (the focal song), find its neighbors
        for song_row in songs_list:
            focal_song_id = song_row[0]
            if debug_print:
                print("focal_song: " + focal_song_id)

            # fetch all focal song's features
            focal_song_f = features_of_song(focal_song_id)
            focal_features = focal_song_f[0]

            potential_neighbors = dict()
            potential_neighbors_dist = dict()
            # Bound for every song so the summary line below also works when
            # the 'topo' neighborhood is disabled.
            song_ids_cur_artist = []
            song_ids_topo = []

            # find all songs that are TOPOLOGICALLY close (based on artist similarity)
            if 'topo' in p['neighborhood_types']:
                focal_artist_id = focal_features[p['invkey']['artist_id']]

                # all songs of the current artist
                song_ids_cur_artist = song_ids_of_artist(focal_artist_id)
                node_stats['neighbors_cur_artist'].append(len(song_ids_cur_artist))
                if debug_print:
                    for song_id in song_ids_cur_artist:
                        print("same artist songs: " + song_id)

                # all artists similar to the current artist
                artist_ids_topo = fetch_rows(
                    "SELECT similaro FROM " + artist_similarities_table_name
                    + " WHERE targeto='" + focal_artist_id + "';")
                node_stats['similar_artists'].append(len(artist_ids_topo))

                # all songs of the similar artists
                for artist_row in artist_ids_topo:
                    song_ids_topo.extend(song_ids_of_artist(artist_row[0]))
                # TBD - go deeper into the tree of artist similarities. This
                # requires holding a fifo of nodes / some sort of recursion
                node_stats['neighbors_topo'].append(len(song_ids_topo))
                if debug_print:
                    for song_id in song_ids_topo:
                        print("similar artists songs: " + song_id)

                # combine all potential TOPOLOGICAL neighbors (distinct ids)
                potential_neighbors['topo'] = list(set(song_ids_cur_artist) | set(song_ids_topo))

            # find all songs that are METRICALLY close (based on song features)
            if 'features' in p['neighborhood_types']:
                feature_neighbors = []
                for key in p['relevant_features']:
                    column = p['invkey'][key]
                    radius = abs(p['unit_vec_inv'][column]) * p['construct_neighborhood_metric_diameter']
                    center = focal_features[column]
                    rows = fetch_rows(
                        "SELECT song_id FROM " + song_features_table_name
                        + " WHERE " + key + ">=" + str(center - radius)
                        + " AND " + key + "<=" + str(center + radius)
                        + " ORDER BY RANDOM() LIMIT "
                        + str(p['max_feature_neighborhood_size']) + ";")
                    # NOTE(review): a song matching several features is kept
                    # once per match, as in the original.
                    feature_neighbors.extend(row[0] for row in rows)
                potential_neighbors['features'] = feature_neighbors
                node_stats['neighbors_metric'].append(len(feature_neighbors))

            # for each potential neighbor, measure its distance from the focal point
            for nbkey in p['neighborhood_types']:
                scored = []
                for neighbor_song_id in potential_neighbors[nbkey]:
                    # fetch features for each potential neighbor
                    neighbor_song_f = features_of_song(neighbor_song_id)
                    # calculate distance to current song
                    sdist = MX_common.calcDistance(focal_features, neighbor_song_f[0], p['distance_type_construct'])
                    scored.append((neighbor_song_id, sdist[0], sdist[1]))

                # sort potential neighbors by distance
                scored.sort(key=lambda entry: entry[1])
                potential_neighbors_dist[nbkey] = scored

                if scored:
                    node_stats['mean_dist'].append( np.mean([entry[1] for entry in scored]) )
                else:
                    # -1 marks a neighborhood without any candidate
                    node_stats['mean_dist'].append(-1)

            # one progress line per focal song; the feature count is looked up
            # by its own key so the label matches the number
            print("#Feature NBs = " + str(len(potential_neighbors_dist.get('features', [])))
                  + ", Topo same artist NBs = " + str(len(song_ids_cur_artist))
                  + ", Topo similar artists NBs = " + str(len(song_ids_topo)))

            # add potential neighbors and truncate, if there are too many, based on distance
            for nbkey in p['neighborhood_types']:
                for neighbor in potential_neighbors_dist[nbkey][:p['max_MXG_neighbors'][nbkey]]:
                    # add both directions of the edge (later on we call a
                    # function that filters duplicate edges)
                    g_cur.execute(
                        "INSERT INTO " + graph_table_name
                        + " (targeto, similaro) VALUES ('"
                        + focal_song_id + "', '" + neighbor[0] + "');")
                    g_cur.execute(
                        "INSERT INTO " + graph_table_name
                        + " (targeto, similaro) VALUES ('"
                        + neighbor[0] + "', '" + focal_song_id + "');")
                    edge_stats['dist'].append( neighbor[1] )
                    edge_stats['vdist'].append( neighbor[2] )
    finally:
        # release the connection even when a query or distance call fails
        g_conn.close()

    return SFstats, node_stats, edge_stats