def generate_demo_network_raga_recognition(network_file, community_file, output_network, colors=cons_net.colors, colorify=True, mydatabase=''):
    """
    This function generates a network used as a demo for demonstrating relations between phrases.
    The configuration used to generate this network should ideally be the one used for the raga
    recognition task reported in the paper.
    """
    #loading the network
    full_net = nx.read_pajek(network_file)

    #loading community data
    comm_data = json.load(open(community_file, 'r'))

    #loading all the phrase data
    comm_char.fetch_phrase_attributes(comm_data, database=mydatabase, user='******')

    #getting all the communities from which we don't want any node in the demo graph
    #obtaining gamaka communities
    gamaka_comms = comm_char.find_gamaka_communities(comm_data)[0]
    #obtaining communities with only phrases from one mbid
    one_mbid_comms = comm_char.get_comm_1MBID(comm_data)

    #collecting phrases which should be removed from the graph
    phrases = []
    for c in gamaka_comms:
        for n in comm_data[c]:
            phrases.append(int(n['nId']))
    for c in one_mbid_comms:
        for n in comm_data[c]:
            phrases.append(int(n['nId']))
    print len(phrases)

    #removing the unwanted phrases
    full_net = raga_recog.remove_nodes_graph(full_net, phrases)

    #colorify the nodes according to raga labels
    if colorify:
        cmd1 = "select raagaId from file where id = (select file_id from pattern where id = %d)"
        con = psy.connect(database='ICASSP2016_10RAGA_2S', user='******')
        cur = con.cursor()
        for n in full_net.nodes():
            cur.execute(cmd1 % (int(n)))
            ragaId = cur.fetchone()[0]
            full_net.node[n]['color'] = ragaId

    #saving the network
    nx.write_gexf(full_net, output_network)
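# Example usage (a minimal sketch, not part of the original code): the file names below are
# hypothetical placeholders; the .net and .community files are assumed to be the outputs of the
# network construction and community detection steps used elsewhere in this module.
#
# generate_demo_network_raga_recognition('phrase_network.net',
#                                        'phrase_network.community',
#                                        'demo_network.gexf',
#                                        colorify=True,
#                                        mydatabase='ICASSP2016_10RAGA_2S')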
def generate_artificially_connected_network(network_file, community_file, output_network, colorify=True, mydatabase=''):
    """
    Since isolated communities belonging to different ragas are scattered and jumbled up, we attempt
    to connect them artificially so that communities of the same raga are grouped together.
    """
    #loading the network
    full_net = nx.read_pajek(network_file)

    #loading community data
    comm_data = json.load(open(community_file, 'r'))

    #loading all the phrase data
    comm_char.fetch_phrase_attributes(comm_data, database=mydatabase, user='******')

    #getting all the communities from which we don't want any node in the graph
    #obtaining gamaka communities
    gamaka_comms = comm_char.find_gamaka_communities(comm_data)[0]
    #obtaining communities with only phrases from one mbid
    one_mbid_comms = comm_char.get_comm_1MBID(comm_data)

    print len(full_net.nodes()), len(full_net.edges())

    #collecting phrases which should be removed from the graph
    phrases = []
    for c in gamaka_comms:
        for n in comm_data[c]:
            phrases.append(int(n['nId']))
    for c in one_mbid_comms:
        for n in comm_data[c]:
            phrases.append(int(n['nId']))
    print len(phrases)

    #removing the unwanted phrases
    full_net = raga_recog.remove_nodes_graph(full_net, phrases)
    print len(full_net.nodes()), len(full_net.edges())

    #removing these communities from comm_data as well
    for g in gamaka_comms:
        comm_data.pop(g)
    for o in one_mbid_comms:
        comm_data.pop(o)

    #obtaining the raga label for each community (by majority voting, of course)
    comm_raga = {}
    raga_comm = {}
    node_cnt = 0
    for comId in comm_data.keys():
        ragaIds = [r['ragaId'] for r in comm_data[comId]]
        raga_hist, raga_names = comm_char.get_histogram_sorted(ragaIds)
        comm_raga[comId] = raga_names[0]
        if not raga_comm.has_key(raga_names[0]):
            raga_comm[raga_names[0]] = []
        raga_comm[raga_names[0]].append(comId)

    #adding near-zero-weight artificial edges between communities that share a raga label
    edge_list = []
    for comId in comm_data.keys():
        raga = comm_raga[comId]
        node_cnt += len(comm_data[comId])
        for comms_in_raga in raga_comm[raga]:
            if comms_in_raga == comId:
                continue
            #full_net.add_edge(comm_data[comId][0]['nId'], comm_data[comms_in_raga][0]['nId'], weight=0.0)
            edge_list.append((str(comm_data[comId][0]['nId']), str(comm_data[comms_in_raga][0]['nId']), 0.000000001))

    print node_cnt
    print len(full_net.nodes()), len(full_net.edges())
    json.dump(full_net.nodes(), open('pehle.json', 'w'))
    full_net.add_weighted_edges_from(edge_list)
    json.dump(full_net.nodes(), open('baad.json', 'w'))
    print len(full_net.nodes()), len(full_net.edges())

    #colorify the nodes according to raga labels
    if colorify:
        cmd1 = "select raagaId from file where id = (select file_id from pattern where id = %d)"
        con = psy.connect(database=mydatabase, user='******')
        cur = con.cursor()
        for n in full_net.nodes():
            cur.execute(cmd1 % (int(n)))
            ragaId = cur.fetchone()[0]
            full_net.node[n]['color'] = ragaId

    #saving the network
    nx.write_gexf(full_net, output_network)
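# Example usage (a minimal sketch, not part of the original code): the file and database names are
# hypothetical placeholders. The input network and community file are assumed to come from the same
# construction/detection pipeline as above; the output is a GEXF network with same-raga communities
# tied together by near-zero-weight edges for layout purposes.
#
# generate_artificially_connected_network('phrase_network.net',
#                                         'phrase_network.community',
#                                         'connected_network.gexf',
#                                         colorify=True,
#                                         mydatabase='ICASSP2016_10RAGA_2S')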
def raga_recognition_V2(out_dir, scratch_dir, fileListFile, thresholdBin, pattDistExt, network_wght_type=-1,
                        force_build_network=0, feature_type='tf-idf', pre_processing=-1, norm_tfidf=None,
                        smooth_idf=False, classifier=('nbMulti', "default"), n_expts=10, var1=True, var2=True,
                        myDatabase='', myUser='', type_eval=("kStratFoldCrossVal", 10), balance_classes=1):
    """
    Raga recognition system using document classification and topic modelling techniques.
    In this approach we treat the phrases of a recording as words (essentially their cluster/community id):
    phrases -> communities (words) -> word frequencies per file -> tf-idf-like features -> classification

    Input:
        fileListFile: file which lists all the files to be considered for this analysis (relevant data
                      has been extracted from these files and stored in a structured manner)
        thresholdBin: distance threshold (in bins) which is applied to the network
        pattDistExt: extension of the file that stores pattern distances
        n_fold: number of cross fold validations
        force_build_network: if 0 the network is not rebuilt when it already exists on disk; if 1 it is
                             rebuilt regardless
        feature_type: the type of feature to be used for the classification. Options are:
            'tf': term frequency
            'tp': term presence (binary value indicating whether the term is present or not)
            'tf-idf': the typical term frequency * inverse document frequency
            'tf-idf_pp1': normal tf-idf, but with a preprocessing step to explicitly remove gamaka phrases.
                          This is analogous to removing stop words before the word count computation.
            'tf-idf_pp2': along with the gamaka phrases (communities), also remove (as stop words) the
                          communities which consist of only one mbid.
        classifier: the classifier to be used for the classification task. Options are:
            'NB': naive Bayes
            'SVM': support vector machines
            'SGD': SVMs with SGD training (often recommended for text classification)
        pre_processing:
            -1: no preprocessing
             1: remove gamaka communities from the analysis (treating them as stop words)
                NOTE: ideally the idf computation should take care of this, but for a small corpus a very
                frequent term still gets a high weight no matter what, so this option lets us verify it explicitly.
             2: remove communities which have only one mbid in them
             3: apply both option 1 and option 2
        norm_tfidf: whether or not to normalize the final feature vector (NOTE: when normalization is on,
                    the result seems to be affected a lot by the presence of the gamaka communities)
        network_wght_type: the scheme used for weighting the edges of the network.
                           Either 0, 1, or -1 for unity weights.
    """
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    #path to store all the temporary files
    #scratch_dir = '/home/sankalp/Work/Work_PhD/library_pythonnew/networkAnalysis/scratch_raga_recognition'

    fileListFile_basename = os.path.basename(fileListFile)
    root_filename = os.path.join(scratch_dir, 'network' + '_' + str(fileListFile_basename.replace('.', '_')) + '_' + myDatabase + '_' + str(thresholdBin) + '_' + pattDistExt.replace('.', ''))
    #root_filename = os.path.join(scratch_dir, 'network'+'_'+ myDatabase+'_'+str(thresholdBin)+'_'+pattDistExt.replace('.',''))

    #constructing the network
    t1 = time.time()
    wghtd_graph_filename = root_filename + '.net'
    #building the network only when it is not already present on disk
    if force_build_network or not os.path.isfile(wghtd_graph_filename):
        cons_net.constructNetwork_Weighted_NetworkX(fileListFile, wghtd_graph_filename, thresholdBin, pattDistExt, network_wght_type, -1)  #we do not do any significance filtering

    #reading the network from disk (either built in the current call or in a previous one)
    full_net = nx.read_pajek(wghtd_graph_filename)

    #performing community detection on the built network
    comm_filename = root_filename + '.community'
    net_pro.detectCommunitiesInNewtworkNX(wghtd_graph_filename, comm_filename)

    #fetching relevant data for the communities (raga names and file names needed for the analysis)
    comm_data = json.load(open(comm_filename, 'r'))
    comm_char.fetch_phrase_attributes(comm_data, database=myDatabase, user=myUser)

    #since text mining tools typically treat the integers 0-10 as stop words, all comm_ids are mapped
    #to uuids to make the system robust to any such unexpected hiccups
    com_id_2_uuid = {}
    for com_id in comm_data.keys():
        com_id_2_uuid[com_id] = uuid.uuid1().hex

    #getting per document (recording) words (community index, phrase instances)
    per_rec_data = get_per_recording_data(comm_data)

    t2 = time.time()
    print "time taken = %f" % (t2 - t1)

    ########## Loop for N-fold cross validation ##############
    raga_mbid = get_mbids_raagaIds_for_collection(fileListFile, myDatabase, myUser)
    raga_list = [r[0] for r in raga_mbid]
    mbid_list = [r[1] for r in raga_mbid]
    raga_map, map_raga = generate_raga_mapping(raga_list)
    label_list = np.array([raga_map[r] for r in raga_list])

    #if specific communities have to be removed in a pre-processing step, estimate them here so that
    #they can be passed as stop words
    stop_words = []
    if pre_processing == 1:
        stop_words.extend(comm_char.find_gamaka_communities(comm_data, max_mbids_per_comm=label_list.size / np.unique(label_list).size)[0])
    if pre_processing == 2:
        stop_words.extend(comm_char.get_comm_1MBID(comm_data))
    if pre_processing == 3:
        stop_words.extend(comm_char.find_gamaka_communities(comm_data, max_mbids_per_comm=label_list.size / np.unique(label_list).size)[0])
        stop_words.extend(comm_char.get_comm_1MBID(comm_data))

    stop_words = [com_id_2_uuid[s] for s in stop_words]

    accuracy_var1 = -1
    if var1:
        ########################### Performing cross fold train-testing, Variant 1 ###################################
        #In this variant a training tf-idf vocabulary is generated for each fold, i.e. the vocabulary of
        #each fold is solely determined by the training examples.

        #initializing the crossfold object
        if type_eval[0] == 'kStratFoldCrossVal':
            cval = cross_val.StratifiedKFold(label_list, n_folds=type_eval[1], shuffle=True, random_state=np.random.randint(100))
        elif type_eval[0] == 'kFoldCrossVal':
            cval = cross_val.KFold(len(label_list), n_folds=type_eval[1], shuffle=True, random_state=np.random.randint(100))
        elif type_eval[0] == 'LeaveOneOut':
            cval = cross_val.LeaveOneOut(len(label_list))

        mlObj_var1 = ml.experimenter()
        sca = preprocessing.StandardScaler()

        #arrays for storing the predicted labels and their names
        label_list_pred = -1 * np.ones(label_list.shape)
        predicted_raga = ['' for r in range(len(raga_mbid))]  #placeholder for storing predicted ragas

        #starting the crossfold validation loop (NOTE: in this variant we only perform a single experiment)
        for mm, (train_inds, test_inds) in enumerate(cval):
            print "Processing fold %d\n" % mm
            #initializers needed for the analysis of words (community indexes); we re-create them in every
            #fold to make sure everything from the previous fold is removed
            count_vect = CountVectorizer(stop_words=stop_words)
            tfidf_transformer = TfidfTransformer(norm=norm_tfidf, smooth_idf=smooth_idf)

            docs_train = []  #storing documents (phrases per recording)
            #preparing the tf-idf matrix for the training data
            for train_ind in train_inds:
                if per_rec_data.has_key(mbid_list[train_ind]):  #not every file has phrases found! (there is one file for which no phrases fall within this distance threshold)
                    per_rec_words = ' '.join([com_id_2_uuid[p[0]] for p in per_rec_data[mbid_list[train_ind]]])
                else:
                    per_rec_words = ''
                docs_train.append(per_rec_words)

            #computing term frequencies (training set); the vocabulary is learned only from the training examples
            X_train_counts = count_vect.fit_transform(docs_train)

            if feature_type == 'tf':
                features_train = X_train_counts.toarray()
            elif feature_type == 'tp':
                features_train = X_train_counts.toarray()
                features_train = np.where(features_train >= 1, 1, features_train)
            elif feature_type == 'tf-idf':
                #computing features from the term frequencies (training set)
                features_train = tfidf_transformer.fit_transform(X_train_counts)
                features_train = features_train.toarray()
            else:
                print "Please specify a valid feature type"
                return False

            #checking the input classifier params
            if not isinstance(classifier[1], dict):
                classifier_params = {}
            else:
                classifier_params = classifier[1]

            #training the model with the obtained tf-idf features
            if not mlObj_var1.skl_classifiers.has_key(classifier[0]):
                print "Please provide a valid classifier name"
                return False

            clf = mlObj_var1.skl_classifiers[classifier[0]]['handle'](**classifier_params)
            if mlObj_var1.skl_classifiers[classifier[0]]['norm_feat_req']:
                features_train = sca.fit_transform(features_train.astype(np.float))
            clf.fit(features_train, label_list[train_inds])

            docs_test = []
            #preparing the tf-idf matrix for the testing data
            for test_ind in test_inds:
                if per_rec_data.has_key(mbid_list[test_ind]):
                    per_rec_words = ' '.join([com_id_2_uuid[p[0]] for p in per_rec_data[mbid_list[test_ind]]])
                else:
                    per_rec_words = ''
                docs_test.append(per_rec_words)

            #computing term frequencies (testing set)
            X_test_counts = count_vect.transform(docs_test)

            if feature_type == 'tf':
                features_test = X_test_counts.toarray()
            elif feature_type == 'tp':
                features_test = X_test_counts.toarray()
                features_test = np.where(features_test >= 1, 1, features_test)
            elif feature_type == 'tf-idf':
                #computing features from the term frequencies (testing set)
                features_test = tfidf_transformer.transform(X_test_counts)
                features_test = features_test.toarray()
            else:
                print "Please specify a valid feature type"
                return False

            #performing prediction of labels using the trained model
            if mlObj_var1.skl_classifiers[classifier[0]]['norm_feat_req']:
                features_test = sca.transform(features_test.astype(np.float))
            predicted = clf.predict(features_test)
            label_list_pred[test_inds] = predicted
            for ii, pred_val in enumerate(predicted):
                predicted_raga[test_inds[ii]] = map_raga[pred_val]

        cnt = 0
        for i in range(len(predicted_raga)):
            if raga_list[i] == predicted_raga[i]:
                cnt += 1
        print "You got %d ragas right out of a total of %d recordings (Variant 1)" % (cnt, len(predicted_raga))

        cMTC_var1 = confusion_matrix(label_list, label_list_pred)
        accuracy_var1 = float(cnt) / float(len(predicted_raga))
        ########################## End of variant 1 of cross fold testing ##################################

    accuracy_var2 = -1
    if var2:
        ########################### Performing cross fold train-testing, Variant 2 ###################################
        #In this variant the tf-idf vectors are computed over the entire dataset. The only effect this has is
        #on the computation of the idf term. This is arguably better because the testing files also contribute
        #to the importance given to a word in the idf term. Since the computation of tf-idf is unsupervised
        #(no raga label is used), this variant is also a valid experimental setup.
        count_vect = CountVectorizer(stop_words=stop_words)
        tfidf_transformer = TfidfTransformer(norm=norm_tfidf, smooth_idf=False)

        mlObj = ml.experimenter()
        mlObj.setExperimentParams(nExp=n_expts, typeEval=type_eval, nInstPerClass=-1, classifier=classifier, balanceClasses=balance_classes)
        #Note that only the balanced-classes option does a stratified kfold experiment
        #NOTE: balanceClasses makes sure that each fold has an equal number of samples from each class and
        #that there are an equal number of feature instances per class

        docs_train = []  #this time we use all the documents in our dataset
        for ii in range(label_list.size):
            if per_rec_data.has_key(mbid_list[ii]):  #not every file has phrases found! (there is one file for which no phrases fall within this distance threshold)
                per_rec_words = ' '.join([com_id_2_uuid[p[0]] for p in per_rec_data[mbid_list[ii]]])
            else:
                per_rec_words = ''
            docs_train.append(per_rec_words)

        count_all = count_vect.fit_transform(docs_train)
        features = tfidf_transformer.fit_transform(count_all)

        if False:
            dump = {'features': features.toarray(), 'labels': np.array(label_list)}
            pickle.dump(dump, open('features_dump_300_Hindustani.pkl', 'w'))
            return True

        mlObj.setFeaturesAndClassLabels(features.toarray(), np.array(raga_list))
        mlObj.runExperiment()
        accuracy_var2 = mlObj.overallAccuracy
        print "You got %d ragas right out of a total of %d recordings (Variant 2)" % (np.round(mlObj.overallAccuracy * len(label_list)), len(label_list))
        ########################## End of variant 2 of cross fold testing ##################################

    ##saving experimental results
    fid = open(os.path.join(out_dir, 'experiment_results.pkl'), 'w')
    results = {}
    if var1:
        results.update({'var1': {'cm': cMTC_var1,
                                 'gt_label': label_list,
                                 'pred_label': label_list_pred,
                                 'mbids': mbid_list,
                                 'mapping': map_raga,
                                 'accuracy': float(cnt) / len(predicted_raga)}})
    if var2:
        results.update({'var2': {'cm': mlObj.cMTXExp,
                                 'gt_label': mlObj.classLabelsInt,
                                 'pred_label': mlObj.decArray,
                                 'mbids': mbid_list,
                                 'mapping': mlObj.cNames,
                                 'accuracy': mlObj.overallAccuracy,
                                 'pf_accuracy': mlObj.accuracy}})
    pickle.dump(results, fid)

    #also dumping the input params to this function
    params_input = {}
    for k in inspect.getargspec(raga_recognition_V2).args:
        params_input[k] = locals()[k]

    fid = open(os.path.join(out_dir, 'experiment_params.json'), 'w')
    json.dump(params_input, fid)

    return np.max([accuracy_var1, accuracy_var2])
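# Example invocation (a minimal sketch, not part of the original code): all paths, the database and
# user names, and the parameter values below are hypothetical placeholders chosen to illustrate the
# argument structure described in the docstring above.
#
# accuracy = raga_recognition_V2(out_dir='results/expt1',
#                                scratch_dir='scratch_raga_recognition',
#                                fileListFile='filelists/carnatic_10raga.flist',
#                                thresholdBin=10,
#                                pattDistExt='.pattDistances',
#                                feature_type='tf-idf',
#                                pre_processing=3,
#                                classifier=('nbMulti', 'default'),
#                                type_eval=('kStratFoldCrossVal', 10),
#                                myDatabase='ICASSP2016_10RAGA_2S',
#                                myUser='dbuser')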