def generate_demo_network_raga_recognition(network_file, community_file, output_network, colors = cons_net.colors, colorify = True, mydatabase = ''):
    """
    This function generates a network used as a demo for demonstrating relations between phrases.
    The configuration used to generate this network should ideally be the one that is used for the
    raga recognition task reported in the paper.

    Phrases belonging to gamaka communities and to communities whose phrases all come from a single
    mbid are removed from the network before it is saved.

        network_file: path of the Pajek file containing the full network
        community_file: path of the JSON file containing the community data
        output_network: path where the resulting network is written (GEXF format)
        colors: accepted for backward compatibility; not used by this function
        colorify: if True the raga id of every node is fetched from the database and stored in
                  the 'color' attribute of the node
        mydatabase: name of the database used for fetching the phrase attributes. It is also used
                    for fetching the raga ids; when empty the legacy hard-coded database
                    ('ICASSP2016_10RAGA_2S') is used for the raga ids.
    """
    #loading the network
    full_net = nx.read_pajek(network_file)
    #loading community data (the file handle is closed as soon as the data is parsed)
    with open(community_file, 'r') as fid:
        comm_data = json.load(fid)
    #loading all the phrase data
    comm_char.fetch_phrase_attributes(comm_data, database = mydatabase, user= '******')
    #getting all the communities from which we dont want any node in the graph in the demo
    #obtaining gamaka communities
    gamaka_comms = comm_char.find_gamaka_communities(comm_data)[0]
    #obtaining communities with only phrases from one mbid
    one_mbid_comms = comm_char.get_comm_1MBID(comm_data)
    #collect phrases which should be removed from the graph
    # NOTE(review): the loop bodies were missing in the reviewed source; collecting the 'nId'
    # of every phrase is inferred from how 'nId' is used as node id elsewhere in this file -- confirm.
    phrases = []
    for c in gamaka_comms:
        for n in comm_data[c]:
            phrases.append(n['nId'])
    for c in one_mbid_comms:
        for n in comm_data[c]:
            phrases.append(n['nId'])
    print(len(phrases))
    #removing the unwanted phrases
    full_net = raga_recog.remove_nodes_graph(full_net, phrases)
    # colorify the nodes according to raga labels
    if colorify:
        cmd1 = "select raagaId from file where id = (select file_id from pattern where id =%d)"
        # honour the database given by the caller, falling back to the previously hard-coded name
        con = psy.connect(database=mydatabase or 'ICASSP2016_10RAGA_2S', user='******')
        try:
            cur = con.cursor()
            for n in full_net.nodes():
                # the query has to be executed before a row can be fetched
                # NOTE(review): assumes node labels are pattern ids convertible to int -- confirm.
                cur.execute(cmd1 % int(n))
                ragaId = cur.fetchone()[0]
                full_net.node[n]['color'] = ragaId
        finally:
            # always release the database connection, even if a query fails
            con.close()

    #saving the network
    nx.write_gexf(full_net, output_network)
def generate_artificially_connected_network(network_file, community_file, output_network, colorify = True, mydatabase = ''):
    """
    Since isolated communities belonging to different ragas are scattered and jumbled up, we attempt
    to connect them artificially so that they are all grouped together.

    Every community is assigned the raga that is most frequent among its phrases, and the first
    phrase of each community is linked (with a tiny weight) to the first phrase of every other
    community that got the same raga.

        network_file: path of the Pajek file containing the full network
        community_file: path of the JSON file containing the community data
        output_network: path where the resulting network is written (GEXF format)
        colorify: if True the raga id of every node is fetched from the database and stored in
                  the 'color' attribute of the node
        mydatabase: name of the database used for fetching phrase attributes and raga ids

    Side effects: writes the node list before and after adding the artificial edges to
    'pehle.json' and 'baad.json' in the current working directory.
    """
    #loading the network
    full_net = nx.read_pajek(network_file)
    #loading community data (the file handle is closed as soon as the data is parsed)
    with open(community_file, 'r') as fid:
        comm_data = json.load(fid)
    #loading all the phrase data
    comm_char.fetch_phrase_attributes(comm_data, database = mydatabase, user= '******')
    #getting all the communities from which we dont want any node in the graph in the demo
    #obtaining gamaka communities
    gamaka_comms = comm_char.find_gamaka_communities(comm_data)[0]
    #obtaining communities with only phrases from one mbid
    one_mbid_comms = comm_char.get_comm_1MBID(comm_data)
    print("%d %d" % (len(full_net.nodes()), len(full_net.edges())))
    #collect phrases which should be removed from the graph
    # NOTE(review): the loop bodies were missing in the reviewed source; collecting the 'nId'
    # of every phrase is inferred from the use of 'nId' as node id further below -- confirm.
    phrases = []
    for c in gamaka_comms:
        for n in comm_data[c]:
            phrases.append(n['nId'])
    for c in one_mbid_comms:
        for n in comm_data[c]:
            phrases.append(n['nId'])
    print(len(phrases))
    #removing the unwanted phrases
    full_net = raga_recog.remove_nodes_graph(full_net, phrases)
    print("%d %d" % (len(full_net.nodes()), len(full_net.edges())))
    #lets remove these phrases from the comm_data as well
    # NOTE(review): loop bodies reconstructed. A default is passed to pop() so that a community
    # listed both as gamaka and as single-mbid does not raise a KeyError on the second removal.
    for g in gamaka_comms:
        comm_data.pop(g, None)
    for o in one_mbid_comms:
        comm_data.pop(o, None)
    #obtaining the raga labels for each community (majority voting ofcourse)
    comm_raga = {}
    raga_comm = {}
    node_cnt = 0
    for comId in comm_data.keys():
        ragaIds = [r['ragaId']  for r in comm_data[comId]]
        raga_hist, raga_names = comm_char.get_histogram_sorted(ragaIds)
        comm_raga[comId] = raga_names[0]
        # NOTE(review): registering the community under its raga was missing in the reviewed
        # source; without it raga_comm only holds empty lists and no edge is ever generated.
        raga_comm.setdefault(raga_names[0], []).append(comId)

    edge_list = []
    for comId in comm_data.keys():
        raga = comm_raga[comId]
        node_cnt+= len(comm_data[comId])
        for comms_in_raga in raga_comm[raga]:
            if comms_in_raga == comId:
                # no self link for the community itself
                continue
            #full_net.add_edge(comm_data[comId][0]['nId'], comm_data[comms_in_raga][0]['nId'], weight=0.0)
            edge_list.append((str(comm_data[comId][0]['nId']), str(comm_data[comms_in_raga][0]['nId']), 0.000000001))

    print(node_cnt)
    print("%d %d" % (len(full_net.nodes()), len(full_net.edges())))
    # debugging aid: node list before ('pehle') and after ('baad') inserting the artificial edges
    with open('pehle.json','w') as fid:
        json.dump(list(full_net.nodes()), fid)
    # NOTE(review): edge_list was built but never used in the reviewed source; adding it to the
    # graph here is inferred from the purpose stated in the docstring -- confirm.
    full_net.add_weighted_edges_from(edge_list)
    with open('baad.json','w') as fid:
        json.dump(list(full_net.nodes()), fid)
    print("%d %d" % (len(full_net.nodes()), len(full_net.edges())))
    # colorify the nodes according to raga labels
    if colorify:
        cmd1 = "select raagaId from file where id = (select file_id from pattern where id =%d)"
        con = psy.connect(database=mydatabase, user='******')
        try:
            cur = con.cursor()
            for n in full_net.nodes():
                # the query has to be executed before a row can be fetched
                # NOTE(review): assumes node labels are pattern ids convertible to int -- confirm.
                cur.execute(cmd1 % int(n))
                ragaId = cur.fetchone()[0]
                full_net.node[n]['color'] = ragaId
        finally:
            # always release the database connection, even if a query fails
            con.close()

    #saving the network
    nx.write_gexf(full_net, output_network)
# Raga recognition experiment driver. From the visible code: it builds (or reuses) a
# phrase network, runs community detection on it, turns every recording into a
# "document" of community ids, and evaluates classifiers on features derived from it.
# Results and input parameters are written into out_dir, and the best accuracy of the
# executed variants is returned.
# NOTE(review): this definition looks truncated in the reviewed source. The parameter
# list is never closed and the body reads names that are not declared in the visible
# signature (fileListFile, scratch_dir, myDatabase, thresholdBin, pattDistExt,
# force_build_network, network_wght_type, pre_processing, norm_tfidf, feature_type,
# var1, var2). Several statements further down are cut short as well.
# NOTE(review): the code uses Python 2 constructs (print statement, dict.has_key).
def raga_recognition_V2(out_dir,
                        classifier=('nbMulti', "default"),
                        type_eval=("kStratFoldCrossVal", 10),
    # NOTE(review): the description below has no docstring delimiters in the reviewed source.
    Raga recognition system using document classification and topic modelling techniques.
    In this approach we treat phrases of a recording as words (basically cluster/community id). 
    phrases->communities(Words)->word frequencies per file->tf-idf kind of features->classification
        fileListFile: file which lists all the files to be considered for this anlyasis (there is relevant data extracted and stored in a structed manner from these files)
        thresholdBin: distance threshold (in bins) which is applied the network
        pattDistExt: extension of the file that stores pattern distances
        n_fold: number of cross fold validations
        force_build_network: if 0 the network is not rebuild if it exists on the disk, if 1 its built again no matter what
        feature_type: the type of feature to be used for the classification. Options are
                      'tf': term frequency
                      'tp': term presence (binary value to indicate if the term is present or not)
                      'tf-idf': the typicall term frequency * inverse document frequency
                      'tf-idf_pp1': this is normal tf-idf but with a preprocessing to explicitely remove crappy phrases (gamakas). This is like removing stop words from word count computation.
                      'tf-idf_pp2': Along with the gamaka phrases (communities), we also remove (as stop word) the communities which are consists of only one mbid. 
        classifier: the classifier to be used for the classification task. Options are:
                    'NB': Naive naive bayes
                    'SVM': support vector machines
                    'SGD': svms with SGD training. Somewhere it was recommended for text classification.
        pre_processing: -1 for no preprocessing
                         1 for removing gamaka communities from the analysis (treating them as stop words)
                         NOTE: It feels like this should be taken care of by the IDF computation, but for a small corpus if there is a lot of frequency, the weight is high no matter what. Just to try it out, brain worms!!
                         2: for removing communities which have only one mbid in them. 
                         3: for removing communities for option 1 and 2
        norm_feature: Normalize the final feature vector or not (NOTE: when its on the result seems to be affected a lot by the presence of the gamaka communities)
        network_wght_type: the schema used for weighting edges of the network. Either 0, 1 or -1 for unity weight
    # NOTE(review): the body of this check is not visible; presumably it creates out_dir -- confirm.
    if not os.path.isdir(out_dir):

    # path to store all the temporary files
    #scratch_dir = '/home/sankalp/Work/Work_PhD/library_pythonnew/networkAnalysis/scratch_raga_recognition'
    # The root file name encodes the file list name, the database name, the distance
    # threshold and the distance-file extension, so every configuration gets its own files.
    fileListFile_basename = os.path.basename(fileListFile)
    root_filename = os.path.join(
        scratch_dir, 'network' + '_' +
        str(fileListFile_basename.replace('.', '_')) + '_' + myDatabase + '_' +
        str(thresholdBin) + '_' + pattDistExt.replace('.', ''))
    #root_filename = os.path.join(scratch_dir, 'network'+'_'+ myDatabase+'_'+str(thresholdBin)+'_'+pattDistExt.replace('.',''))

    #constructing the network
    t1 = time.time()
    wghtd_graph_filename = root_filename + '.net'
    #building network only when its not already present on the disk
    # NOTE(review): the name of the network-building function is missing from the call below.
    if force_build_network or not os.path.isfile(wghtd_graph_filename):
            fileListFile, wghtd_graph_filename, thresholdBin, pattDistExt,
            network_wght_type, -1)  # we do not do any significance filtering

    #reading the network on the disk (either build in the current call or from previous ones)
    full_net = nx.read_pajek(wghtd_graph_filename)

    #performing community detection on the built network
    comm_filename = root_filename + '.community'
    net_pro.detectCommunitiesInNewtworkNX(wghtd_graph_filename, comm_filename)

    #fetching relevant data for the community (raga names and file names to analysis)
    comm_data = json.load(open(comm_filename, 'r'))

    #since all the text mining tools consider 0-10 integers as stop words, for making system robust to any unexpected hiccups
    # all the the comm_ids are mapped to uuids
    com_id_2_uuid = {}
    for com_id in comm_data.keys():
        com_id_2_uuid[com_id] = uuid.uuid1().hex

    #getting per document (recording) words (community index, phrase instanes)
    per_rec_data = get_per_recording_data(comm_data)

    # reporting the time spent on building the network, detecting communities and loading data
    t2 = time.time()
    print "time taken = %f" % (t2 - t1)

    ##########Loop for N_Fold cross validataion##############
    # raga_mbid entries are indexed as (raga, mbid) below.
    # NOTE(review): the remaining arguments of this call are cut off in the reviewed source.
    raga_mbid = get_mbids_raagaIds_for_collection(fileListFile, myDatabase,
    raga_list = [r[0] for r in raga_mbid]
    mbid_list = [r[1] for r in raga_mbid]
    # raga_map is used for raga -> label lookups and map_raga for label -> raga lookups below
    raga_map, map_raga = generate_raga_mapping(raga_list)
    label_list = np.array([raga_map[r] for r in raga_list])

    #if there has to be a pre-processing done to remove specific communities, estimating them to put them as stop words
    # NOTE(review): the statements that fill stop_words for each pre_processing option are
    # truncated in the reviewed source.
    stop_words = []
    if pre_processing == 1:
                max_mbids_per_comm=label_list.size /
    if pre_processing == 2:
    if pre_processing == 3:
                max_mbids_per_comm=label_list.size /

    # stop words have to use the same uuid vocabulary as the documents
    stop_words = [com_id_2_uuid[s] for s in stop_words]
    # -1 marks "variant not executed"; it takes part in the final np.max
    accuracy_var1 = -1
    if var1:
        ########################### Performing cross fold train testing Variant 1 ###################################
        #In this variant for each fold we generate a training tf-idf vector. Meaning, vocabulary for each fold is solely determined by the training examples#

        #initializing crossfold object
        # type_eval[0] selects the cross validation scheme
        # NOTE(review): the arguments of the first two constructors are cut off in the reviewed source.
        if type_eval[0] == 'kStratFoldCrossVal':
            cval = cross_val.StratifiedKFold(
        elif type_eval[0] == 'kFoldCrossVal':
            cval = cross_val.KFold(len(label_list),
        elif type_eval[0] == 'LeaveOneOut':
            cval = cross_val.LeaveOneOut(len(label_list))

        mlObj_var1 = ml.experimenter()
        sca = preprocessing.StandardScaler()

        # arrays for storing predicted labels and their names
        # predictions start at -1 and are overwritten at the test indices of every fold
        label_list_pred = -1 * np.ones(label_list.shape)
        predicted_raga = ['' for r in range(len(raga_mbid))
                          ]  #placeholder for storing predicted ragas

        #starting crossfold validation loop (NOTE: in this variant we only perform a single experiment)
        for mm, (train_inds, test_inds) in enumerate(cval):
            print "Processing fold %d\n" % mm

            #initializers needed for analysis of words (community indexes), WE DO IT IN EVERY FOLD TO MAK SURE EVERYTHING FROM THE PREV FOLD IS REMOVED
            count_vect = CountVectorizer(stop_words=stop_words)
            tfidf_transformer = TfidfTransformer(norm=norm_tfidf,

            docs_train = []  #storing documents (phrases per recording)
            #preparing tf-idf matrix for the training data
            # NOTE(review): this loop is truncated in the reviewed source (has_key argument, the
            # list comprehension, the else branch and the append to docs_train are not visible).
            for train_ind in train_inds:
                if per_rec_data.has_key(
                ):  #not every file has phrases found!! (there is one stupid file for which there are no phrases within this distance threshold)
                    per_rec_words = ' '.join([
                        for p in per_rec_data[mbid_list[train_ind]]
                    per_rec_words = ''

            #Computing term frequencies (training set) Our vocab is only learned from training examples
            X_train_counts = count_vect.fit_transform(docs_train)

            # 'tf' uses the raw counts, 'tp' clips counts to presence, 'tf-idf' applies the transformer
            # NOTE(review): the else: before the error message appears to be missing.
            if feature_type == 'tf':
                features_train = X_train_counts.toarray()
            elif feature_type == 'tp':
                features_train = X_train_counts.toarray()
                features_train = np.where(features_train >= 1, 1,
            elif feature_type == 'tf-idf':
                #computing features from term frequencies (training set)
                features_train = tfidf_transformer.fit_transform(
                features_train = features_train.toarray()
                print "Please specify a valid feature type"
                return False

            #checking the input classifier params
            # NOTE(review): an else: between the two assignments appears to be missing.
            if not isinstance(classifier[1], dict):
                classifier_params = {}
                classifier_params = classifier[1]
            #training the model with the obtained tf-idf features
            if not mlObj_var1.skl_classifiers.has_key(classifier[0]):
                print "Please provide a valid clsasifier name"
                return False

            # the classifier is created through the 'handle' entry registered for its name
            clf = mlObj_var1.skl_classifiers[classifier[0]]['handle'](

            # features are standardised only for classifiers flagged with 'norm_feat_req'
            if mlObj_var1.skl_classifiers[classifier[0]]['norm_feat_req']:
                features_train = sca.fit_transform(

  , label_list[train_inds])

            docs_test = []
            #preparing the tf-idf matrix for the testing data.
            # NOTE(review): truncated in the same way as the training loop above.
            for test_ind in test_inds:
                if per_rec_data.has_key(mbid_list[test_ind]):
                    per_rec_words = ' '.join([
                        for p in per_rec_data[mbid_list[test_ind]]
                    per_rec_words = ''

            #Computing term frequencies (testing set)
            # transform (not fit_transform): the vocabulary fitted on the training documents is reused
            X_test_counts = count_vect.transform(docs_test)

            if feature_type == 'tf':
                features_test = X_test_counts.toarray()
            elif feature_type == 'tp':
                features_test = X_test_counts.toarray()
                features_test = np.where(features_test >= 1, 1, features_test)
            elif feature_type == 'tf-idf':
                #computing features from term frequencies (training set)
                features_test = tfidf_transformer.transform(X_test_counts)
                features_test = features_test.toarray()
                print "Please specify a valid feature type"
                return False

            #performing prediction of labels using the trained model
            # the scaler fitted on the training features is applied to the test features
            if mlObj_var1.skl_classifiers[classifier[0]]['norm_feat_req']:
                features_test = sca.transform(features_test.astype(np.float))

            predicted = clf.predict(features_test)

            # storing the predictions of this fold at the positions of its test recordings
            label_list_pred[test_inds] = predicted
            for ii, pred_val in enumerate(predicted):
                predicted_raga[test_inds[ii]] = map_raga[pred_val]

        # counting the recordings whose predicted raga equals the ground truth raga
        cnt = 0
        for i in range(len(predicted_raga)):
            if raga_list[i] == predicted_raga[i]:
                cnt += 1
        print "You got %d number of ragas right for a total of %d number of recordings (Variant1)" % (
            cnt, len(predicted_raga))

        cMTC_var1 = confusion_matrix(label_list, label_list_pred)

        accuracy_var1 = float(cnt) / float(len(predicted_raga))

        ########################## End of variant 1 of cross fold testing ##################################
    # -1 marks "variant not executed"; it takes part in the final np.max
    accuracy_var2 = -1
    if var2:
        ########################### Performing cross fold train testing Variant 2 ###################################
        #In this variant tf-idf vectors are computed for the entire dataset. The only affect this will have is in the computatino of idf term.
        # This way is actually better because testing files will also contribute to the importance that is given to a work in the idf term.
        # Since computation of tf-idf is unsupervised (no raga label used) even this variant should be a valid experimental setup.
        count_vect = CountVectorizer(stop_words=stop_words)
        tfidf_transformer = TfidfTransformer(norm=norm_tfidf, smooth_idf=False)
        mlObj = ml.experimenter()
        # NOTE(review): the call that this closing parenthesis belongs to is not visible.
        )  #Note that only balanced classes option does stratified kfold exp
        #NOTE: balanceClasses will make sure each fold has equal number of samples from each class. and threre are equal number of feature instances per class

        docs_train = [
        ]  # this time we will use all the document in out dataset
        # NOTE(review): has_key argument, else branch and the append to docs_train are not visible.
        for ii in range(label_list.size):
            if per_rec_data.has_key(
            ):  #not every file has phrases found!! (there is one stupid file for which there are no phrases within this distance threshold)
                per_rec_words = ' '.join(
                    [com_id_2_uuid[p[0]] for p in per_rec_data[mbid_list[ii]]])
                per_rec_words = ''

        count_all = count_vect.fit_transform(docs_train)
        features = tfidf_transformer.fit_transform(count_all)
        # disabled branch (if False): would dump features and labels to a pickle and return early
        if False:
            dump = {
                'features': features.toarray(),
                'labels': np.array(label_list)
            pickle.dump(dump, open('features_dump_300_Hindustani.pkl', 'w'))
            return True

        # NOTE(review): the statements that run the experiment on mlObj are not visible here;
        # only its result attributes are read below.
        accuracy_var2 = mlObj.overallAccuracy

        print "You got %d number of ragas right for a total of %d number of recordings (Variant2)" % (
            np.round(mlObj.overallAccuracy * len(label_list)), len(label_list))

        ########################## End of variant 2 of cross fold testing ##################################

    ##saving experimental results
    # results are pickled into <out_dir>/experiment_results.pkl
    # NOTE(review): the statements that insert the 'var1'/'var2' entries into results are truncated.
    fid = open(os.path.join(out_dir, 'experiment_results.pkl'), 'w')
    results = {}
    if var1:
            'var1': {
                'cm': cMTC_var1,
                'gt_label': label_list,
                'pred_label': label_list_pred,
                'mbids': mbid_list,
                'mapping': map_raga,
                'accuracy': float(cnt) / len(predicted_raga)
    if var2:
            'var2': {
                'cm': mlObj.cMTXExp,
                'gt_label': mlObj.classLabelsInt,
                'pred_label': mlObj.decArray,
                'mbids': mbid_list,
                'mapping': mlObj.cNames,
                'accuracy': mlObj.overallAccuracy,
                'pf_accuracy': mlObj.accuracy

    pickle.dump(results, fid)
    #also dumping the input params to this function
    # the declared argument names are read via inspect and their current values taken from locals()
    params_input = {}
    for k in inspect.getargspec(raga_recognition_V2).args:
        params_input[k] = locals()[k]

    # the parameters are stored as JSON in <out_dir>/experiment_params.json
    fid = open(os.path.join(out_dir, 'experiment_params.json'), 'w')
    json.dump(params_input, fid)

    # best accuracy over the executed variants (a variant that was not run contributes -1)
    return np.max([accuracy_var1, accuracy_var2])