def _dataPrep(self, pcaParameter):
        """Collect per-experiment feature values and bin them into histograms.

        A random sample of the controls returned by ``appendingControl`` /
        ``countingDone`` (at most 20% of the current number of experiments) is
        appended to ``self.expList`` IN PLACE, then the feature matrix for the
        whole list is loaded with ``histConcatenation`` and split per
        experiment and per feature of ``self.currInterestFeatures``. The
        result is binned by ``computingBins`` using the binning previously
        pickled in ``self.settings.result_folder``.

        :param pcaParameter: not used by this method.
        :return: the ``(histogrammes, bins)`` pair returned by ``computingBins``.
        """
        histDict = defaultdict(list)

        ctrlExp = appendingControl(self.expList)
        ctrlExp = countingDone(ctrlExp)
        np.random.shuffle(ctrlExp)
        ctrlExp = ctrlExp[: int(0.2 * len(self.expList))]
        if self.verbose:
            print(ctrlExp)
        self.expList.extend(ctrlExp)

        _, r, _, _, _, length, _, _, _ = histConcatenation(
            self.settings.data_folder,
            self.expList,
            self.settings.mitocheck_file,
            self.settings.quality_control_file,
            verbose=self.verbose,
        )

        # Column of each feature of interest in r, looked up once instead of
        # once per experiment.
        featureColumns = [(feature, featuresSaved.index(feature)) for feature in self.currInterestFeatures]

        # length[i] is used as the number of consecutive rows of r belonging to
        # the i-th experiment (assumed to be integer counts - TODO confirm).
        # A running integer offset replaces np.sum(length[:i]), which was
        # recomputed for every experiment and yields the float 0.0 for an
        # empty list, a value recent NumPy versions refuse as a slice bound.
        start = 0
        for nRows in length:
            stop = start + int(nRows)
            for feature, column in featureColumns:
                histDict[feature].append(r[start:stop, column])
            start = stop

        binsPath = os.path.join(
            self.settings.result_folder, "distExp_ctrl_{}_{}.pkl".format(self.bins_type, self.bin_size)
        )
        # Binary mode is what pickle expects; the context manager closes the
        # file even if unpickling fails.
        with open(binsPath, "rb") as f:
            bins = pickle.load(f)

        histogrammes, bins = computingBins(
            histDict,
            [self.bin_size] * len(self.currInterestFeatures),
            self.bins_type,
            previous_binning=bins,
        )
        print(histogrammes.shape)
        return histogrammes, bins
def _writeExecutableScript(script_name, content):
    """Write content to script_name and make the file executable for everybody."""
    with open(script_name, "w") as script_file:
        script_file.write(content)
    # Same effect as `chmod a+x` (without this, the cluster node cannot call
    # the script), but without building a shell command from the path.
    os.chmod(script_name, os.stat(script_name).st_mode | 0o111)


def _writeArrayLauncher(baseName):
    """Write the launcher <scriptFolder>/<baseName>.sh of a job array and return its path.

    The launcher runs <scriptFolder>/<baseName>$<pbsArrayEnvVar>.sh, i.e. the
    per-task script whose numeric suffix is the value of that variable.
    """
    array_script_name = '%s.sh' % os.path.join(scriptFolder, baseName)
    main_content = """#!/bin/sh
%s
#$ -o %s
#$ -e %s
%s$%s.sh
""" % (path_command,
       pbsOutDir,
       pbsErrDir,
       os.path.join(scriptFolder, baseName),
       pbsArrayEnvVar)
    _writeExecutableScript(array_script_name, main_content)
    return array_script_name


def globalSummaryScript(baseName,  siRNAFile,
                        n_clusters_min, n_clusters_max,
                       div_name,  lambda_,  weights, 
                       bins_type,  bin_size,  cost_type,
                       batch_size,  n_init,  init, 
                       ddim, iter_=0):
    """Generate the shell scripts of two job arrays and print their qsub commands.

    A. One script per (plate, well) of every siRNA listed in siRNAFile, with
       the command returned by plateWellSummaryScript.
    B. The same for a random sample of controls (at most 20% of the number of
       experiments found in A).
    C. One clustering script per number of clusters in
       range(n_clusters_min, n_clusters_max); the list of experiments and
       controls is pickled to exp_Simpson_<timestamp>.pkl in the current
       working directory and passed to them with --experimentFile.

    Nothing is submitted: the two qsub commands are only printed.

    :param baseName: prefix of the generated script names; iter_, the first
        five characters of div_name, weights, bins_type, bin_size and
        cost_type are appended to it.
    :param siRNAFile: path of a pickled list of siRNAs.
    :param n_clusters_min: first number of clusters (inclusive).
    :param n_clusters_max: last number of clusters (exclusive).
    :param div_name, lambda_, weights, bins_type, bin_size, cost_type,
        batch_size, n_init, init, ddim, iter_: forwarded to
        plateWellSummaryScript; all but lambda_ and n_init also go on the
        clustering command line.
    :return: 1
    """
    # The context manager closes the file even if unpickling fails; binary
    # mode is what pickle expects.
    with open(siRNAFile, 'rb') as f:
        siRNAList = pickle.load(f)

    siExpDict = expSi(qc = quality_control_file , sens=0)
    jobCount = 0
    total_expList = []
    head = """#!/bin/sh
cd %s""" %progFolder
    jobBaseName = baseName+'{}{}_w{}_{}_{}_{}'.format(iter_,div_name[:5], weights, bins_type, bin_size, cost_type)

    def writeJobScript(plate, well, index):
        # The script file is called <jobBaseName><index>.sh, where index is
        # 1, 2, 3, ... and corresponds to the task index of the job array.
        cmd = plateWellSummaryScript(plate, well, div_name, lambda_, weights, bins_type, bin_size, cost_type, batch_size, n_init, init, ddim, iter_)
        _writeExecutableScript(os.path.join(scriptFolder, jobBaseName+'{}.sh'.format(index)), head + cmd)

#A. DEALING WITH EXPERIMENTS
    for siRNA in siRNAList:
        try:
            expList = siExpDict[siRNA]
        except KeyError:
            print("siRNA not in siRNA-experiment dictionary")
            continue
        expList = strToTuple(expList, os.listdir(data_folder))
        total_expList.extend(expList)
        for plate, well in expList:
            jobCount += 1
            writeJobScript(plate, well, jobCount)

#B. DEALING WITH CONTROLS
    ctrlExp = appendingControl(total_expList)
    ctrlExp = countingDone(ctrlExp)
    np.random.shuffle(ctrlExp)
    ctrlExp = ctrlExp[:int(0.2*len(total_expList))]
    for plate, well in ctrlExp:
        jobCount += 1
        writeJobScript(plate, well, jobCount)

    # write the main script
    array_script_name = _writeArrayLauncher(jobBaseName)
    sub_cmd = 'qsub -t 1-%i %s' % (jobCount, array_script_name)

    print(sub_cmd)

#C. DOING EXPERIMENT CLUSTERING STEP
    expFilename = 'exp_Simpson_{}.pkl'.format(int(time.time()))
    total_expList.extend(ctrlExp)
    with open(expFilename, 'wb') as f:
        pickle.dump(total_expList, f)

    clusteringBaseName = jobBaseName+'_clustering'
    # Scripts are numbered from 1 because the array below is submitted with
    # task ids 1..(n_clusters_max - n_clusters_min). Numbering them from 0, as
    # was done before, left the first script without a task and the last task
    # without a script.
    for taskId, n_clusters in enumerate(range(n_clusters_min, n_clusters_max), 1):
        cmd="""
    python tracking/histograms/summarization_clustering.py -a clustering --experimentFile %s -k %i --ddimensional %i --bins_type %s --cost_type %s --bin_size %i --div_name %s -w %i --init %s --batch_size %i --iter %i
    """
        cmd %= (
                expFilename,
                n_clusters,
                ddim,
                bins_type,
                cost_type,
                bin_size,
                div_name,
                weights,
                init,
                batch_size,
                iter_
            )
        _writeExecutableScript(os.path.join(scriptFolder, clusteringBaseName+'{}.sh'.format(taskId)), head + cmd)

    # write the main script
    array_script_name = _writeArrayLauncher(clusteringBaseName)
    # NOTE(review): no job id follows -hold_jid; presumably the id of the
    # first array job is to be inserted by hand before submitting - confirm.
    sub_cmd = 'qsub -hold_jid  -t 1-%i %s' % (n_clusters_max - n_clusters_min, array_script_name)

    print(sub_cmd)

#D. GOING BACK TO EXPERIMENTS AND TESTING IF DIFFERENT FROM CONTROLS

    return 1