Exemple #1
0
def gettingSolu(loadingFolder,allDataFolder):
    global FEATURE_NUMBER
    #var pour indiquer si on prend un toy example ou pas
    puppet = False
    #tableau qui contiendra toutes les features de tlm pr voir lesquelles contiennent des NaN
    tabF= None
    ctrl=7; pheno = 6

    newFrameLot = None
    dataFolder=os.path.join(allDataFolder, 'raw')
    listD = os.listdir(dataFolder)
    for plate in listD:
        print plate
        listW = os.listdir(os.path.join(dataFolder, plate))
        for well in listW:
            well=well[:-5]
            print well
#            from trajPack import d_ctrl
#            if well not in d_ctrl[plate]:
#                if pheno<=0:
#                    print "------------------------PAS PRIS------------------------"
#                    continue
#                else:
#                    print "PHENO"
#                    pheno-=1
#            else:
#                if ctrl<=0:
#                    print "------------------------PAS PRIS------------------------"
#                    continue
#                else:
#                    print "CONTROL"
#                    ctrl-=1
                
            filename = os.path.join(dataFolder, plate,well+".hdf5")
            if puppet:
                filenameT = os.path.join(allDataFolder, 'puppet_trainingset', 'PL'+plate+"___P"+well+"___T00000.xml")
            else:
                filenameT =os.path.join(allDataFolder,'trainingset', 'PL'+plate+"___P"+well+"___T00000.xml")
            
            #ajout du frameLot et du tabF
            frameLotC, tabFC = gettingRaw(filename, filenameT, plate, well)
            if frameLotC==None:
                sys.stdout.write("File {} containing data for plate {}, well {} does not contain all necessary data".format(filename, plate, well))
                continue
            if newFrameLot == None:
                newFrameLot = frameLotC 
            else: newFrameLot.addFrameLot(frameLotC)
            tabF = tabFC if tabF == None else np.vstack((tabF, tabFC))
    
    print "final training set content :"
    count, total= newFrameLot.statisticsTraining2()
    print count, total
    
    #en ce qui concerne le nettoyage des NaN
    c, f = treatments.whichAreNan(tabF)
    print tabF.shape
    featuresToDelete = f.keys()
    
    newFrameLot.clean(featuresToDelete) ##np.delete(X, f, 1)
    FEATURE_NUMBER -=len(featuresToDelete)

    fichier = open(os.path.join(loadingFolder, "featuresToDelete.pkl"), 'w')
    pickle.dump(featuresToDelete, fichier)
    fichier.close()

    print FEATURE_NUMBER
    print "uplets now"
    #ICI ON RECUPERE DONC LES SINGLETS ET DOUBLETS AVEC LA VALEUR DU TRAINING DANS CELL.TO SI ILS Y SONT, NONE SINON
    #POUR LE CENTRE ET LES FEATURES C'EST LA MOYENNE DES OBJETS DU SINGLET
    singlets, doublets = newFrameLot.getTrainingUplets()

    return j(singlets, doublets, FEATURE_NUMBER)
def gettingSolu(plate, pheno_only=False):
    global monOutput
    global FEATURE_NUMBER
    global loadingFolder
    global first
    global ctrl; global pheno
    #tableau qui contiendra toutes les features de tlm pr voir lesquelles contiennent des NaN
    tabF= None
    
    print "current directory ", os.getcwd()
    fichier = open(loadingFolder+"featuresToDelete.pkl", 'r')
    f = pickle.load(fichier)
    fichier.close()
    
    #print os.getcwd()
    newFrameLot = None

    print plate
    listW = os.listdir('/media/lalil0u/New/workspace2/Tracking/data/raw/'+plate)
    for well in listW:
        well=well[:-5]
        print well
        if pheno_only:
            if well not in d_ctrl[plate]:
                if pheno>0:
                    print "------------------------PAS PRIS------------------------"
                    pheno-=1
                    continue
                else:
                    print "PHENO"
            else:
                if ctrl>0:
                    print "------------------------PAS PRIS------------------------"
                    ctrl-=1
                    continue
                else:
                    print "CONTROL"
                
        filename = '/media/lalil0u/New/workspace2/Tracking/data/raw/'+plate+"/"+well+".hdf5"
        filenameT = '/media/lalil0u/New/workspace2/Tracking/data/TSinclMD/PL'+plate+"___P"+well+"___T00000.xml"

        monOutput+="plate = "+plate+",well = "+well+"\n"
        #ajout du frameLot et du tabF
        frameLotC, tabFC = test.gettingRaw(filename, filenameT, plate, well)
        if newFrameLot == None:
            newFrameLot = frameLotC 
        else: newFrameLot.addFrameLot(frameLotC)
        tabF = tabFC if tabF == None else np.vstack((tabF, tabFC))
    
    #    print "final training set content :"
    #    count, total= newFrameLot.statisticsTraining2()
    #    print count, total
    if newFrameLot is None:
        return None, None
        
    c, f2 = treatments.whichAreNan(tabF)
    #print len(f2.keys()), len(f)
#    if len(f)<len(f2.keys()):
#        pdb.set_trace()
#        
    #if there are features with NaN entries in the predict data but not in the training data
    toZeros = filter(lambda x: x not in f, f2.keys())
    #pdb.set_trace()
    if toZeros !=[]:
        msg="WARNING WARNING WARNING, some features here have NaN entries, and this was not the case in the training set. They are put to 0"
        warnings.warn(msg)
        newFrameLot.zeros(toZeros)
    newFrameLot.clean(f)
    if first:
        FEATURE_NUMBER -=len(f)
    
    print "Feature number", FEATURE_NUMBER
    print "Getting all uplets now"
    print "TIME TIME TIME", time.clock()
    #ICI ON RECUPERE DONC LES SINGLETS ET DOUBLETS AVEC LA VALEUR DU TRAINING DANS CELL.TO SI ILS Y SONT, NONE SINON
    #POUR LE CENTRE ET LES FEATURES C'EST LA MOYENNE DES OBJETS DU SINGLET
    singlets, doublets = newFrameLot.getAllUplets()
    #pour l'instant je ne garde que le passage de l'image 0 a 1
    print "TIME TIME TIME after getting all uplets", time.clock()
    print "Joining uplets now"
    
    solutions = joining.j(singlets, doublets, FEATURE_NUMBER, training = False)
    print "TIME TIME TIME after joining", time.clock()
    
    return solutions, newFrameLot
def gettingSolu():
    global monOutput
    global FEATURE_NUMBER
    #var pour indiquer si on prend un toy example ou pas
    puppet = False
    #tableau qui contiendra toutes les features de tlm pr voir lesquelles contiennent des NaN
    tabF= None
    
    #print os.getcwd()
    newFrameLot = None
    listD = os.listdir('/media/lalil0u/New/workspace2/Tracking/data/raw')
    for plate in listD:
        print plate
        listW = os.listdir('/media/lalil0u/New/workspace2/Tracking/data/raw/'+plate)
        for well in listW:
            well=well[:-5]
            print well
            filename = '/media/lalil0u/New/workspace2/Tracking/data/raw/'+plate+"/"+well+".hdf5"
            if puppet:
                filenameT = '/media/lalil0u/New/workspace2/Tracking/data/puppet_trainingset/PL'+plate+"___P"+well+"___T00000.xml"
            else:
                filenameT = '/media/lalil0u/New/workspace2/Tracking/data/trainingset/PL'+plate+"___P"+well+"___T00000.xml"
            
            
    
            monOutput+="plate = "+plate+",well = "+well+"\n"
            #ajout du frameLot et du tabF
            frameLotC, tabFC = gettingRaw(filename, filenameT, plate, well)
            if newFrameLot == None:
                newFrameLot = frameLotC 
            else: newFrameLot.addFrameLot(frameLotC)
            tabF = tabFC if tabF == None else np.vstack((tabF, tabFC))
    
    print "final training set content :"
    count, total= newFrameLot.statisticsTraining2()
    print count, total
    
    #en ce qui concerne le nettoyage des NaN
    c, f = treatments.whichAreNan(tabF)
    print tabF.shape
    featuresToDelete = f.keys()
    
#    for morpho in filter(lambda x : x not in featuresToDelete, l_indexes):
#        featuresToDelete.append(morpho)
    tabF = treatments.clean(tabF, f.keys())
    
    newFrameLot.clean(featuresToDelete) ##np.delete(X, f, 1)
    FEATURE_NUMBER -=len(featuresToDelete)
    fichier = open("../results/featuresToDelete.pkl", 'w')
    pickle.dump(featuresToDelete, fichier)
    fichier.close()

    print FEATURE_NUMBER
    #lstCellsT, lstCellsF, X, Y, Xz, Z = newFrameLot.getTraining2()
    print "uplets now"
    #ICI ON RECUPERE DONC LES SINGLETS ET DOUBLETS AVEC LA VALEUR DU TRAINING DANS CELL.TO SI ILS Y SONT, NONE SINON
    #POUR LE CENTRE ET LES FEATURES C'EST LA MOYENNE DES OBJETS DU SINGLET
    singlets, doublets = newFrameLot.getTrainingUplets()
    #print "la je dois retrouver le training normalement :"
    ##SACHANT QUE LES EVENEMENTS A PLUS DE DEUX NE SONT PAS PRIS EN COMPTE
    #print "SINGLETS"
    #merge = 0
    #move = 0
    #split = 0
    #dis = 0
    #app = 0
    #for p in singlets:
    #    for w in singlets[p]:
    #        for i in singlets[p][w]:
    #            sin = singlets[p][w][i]
    #            print p, w, i
    #            out = ""
    #            for s in sin:
    #                out+=str(s.label)+" "
    #                if s.to is not None and len(s.to)>1 and s.label !=-1: 
    #                    #print "-----------------split", s.label, s.to
    #                    split+=1
    #                elif s.to==(-1,): dis+=1
    #                elif s.to is not None and s.label == -1 : 
    #                    app+=len(s.to)
    #                    print "**********appear", s.label, s.to
    #                elif s.to is not None and len(s.to)==1: move +=1
    #                elif s.to is not None: print "surprise", s.to
    #                try: 
    #                    if len(s.fr)>1:
    #                        print "la il y a un merge que je devrais retrouver dans la liste des doublets", s.fr
    #                except TypeError:
    #                    if s.label !=-1:
    #                        print "type error"
    #                finally:
    #                    pass
    #            print out
    #print move, split, dis, app
    #print "DOUBLETS"
    #for p in doublets:
    #    for w in doublets[p]:
    #        for i in doublets[p][w]:
    #            sin = doublets[p][w][i]
    #            #print p, w, i
    #            for s in sin:
    #                #print s.label, s.to
    #                if s.to is not None: merge+=1
    #print merge
    
    #f, featuresNames = treatments.returnBadFeatures()
    #VOIR AUSSI : EST-CE QUE L'ON NORMALISE ?
#    featuresToKeep = [221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238]
    solutions = joining.j(singlets, doublets, FEATURE_NUMBER)
    #la j'ecris tlm ds un fichier txt au cas ou
#    file = open("results/outputSoluUnNorm.txt", "w")
#    file.write(solutions.output())
#    file.close()
    
    return solutions
def gettingSolu(loadingFolder, hdf5Folder = '/media/lalil0u/New/workspace2/Tracking/data/predict/', plate=None, wellL=None, training = False, first=True, xb_screen=False):
    global monOutput
    global FEATURE_NUMBER
    
    tabF = None
    #var pour indiquer si on prend un toy example ou pas
    #tableau qui contiendra toutes les features de tlm pr voir lesquelles contiennent des NaN
#    print "current directory ", os.getcwd()
    fichier = open(loadingFolder+"featuresToDelete.pkl", 'r')
    f = pickle.load(fichier)
    fichier.close()

    newFrameLot = None
        
    listP = os.listdir(hdf5Folder)
    if plate is not None:
        listP=[plate]
        
    for plate in listP:
        print plate
        listW = sorted(os.listdir(os.path.join(hdf5Folder, plate, 'hdf5')))
        if wellL is not None:
            listW = wellL
        for well in listW[:18]:
            well=well.split('_')[0]+'_'+well.split('_')[1][:2]
            if not xb_screen:
                filename = os.path.join(hdf5Folder, plate, 'hdf5', well+".hdf5")
            else:
                filename = os.path.join(hdf5Folder, plate, 'hdf5', well+".ch5")
            print well

                                
            if training:
                filenameT = '/media/lalil0u/New/workspace2/Tracking/data/trainingset/PL'+plate+"___P"+well+"___T00000.xml"
            else:
                filenameT = None
                
            monOutput+="plate = "+plate+",well = "+well+"\n"
            #ajout du frameLot et du tabF
            frameLotC, tabFC = gettingRaw(filename, filenameT, plate, well, name_primary_channel='primary__primary3')
            if newFrameLot == None:
                newFrameLot = frameLotC 
            else: newFrameLot.addFrameLot(frameLotC)
            tabF = tabFC if tabF == None else np.vstack((tabF, tabFC))
    
    #en ce qui concerne le nettoyage des NaN
    c, f2 = treatments.whichAreNan(tabF)
    print len(f2.keys()), len(f)
    #if there are features with NaN entries in the predict data but not in the training data
    toZeros = filter(lambda x: x not in f, f2.keys())
    if toZeros !=[]:
        print "Attention attention, some features here have NaN entries, and this was not the case in the training set. They are put to 0"
        newFrameLot.zeros(toZeros)
    
#    if f!= f2.keys():
#        featuresNames = imp.importFeaturesNames(filename)
#        print filter(lambda x: x not in f, f2.keys())
#        pdb.set_trace()
    newFrameLot.clean(f) ##np.delete(X, f, 1)
    if first:
        FEATURE_NUMBER -=len(f)

    #FEATURE_NUMBER -=len(bou)
    
    print FEATURE_NUMBER
    print "Getting all uplets now"
    print "TIME TIME TIME", time.clock()
    #ICI ON RECUPERE DONC LES SINGLETS ET DOUBLETS AVEC LA VALEUR DU TRAINING DANS CELL.TO SI ILS Y SONT, NONE SINON
    #POUR LE CENTRE ET LES FEATURES C'EST LA MOYENNE DES OBJETS DU SINGLET
    if training == False:
        singlets, doublets = newFrameLot.getAllUplets(loadingFolder)
    else:
        singlets, doublets = newFrameLot.getTrainingUplets()
    print "TIME TIME TIME after getting all uplets", time.clock()
    print "joining uplets now"

    solutions = j(singlets, doublets, FEATURE_NUMBER, training)
    print "TIME TIME TIME after joining", time.clock()
    print "normalization"
    
    fichier = open(loadingFolder+"minMax_data_all.pkl", "r")  
    minMax = pickle.load(fichier)
    fichier.close()
    solutions.normalisation(minMax)
    print "TIME TIME TIME after normalization", time.clock()
    
    return solutions