Example #1
def clustpipeline(datapath, mapfile=None, replicatesfile=None, normalisationfile=['1000'], outpath=None,
                  Ks=[n for n in range(4, 21, 4)], tightnessweight=1, stds=3.0,
                  OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"), atleastinconditions=0,
                  atleastindatasets=0, absvalue=False, filteringtype='raw', filflat=True, smallestClusterSize=11,
                  ncores=1, optimisation=True, Q3s=2, methods=None, deterministic=False):
    # Set the global objects label
    if mapfile is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    # Output: Prepare the output directory and the log file
    if outpath is None:
        outpathbase = os.getcwd()
        #outpathbase = os.path.abspath(os.path.join(datapath, '..'))
        outpathbase = '{0}/Results_{1}'.format(outpathbase, dt.datetime.now().strftime('%d_%b_%y'))
        outpath = outpathbase
        trial = 0
        while os.path.exists(outpath):
            trial += 1
            outpath = '{0}_{1}'.format(outpathbase, trial)
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    glob.set_logfile(os.path.join(outpath, 'log.txt'))
    glob.set_tmpfile(os.path.join(outpath, 'tmp.txt'))

    # Output: Copy input files to the output
    in2out_path = outpath + '/Input_files_and_params'
    if not os.path.exists(in2out_path):
        os.makedirs(in2out_path)

    if mapfile is not None:
        shutil.copy(mapfile, os.path.join(in2out_path, 'Map.txt'))
    if replicatesfile is not None:
        shutil.copy(replicatesfile, os.path.join(in2out_path, 'Replicates.txt'))
    if normalisationfile is not None:
        if len(normalisationfile) == 1 and not nu.isint(normalisationfile[0]):
            shutil.copy(normalisationfile[0], os.path.join(in2out_path, 'Normalisation.txt'))

    in2out_X_unproc_path = in2out_path + '/Data'
    if not os.path.exists(in2out_X_unproc_path):
        os.makedirs(in2out_X_unproc_path)
    if os.path.isfile(datapath):
        shutil.copy(datapath, in2out_X_unproc_path)
    elif os.path.isdir(datapath):
        for df in io.getFilesInDirectory(datapath):
            shutil.copy(os.path.join(datapath, df), in2out_X_unproc_path)
    else:
        raise ValueError('Data path {0} does not exist. Provide either a path '.format(datapath) + \
                         'to a data file or a path to a directory containing data file(s)')


    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Read data
    io.log('1. Reading dataset(s)')
    (X, replicates, Genes, datafiles) = io.readDatasetsFromDirectory(datapath, delimiter='\t| |, |; |,|;', skiprows=1, skipcolumns=1,
                                                                     returnSkipped=True)
    datafiles_noext = [os.path.splitext(d)[0] for d in datafiles]

    # Read map, replicates, and normalisation files:
    Map = io.readMap(mapfile)
    (replicatesIDs, conditions) = io.readReplicates(replicatesfile, datapath, datafiles, replicates)
    normalise = io.readNormalisation(normalisationfile, datafiles)

    # Preprocessing (mapping, then top-level preprocessing: summarising replicates,
    # filtering low-expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets, absvalue=absvalue,
                      filteringtype=filteringtype, filterflat=filflat, params=None, datafiles=datafiles)
    io.writedic('{0}/Normalisation_actual.txt'.format(outpath), applied_norms, delim='\t')
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # Output: Save processed data
    Xprocessed = op.processed_X(X_summarised_normalised, conditions, GDM, OGs, MapNew, MapSpecies)  # pandas DataFrames
    X_proc_path = outpath + '/Processed_Data'
    if not os.path.exists(X_proc_path):
        os.makedirs(X_proc_path)
    for l in range(len(datafiles)):
        Xprocessed[l].to_csv('{0}/{1}_processed.tsv'.format(X_proc_path, datafiles[l]),
                             sep='\t', encoding='utf-8', index=None, columns=None, header=False)
        #np.savetxt('{0}/{1}_processed.tsv'.format(X_proc_path, datafiles[l]), Xprocessed[l], fmt='%s', delimiter='\t')


    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize, Xnames=datafiles_noext,
                             ncores=ncores)

    # Post-processing
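    # ppmethod selects the optimisation variant applied below: 'tukey_sqrtSCG'
    # (a Tukey-fence-based correction controlled by tails and Q3s) or
    # 'weighted_outliers' (a standard-deviation-based correction controlled by stds)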
    ppmethod = 'tukey_sqrtSCG'
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_corrected = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds, smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_corrected = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_corrected, X_summarised_normalised, GDM)
            except Exception:
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
                B_corrected = mnres.B
        else:
            B_corrected = mnres.B
    else:
        io.log('5. Skipping cluster optimisation and completion')
        B_corrected = mnres.B


    # Output: Write input parameters:
    io.log('6. Saving results in\n{0}'.format(outpath))
    inputparams = op.params(mnres.params, Q3s, OGsIncludedIfAtLeastInDatasets,
                            expressionValueThreshold, atleastinconditions, atleastindatasets,
                            deterministic, ures.params['methods'], MapNew)
    io.writedic('{0}/input_params.tsv'.format(in2out_path), inputparams, delim='\t')

    # Output: Generating and saving clusters
    res_og = op.clusters_genes_OGs(B_corrected, OGs, MapNew, MapSpecies, '; ')  # pandas DataFrame
    if mapfile is None:
        res_og.to_csv('{0}/Clusters_Objects.tsv'.format(outpath), sep='\t',
                      encoding='utf-8', index=None, columns=None, header=False)
        #np.savetxt('{0}/Clusters_Objects.tsv'.format(outpath), res_og, fmt='%s', delimiter='\t')
    else:
        res_og.to_csv('{0}/Clusters_OGs.tsv'.format(outpath), sep='\t',
                      encoding='utf-8', index=None, columns=None, header=False)
        #np.savetxt('{0}/Clusters_OGs.tsv'.format(outpath), res_og, fmt='%s', delimiter='\t')
        res_sp = op.clusters_genes_Species(B_corrected, OGs, MapNew, MapSpecies)  # pandas DataFrame
        for sp in range(len(res_sp)):
            res_sp[sp].to_csv('{0}/Clusters_{1}.tsv'.format(outpath, MapSpecies[sp]), sep='\t',
                              encoding='utf-8', index=None, columns=None, header=False)
            #np.savetxt('{0}/Clusters_{1}.tsv'.format(outpath, MapSpecies[sp]), res_sp[sp], fmt='%s', delimiter='\t')

    # Output: Save figures to a PDF

    try:
        if np.shape(B_corrected)[1] > 0:
            clusts_fig_file_name = '{0}/Clusters_profiles.pdf'.format(outpath)
            graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions, clusts_fig_file_name,
                               GDM=GDM, Cs='all', setPageToDefault=True, printToPDF=True, showPlots=False)
    except Exception:
        io.log('Error: could not save the cluster plots to a PDF file.\n'
               'Resuming producing the other results files ...')

    # Output: Generating and writing eigengenes
    try:
        if np.shape(B_corrected)[1] > 0:
            if len(X_summarised_normalised) == 1:
                eigengene_matrix = eig.eigengenes_dataframe(X_summarised_normalised, B_corrected, conditions)
                eigengene_matrix.to_csv('{0}/Eigengenes.tsv'.format(outpath), sep='\t',
                                        encoding='utf-8')
            else:
                io.log('Eigengene computation is currently not supported for multiple datasets.')
    except Exception:
        io.log('Error: could not save the eigengenes to a file.\n'
               'Resuming producing the other results files ...')

    # Output: Prepare the summary and the final message, save the summary to a file, and print the message
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                      ures, mnres, B_corrected, starttime)
    summary = op.summarise_results(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                   ures, mnres, B_corrected, starttime, endtime, timeconsumedtxt)
    io.writedic(outpath + '/Summary.tsv', summary, delim='\t')
    io.log(summarymsg, addextrastick=False)

    io.deletetmpfile()
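
A minimal usage sketch for the pipeline above, assuming the clust package (https://github.com/BaselAbujamous/clust) is installed; the import path, file paths, and parameter values below are illustrative assumptions, not part of the example.

# Hypothetical usage sketch; adjust the import to wherever clustpipeline
# is defined in your installation of clust.
from clustpipeline import clustpipeline

clustpipeline(
    datapath='data/',        # a data file, or a directory of data files
    mapfile=None,            # no OG map, so objects are labelled as genes
    Ks=[4, 8, 12, 16, 20],   # the default K values, written out explicitly
    ncores=4,                # parallelise the base clustering over 4 cores
    outpath='results/')      # created if it does not already exist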
Example #2
def uncles(X,
           type='A',
           Ks=[n for n in range(4, 21, 4)],
           params=None,
           methods=None,
           methodsDetailed=None,
           U=None,
           Utype='PM',
           relabel_technique='minmin',
           setsP=None,
           setsN=None,
           dofuzzystretch=False,
           wsets=None,
           wmethods=None,
           GDM=None,
           smallestClusterSize=11,
           CoPaMfinetrials=1,
           CoPaMfinaltrials=1,
           binarise_techniqueP='DTB',
           binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'),
           binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(([sys.float_info.epsilon],
                                           np.arange(0.1,
                                                     1.1,
                                                     0.1,
                                                     dtype='float'))),
           Xnames=None,
           deterministic=False,
           ncores=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None: params = {}
    if setsP is None: setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None: setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of datasets
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        wsets = np.array(wsets)[setsPN]
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    if methods is None:
        methods = [['k-means']]
        # largest_DS = np.max([x.shape[0] for x in Xloc])
        # if (largest_DS <= maxgenesinsetforpdist):
        #    methods = [['k-means'], ['HC']]
        # else:
        #    methods = [['k-means']]
    else:
        largest_DS = np.max([x.shape[0] for x in Xloc])
        if (largest_DS > maxgenesinsetforpdist):
            methods = [
                m for m in methods
                if 'hc' not in [entry.lower() for entry in m]
            ]
            if not methods:
                io.log('No valid base clustering can be used. Note that clust does not use HC clustering ' \
                       'on datasets with more than {0} genes. You have a dataset with {1} genes.' \
                       ''.format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()
    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for l in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]
    if wmethods is None:
        wmethods = [[1 for x in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        # A flat list of per-method weights: replicate it for every dataset
        wmethods = np.tile(wmethods, [L, 1])
    else:
        wmethods = np.array(wmethods)[setsPN]

    setsPloc = [ii for ii in range(len(setsP))]
    setsNloc = [ii for ii in range(len(setsPloc), L)]  # empty if there are no negative sets

    Ks = np.array(Ks)
    Ks = Ks[Ks <= Ng]  # Remove Ks that are larger than the number of genes Ng
    Ks = Ks.tolist()
    NKs = len(Ks)  # Number of K values

    # If the dataset is empty, return basic output
    if Ng == 0:
        NPp = len(binarise_paramP)  # Number of P params
        NNp = len(binarise_paramN)  # Number of N params
        if type == 'A':
            B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        elif type == 'B':
            B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

        params = dict(params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'L': L,
            'CoPaMs': np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs]),
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc
        })

        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])

        UnclesRes = collections.namedtuple('UnclesRes',
                                           ['B', 'Mc', 'params', 'X', 'U'])
        return UnclesRes(B, Mc, params, Xloc, Uloc)

    # Clustering
    if U is None:
        Utype = 'PM'
        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])
        totalparallel = np.sum(Ks) * np.sum(
            [len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
                    continue
        io.resetparallelprogress(totalparallel)

        for l in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[l],
                                 Ks,
                                 methodsDetailedloc[l],
                                 datasetID=l)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)(
                    delayed(clustDataset)(Xloc[l], Ks[ki], methodsDetailedloc[l], GDMloc[:, l], Ng, l)
                    for ki in range(NKs))

                for ki in range(NKs):
                    Uloc[l, ki] = Utmp[ki]

                gc.collect()
                #io.updateparallelprogress(np.sum(Ks) * len(methodsDetailedloc))

    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    # Calculate a CoPaM for each dataset at each K
    CoPaMsFine = np.array([None] * (L * NKs)).reshape([L, NKs])
    for l in range(L):
        for ki in range(NKs):
            if Utype.lower() == 'pm':
                CoPaMsFineTmp = [
                    generateCoPaM(Uloc[l, ki],
                                  relabel_technique=relabel_technique,
                                  X=[Xloc[l]],
                                  w=wmethods[l],
                                  K=Ks[ki],
                                  GDM=GDMloc[:, l].reshape([-1, 1]))
                    for i in range(CoPaMfinetrials)
                ]
            elif Utype.lower() == 'idx':
                CoPaMsFineTmp = \
                    [generateCoPaMfromidx(Uloc[l, ki], relabel_technique=relabel_technique, X=Xloc,
                                          w=wmethods[l], K=Ks[ki])
                     for i in range(CoPaMfinetrials)]
            else:
                raise ValueError('Invalid Utype')
            CoPaMsFine[l, ki] = generateCoPaM(CoPaMsFineTmp,
                                              relabel_technique=relabel_technique,
                                              X=[Xloc[l]],
                                              GDM=GDMloc[:, l].reshape([-1, 1]))

            if dofuzzystretch:
                CoPaMsFine[l, ki] = fuzzystretch(CoPaMsFine[l, ki])

    # Calculate the final CoPaM for each K
    CoPaMs = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    CoPaMsP = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    CoPaMsN = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                if Utype.lower() == 'pm':
                    CoPaMs[t, ki] = generateCoPaM(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        w=wsets,
                        X=Xloc,
                        GDM=GDMloc)
                elif Utype.lower() == 'idx':
                    CoPaMs[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets,
                        GDM=GDMloc)
                else:
                    raise ValueError('Invalid Utype')
            elif type == 'B':
                if Utype.lower() == 'pm':
                    CoPaMsP[t, ki] = generateCoPaM(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaM(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                elif Utype.lower() == 'idx':
                    CoPaMsP[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                else:
                    raise ValueError('Invalid Utype')
            else:
                raise ValueError("Invalid UNCLES type. It must be either 'A' or 'B'")

    # Binarise
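    # The fuzzy CoPaM at each K is binarised into hard memberships at every
    # threshold in binarise_paramP (and binarise_paramN for type 'B'). Cluster
    # sizes (Mc) are computed, clusters are sorted by size, and binarisation is
    # repeated so that B reflects the sorted cluster order.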
    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params
    if type == 'A':
        B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
    elif type == 'B':
        B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], Mc[t, ki],
                                             smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            elif type == 'B':
                # Pre-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP,
                                              smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN,
                                              smallestClusterSize)

                # Post-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # UNCLES B logic: start from the positive binarisation, then
                # remove any gene that belongs to some negative cluster
                for pp in range(NPp):
                    for pn in range(NNp):
                        B[t, pp, pn, ki] = np.array(BP[pp])  # copy to avoid aliasing across pn
                        B[t, pp, pn, ki][np.any(BN[pn], axis=1)] = False

                # Fill Mc with the cluster sizes at each (P, N) parameter pair
                Mc[t, ki] = [None] * Ks[ki]
                for k in range(Ks[ki]):
                    Mc[t, ki][k] = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            Mc[t, ki][k][pp, pn] = np.sum(B[t, pp, pn, ki][:, k])

    # Prepare and return the results:
    params = dict(
        params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'L': L,
            'CoPaMs': CoPaMs,
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc
        })

    UnclesRes = collections.namedtuple('UnclesRes',
                                       ['B', 'Mc', 'params', 'X', 'U'])
    return UnclesRes(B, Mc, params, Xloc, Uloc)
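
A minimal sketch of calling uncles directly on in-memory arrays, as the pipeline above does via unc.uncles; the import path is an assumption and the random toy data only illustrate the expected genes-by-samples shapes.

import numpy as np

# Hypothetical import path; adjust to your module layout.
from uncles import uncles

rng = np.random.default_rng(0)
# Two toy datasets over the same 200 genes, with 6 and 9 samples
X = [rng.normal(size=(200, 6)), rng.normal(size=(200, 9))]
res = uncles(X, type='A', Ks=[4, 8], ncores=1, deterministic=True)
# res.B is indexed [final trial, P parameter, N parameter, K]; the N-parameter
# axis has size 1 for type 'A'. res.params['GDM'] is the gene-dataset matrix.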
Example #3
def runclust(X, Map=None, replicatesIDs=None, normalise=1000,
             Ks=[n for n in range(4, 21, 4)], tightnessweight=1, stds=3.0,
             OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"), atleastinconditions=0,
             atleastindatasets=0, absvalue=False, filteringtype='raw', filflat=True, smallestClusterSize=11,
             ncores=1, optimisation=True, Q3s=2, methods=None, deterministic=False, showPlots=True,
             printToConsole=True):

    # Set global print/log behaviour and the objects label
    glob.set_print_to_log_file(False)
    glob.set_print_to_console(printToConsole)
    if Map is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    glob.set_tmpfile('clust_tmp.txt')


    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Treat X as a list of arrays or DataFrames; if it is not one already, wrap it as such first.
    # If the user entered a single dataset (not a list of arrays), record that in a flag
    # so the result is returned as a single output
    input_is_one_dataset = False
    if isinstance(X, pd.DataFrame):
        input_is_one_dataset = True
        X = [X]
    elif isinstance(X, np.ndarray) and ds.maxDepthOfArray(X) == 2:
        input_is_one_dataset = True
        X = [X]

    # Format data (X: list of arrays, Genes: list of arrays of strings, replicates: list of arrays of strings)
    L = len(X)  # Number of datasets
    replicates = [None] * L
    Genes = [None] * L
    io.log('1. Reading dataset(s)')
    for l in range(L):
        if isinstance(X[l], pd.DataFrame):
            Genes[l] = np.array(X[l].index, dtype=str, ndmin=2).transpose()
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(X[l].columns, dtype=str)
            X[l] = X[l].values
        else:
            X[l] = np.array(X[l])

            ngenes_digits = int(math.ceil(math.log10(X[l].shape[0])))
            nreps_digits = int(math.ceil(math.log10(X[l].shape[1])))
            Genes[l] = np.array([['{0}'.format(str(g).zfill(ngenes_digits))] for g in range(X[l].shape[0])])
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(['X{0}'.format(str(r).zfill(nreps_digits)) for r in range(X[l].shape[1])])

    ndatasets_digits = int(math.ceil(math.log10(L)))
    datafiles = np.array(['D{0}'.format(str(r).zfill(ndatasets_digits)) for r in range(L)])
    datafiles_noext = datafiles

    # Sort out conditions based on replicates structure if given
    if replicatesIDs is None:
        conditions = replicates
    else:
        valresult = val.validate_replicatesIDs(replicatesIDs, X)
        if valresult[0]:
            conditions = [None] * L
            for l in range(L):
                if replicatesIDs[l] is None:
                    conditions[l] = np.array(replicates[l])
                else:
                    uniq_reps, cond_indices = np.unique(replicatesIDs[l], return_index=True)
                    if -1 in uniq_reps:
                        cond_indices = cond_indices[1:]
                    conditions[l] = replicates[l][cond_indices]
        else:
            io.log(valresult[1])
            io.log("Terminating ...")
            raise ValueError("Terminated by an invalid input argument.")

    # Validate normalisation
    valresult = val.validate_normalisation(normalise, X)
    if not valresult[0]:
        io.log(valresult[1])
        io.log("Terminating ...")
        raise ValueError("Terminated by an invalid input argument.")


    # Preprocessing (mapping, then top-level preprocessing: summarising replicates,
    # filtering low-expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets, absvalue=absvalue,
                      filteringtype=filteringtype, filterflat=filflat, params=None, datafiles=datafiles)
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize, Xnames=datafiles_noext,
                             ncores=ncores)

    # Post-processing
    ppmethod = 'tukey_sqrtSCG'
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_corrected = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds, smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_corrected = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_corrected, X_summarised_normalised, GDM)
            except Exception:
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
                B_corrected = mnres.B
        else:
            B_corrected = mnres.B
    else:
        io.log('5. Skipping cluster optimisation and completion')
        B_corrected = mnres.B


    # Output: Preparing output parameters as DataFrames
    if Map is None:
        Bout = op.clusters_B_as_dataframes(B_corrected, OGs, None)
    else:
        Bout, B_species = op.clusters_B_as_dataframes(B_corrected, OGs, MapNew)
    Xout = op.processed_X_as_dataframes(X_summarised_normalised, OGs, conditions)
    if input_is_one_dataset:
        Xout = Xout[0]

    # Output: Plot figures
    if showPlots:
        try:
            if np.shape(B_corrected)[1] > 0:
                graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions, GDM=GDM,
                                   Cs='all', setPageToDefault=True, showPlots=showPlots, printToPDF=False)
        except Exception:
            io.log("Error: could not generate the clusters' plots. Resuming the rest of the steps ...")

    # Output: Prepare the output summary message and print it
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                      ures, mnres, B_corrected, starttime)
    io.log(summarymsg, addextrastick=False)

    io.deletetmpfile()

    return Bout, Xout, GDM
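
A minimal sketch of the in-memory entry point above; unlike clustpipeline it returns its results instead of writing files. The import path is an assumption and the toy DataFrame only illustrates the expected input shape.

import numpy as np
import pandas as pd

# Hypothetical import path; adjust to your installation of clust.
from runclust import runclust

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(300, 6)),
                  index=['g{0}'.format(i) for i in range(300)],
                  columns=['s{0}'.format(j) for j in range(6)])
B, Xprocessed, GDM = runclust(df, showPlots=False, printToConsole=False)
# B: cluster memberships as DataFrame(s); Xprocessed: the pre-processed data
# (a single DataFrame here, because the input was a single DataFrame);
# GDM: the boolean gene-dataset presence matrix.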
Example #4
def preprocess(X, GDM, normalise=1000, replicatesIDs=None, flipSamples=None, expressionValueThreshold=10.0,
               replacementVal=0.0, atleastinconditions=1, atleastindatasets=1, absvalue=False, usereplacementval=False,
               filteringtype='raw', filterflat=True, params=None, datafiles=None):
    # Fixing parameters
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)
    if datafiles is None:
        if L == 1:
            datafiles = ['X']
        else:
            datafiles = np.array([], dtype=str)
            for i in range(L):
                datafiles = np.append(datafiles, 'X{0}'.format(i+1))
    if params is None:
        params = {}
    if replicatesIDs is None:
        replicatesIDsloc = [np.arange(x.shape[1]) for x in Xloc]
    else:
        replicatesIDsloc = ds.listofarrays2arrayofarrays(replicatesIDs)
        replicatesIDsloc = [np.array(x) for x in replicatesIDsloc]
    if flipSamples is None:
        flipSamplesloc = None
    else:
        flipSamplesloc = ds.listofarrays2arrayofarrays(flipSamples)
        flipSamplesloc = [np.array(x) for x in flipSamplesloc]
    # Normalise the `normalise` argument into one list of codes per dataset
    if not isinstance(normalise, (list, tuple, np.ndarray)):
        # A single scalar code applies to all L datasets
        normaliseloc = [[normalise] for i in range(L)]
        normaliseloc = ds.listofarrays2arrayofarrays(normaliseloc)
    else:
        normaliseloc = [nor if isinstance(nor, (list, tuple, np.ndarray)) else [nor] for nor in normalise]
        normaliseloc = ds.listofarrays2arrayofarrays(normaliseloc)

    # Get rid of nans by fixing
    Xproc = Xloc
    for l in range(L):
        Xproc[l] = fixnans(Xproc[l])

    # Prepare applied_norm dictionary before any normalisation takes place
    applied_norm = collec.OrderedDict(zip(datafiles, deepcopy(normaliseloc)))

    # Tell the user if any automatic normalisation is taking place
    allare1000 = True
    anyis1000 = False
    for l in range(L):
        if 1000 in normaliseloc[l]:
            anyis1000 = True
        else:
            allare1000 = False
    if allare1000:
        io.log(' - Automatic normalisation mode (default in v1.7.0+).')
        io.log('   Clust automatically normalises your dataset(s).')
        io.log('   To switch it off, use the `-n 0` option (not recommended).')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')
    elif anyis1000:
        io.log(' - Some datasets are not assigned normalisation codes in the provided')
        io.log('   normalisation file. Clust automatically identifies and applies the')
        io.log('   most suitable normalisation to them (default in v1.7.0+).')
        io.log("   If you don't want clust to normalise them, assign each of them a")
        io.log('   normalisation code of 0 in the normalisation file.')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')

    # Quantile normalisation
    for l in range(L):
        if 101 in normaliseloc[l] or 1000 in normaliseloc[l]:
            Xproc[l] = normaliseSampleFeatureMat(Xproc[l], 101)[0]
            if 101 in normaliseloc[l]:
                i = np.argwhere(np.array(normaliseloc[l]) == 101)
                i = i[0][0]
                normaliseloc[l][i] = 0

    # Combine replicates and sort out flipped samples
    Xproc = combineReplicates(Xproc, replicatesIDsloc, flipSamplesloc)

    # Filter genes not exceeding the threshold
    (Xproc, GDMnew, Iincluded) = filterlowgenes(Xproc, GDM, expressionValueThreshold, replacementVal,
                                                atleastinconditions, atleastindatasets, absvalue,
                                                usereplacementval, filteringtype)

    # Normalise
    for l in range(L):
        (Xproc[l], codes) = normaliseSampleFeatureMat(Xproc[l], normaliseloc[l])
        if np.all(codes == normaliseloc[l]):
            applied_norm[datafiles[l]] = op.arraytostring(applied_norm[datafiles[l]], delim=' ', openbrac='',
                                                          closebrac='')
        else:
            applied_norm[datafiles[l]] = op.arraytostring(codes, delim=' ', openbrac='', closebrac='')

    if filterflat:
        io.log(' - Flat expression profiles filtered out (default in v1.7.0+).')
        io.log('   To switch it off, use the --no-fil-flat option (not recommended).')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')
        (Xproc, GDMnew, Iincluded) = filterFlat(Xproc, GDMnew, Iincluded)

    # Prepare params for the output
    params = dict(params, **{
        'normalise': normaliseloc,
        'replicatesIDs': replicatesIDs,
        'flipSamples': flipSamplesloc,
        'L': L
    })

    return Xproc, GDMnew, Iincluded, params, applied_norm
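
A minimal sketch of running preprocess on its own, assuming it is importable (in the pipelines above it is called as pp.preprocess). GDM marks which genes are present in which dataset; for a single dataset it is an all-True column. The normalisation code 1000 requests the automatic mode described in the log messages above.

import numpy as np

# Hypothetical import path; adjust to your module layout.
from preprocess import preprocess

rng = np.random.default_rng(0)
X = [np.abs(rng.normal(loc=50.0, scale=20.0, size=(100, 4)))]  # one toy dataset
GDM = np.ones((100, 1), dtype=bool)  # every gene present in the only dataset
Xproc, GDMnew, Iincluded, params, applied = preprocess(X, GDM, normalise=1000)
# Xproc: the processed dataset(s); Iincluded: boolean mask of the genes kept
# after filtering; applied: the normalisation codes actually applied per dataset.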