Ejemplo n.º 1
0
 def swap_log(swap, error=True):
     """Log every non-empty line of ``swap``.

     ``swap`` is split on newline characters; empty lines are dropped. Each
     remaining line is passed to ``print_error`` when ``error`` is true and
     to ``print_info`` otherwise. Returns None.
     """
     kept = [line for line in swap.split('\n') if line != '']
     for line in kept:
         if error:
             print_error(line)
         else:
             print_info(line)
     return
Ejemplo n.º 2
0
    def __init__(
        self,
        dfx,
        metric='correlation',
        info_distance=None,
    ):
        """
        Set up the feature-point distance vector and per-feature statistics.

        parameters
        -----------------
        dfx: pandas DataFrame; its columns are used as the feature list
        metric: {'cosine', 'correlation', 'euclidean', 'jaccard', 'hamming', 'dice'}, default: 'correlation', measurement of feature distance
        info_distance: optional precomputed vector-form distance vector of the feature points, shape should be: (n*(n-1)/2), where n is the number of the features. When None or empty, the distance is calculated from dfx.

        attributes set
        -----------------
        info_distance: vector-form distance between the features
        info_scale: DataFrame of the per-feature statistics, indexed by feature name
        """

        assert isinstance(
            dfx, pd.DataFrame), 'input dfx mush be pandas DataFrame!'
        super().__init__()

        self.metric = metric
        self.isfit = False
        self.alist = dfx.columns.tolist()
        self.ftype = 'feature points'
        self.cluster_flag = False
        n = dfx.shape[1]
        # exact integer length of the vector-form distance for n features
        info_distance_length = n * (n - 1) // 2

        ## calculating distance
        # Decide on presence, not on the values: a truthiness test such as
        # ``np.array(info_distance).any()`` would discard a legitimate
        # all-zero distance vector and silently recompute it.
        has_precomputed = info_distance is not None and len(info_distance) > 0
        if has_precomputed:
            assert len(
                info_distance
            ) == info_distance_length, 'shape of info_distance must be (%s,)' % info_distance_length
            print_info('skip to calculate the distance')
            self.info_distance = np.array(info_distance)

        else:
            print_info('Calculating distance ...')
            # NOTE(review): n_cpus is hard-coded to 16 here.
            # presumably returns a square feature-by-feature matrix, since it
            # is passed to squareform below -- confirm against calculator
            D = calculator.pairwise_distance(dfx.values,
                                             n_cpus=16,
                                             method=metric)
            # NaN entries are replaced in place before converting the matrix
            D = np.nan_to_num(D, copy=False)
            D_ = squareform(D)
            # negative distances are clamped to 0
            self.info_distance = D_.clip(0, np.inf)

        ## statistic info
        S = summary.Summary(n_jobs=10)
        res = []
        for i in tqdm(range(dfx.shape[1]), ascii=True):
            r = S._statistics_one(dfx.values, i)
            res.append(r)
        dfs = pd.DataFrame(res, index=self.alist)
        self.info_scale = dfs
Ejemplo n.º 3
0
def load_config(ftype='descriptor', metric='cosine'):
    '''
    Load the pickled config table for a feature type and metric.

    input:
        ftype: str, first part of the config file name
        metric: str, second part of the config file name
    output:
        the object stored in '<ftype>_<metric>.cfg.gzip' (read with
        pd.read_pickle, gzip compressed)

    The file is looked up next to this module. When it is missing it is
    downloaded, first from the URL registered in ``googleids`` and, if that
    fails, from the URL registered in ``biddids``.

    raises:
        FileNotFoundError: when the fallback download produces no file

    NOTE(review): unpickling can execute arbitrary code, so the downloaded
    file must come from a trusted source.
    '''
    name = '%s_%s.cfg.gzip' % (ftype, metric)
    filename = os.path.join(os.path.dirname(__file__), name)

    if not os.path.exists(filename):
        try:
            print('try to down it from Google drive ...')
            url = googleids.get(name)
            print_info('downloading config file from google drive: %s' % url)
            downloaded = gdown.download(url, filename, quiet=False)
            # some gdown versions presumably report failure by returning None
            # instead of raising; treat that as a failure so the fallback runs
            if downloaded is None:
                raise IOError('download from Google drive returned no file')
            print_info('finished...')

        except Exception:
            # Exception (not a bare except) so that Ctrl-C / SystemExit stop
            # the program instead of starting a second download.
            print(
                'Max retries exceeded for Google Drive, will try to down it from bidd.group...'
            )
            url = biddids.get(name)
            print_info('downloading config file from bidd website: %s' % url)
            downloaded = gdown.download(url, filename, quiet=False)
            if downloaded is None:
                raise FileNotFoundError(
                    'unable to download config file: %s' % name)
            print_info('finished...')

        filename = downloaded

    return pd.read_pickle(filename, compression='gzip')
Ejemplo n.º 4
0
def MultiProcessUnorderedBarRun(func, deal_list, n_cpus=None):
    '''
    Apply func to every element of deal_list in a process pool, with a
    progress bar.

    input:
        func: function to do with each element in the deal_list
        deal_list: list to be done (len() is used for the progress total)
        n_cpus: number of processes, None means cpu_count()
    output:
        list of the return values, in completion order (NOT input order)
    '''
    if n_cpus is None:
        N_CPUS = cpu_count()
    else:
        N_CPUS = int(n_cpus)
    print_info('the number of process is %s' % N_CPUS)

    # assumes Pool follows the multiprocessing.Pool API (close/terminate/join)
    p = Pool(N_CPUS)
    res_list = []
    try:
        with pbar(total=len(deal_list), ascii=True) as pb:
            for res in p.imap_unordered(func, deal_list):
                pb.update(1)
                res_list.append(res)
    except BaseException:
        # do not leave worker processes behind when a task or the bar fails
        p.terminate()
        raise
    else:
        p.close()
    finally:
        p.join()
    return res_list
Ejemplo n.º 5
0
def RunCmd(cmd):
    '''
    Run a shell command and log its output line by line.

    input:
        cmd: str, passed to the shell as-is (shell=True), so it must not be
             built from untrusted text
    output:
        status: int, 0 for success
        stdout: str
        stderr: str

    On a non-zero status both stdout and stderr are logged with print_error;
    on success only stdout is logged, with print_info.
    '''
    print_info('run command : %s' % cmd)

    def _emit(text, error=True):
        # log each non-empty line of text with the matching logger
        for line in text.split('\n'):
            if line == '':
                continue
            if error:
                print_error(line)
            else:
                print_info(line)

    proc = subprocess.run(cmd,
                          shell=True,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          universal_newlines=True)
    failed = proc.returncode != 0

    if proc.stdout:
        _emit(proc.stdout, error=failed)
    if failed and proc.stderr:
        _emit(proc.stderr, error=True)

    return proc.returncode, proc.stdout, proc.stderr
Ejemplo n.º 6
0
def MultiExecutorRun(func, deal_list, n_cpus=4, tqdm_args={'unit': 'one'}):
    '''
    input:
        func: function to do with each element in the deal_list
        deal_list: list (or any iterable) to be done
        n_cpus: use the number of cpus
        tqdm_args: args for tqdm, merged over the defaults below
                   (the default dict is only read, never modified)
    output:
        list of the return value for each func, in the order of deal_list
    '''
    # materialize once: deal_list may be a generator, which has no len() and
    # can only be consumed a single time
    lst = list(deal_list)
    series = pd.Series(lst)

    futures = _executor(func, series, n_cpus=n_cpus)
    args = {
        'total': len(lst),
        'unit': 'one',
        'ascii': True,
        'unit_scale': True,
        'leave': True
    }
    args.update(tqdm_args)

    print_info(args)

    results = []
    indexs = []
    for f in tqdm(as_completed(futures), **args):
        # each future yields an (index, result) pair
        idx, result = f.result()
        indexs.append(idx)
        results.append(result)

    res = pd.Series(results, index=indexs)
    # futures complete in arbitrary order; sort by index to restore it
    ordered_lst = res.sort_index().tolist()
    return ordered_lst
Ejemplo n.º 7
0
    def __init__(self, dfx, metric='correlation'):
        """
        Compute the feature distance vector and the per-feature statistics.

        parameters
        -----------------
        dfx: pandas DataFrame; its columns are used as the feature list
        metric: {'cosine', 'correlation', 'euclidean', 'jaccard', 'hamming', 'dice'}, default: 'correlation', measurement of feature distance

        """
        assert type(dfx) == pd.core.frame.DataFrame, 'input dfx mush be pandas DataFrame!'
        super().__init__()

        self.metric = metric
        self.isfit = False
        self.alist = dfx.columns.tolist()
        self.ftype = 'feature points'
        self.cluster_flag = False

        # distance between features: NaN replaced in place, converted with
        # squareform, negative values clamped to 0
        print_info('Calculating distance ...')
        dist = calculator.pairwise_distance(dfx.values, n_cpus=16, method=metric)
        dist = np.nan_to_num(dist, copy=False)
        self.info_distance = squareform(dist).clip(0, np.inf)

        # one statistics record per column, indexed by feature name
        stats = summary.Summary(n_jobs=10)
        rows = [
            stats._statistics_one(dfx.values, col)
            for col in tqdm(range(dfx.shape[1]), ascii=True)
        ]
        self.info_scale = pd.DataFrame(rows, index=self.alist)
Ejemplo n.º 8
0
def ImapUnorder(processor,
                iterator,
                max_workers=10,
                fail_in_file='./filed.lst'):
    '''
    Generator: run processor on every element of iterator in a process pool
    and yield the results once all submitted tasks have finished.

    processor: fuction
    iterator: list or iterator,each element should be a tuple or dict, so that data can be used as ordered
    max_workers: number of worker processes
    fail_in_file: path of the file (truncated on start) that receives one
                  line per input that failed both in the pool and on retry

    An input whose pool task raised is retried once in the current process;
    if the retry also raises, the input is written to fail_in_file and
    nothing is yielded for it.
    '''
    with ProcessPoolExecutor(max_workers=max_workers) as executor:

        with open(fail_in_file, 'w+') as f:
            futures = {
                executor.submit(processor, IdPlusSmile): IdPlusSmile
                for IdPlusSmile in iterator
            }
            success, _ = wait(futures)
            with pbar(total=len(futures)) as pb:
                for i in success:
                    IdPlusSmile = futures[i]
                    print_info('deal ' + str(IdPlusSmile))
                    produced = True
                    try:
                        data_dict = i.result()
                    except Exception as exc:
                        print_warn(
                            'because of the process is dead, input: %s is fialed when deal with %s: %s, so we will deal it automatically'
                            % (IdPlusSmile, processor, exc))

                        try:
                            data_dict = processor(IdPlusSmile)
                        except Exception:
                            produced = False
                            f.write(str(IdPlusSmile) + '\n')
                            print_error(
                                ' input: %s is fialed when deal with %s: %s' %
                                (IdPlusSmile, processor, exc))
                    # Yield outside of every try/except: GeneratorExit (raised
                    # at the yield when the consumer closes the generator) and
                    # exceptions thrown in by the consumer must not be
                    # mistaken for a processing failure.
                    if produced:
                        yield data_dict
                    pb.update(1)
Ejemplo n.º 9
0
def MultiProcessRun(func, deal_list, n_cpus=None):
    '''
    input:
        func: function to do with each element in the deal_list
        deal_list: list to be done
        n_cpus: use the number of cpus, None means cpu_count()
    output:
        list of the return result for each func, in the order of deal_list
    '''
    if n_cpus is None:
        N_CPUS = cpu_count()
    else:
        N_CPUS = int(n_cpus)

    print_info('the number of process is %s' % N_CPUS)

    # assumes Pool follows the multiprocessing.Pool API (close/terminate/join)
    pool = Pool(N_CPUS)
    try:
        a = pool.map(func, deal_list)
    except BaseException:
        # do not leave worker processes behind when a task fails
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
    return a
Ejemplo n.º 10
0
    def fit(self,
            method='umap', min_dist=0.1, n_neighbors=30,
            verbose=2, random_state=1, **kwargs):
        """
        Embed the features into 2D, then fit the feature map on the result.

        parameters
        -----------------
        method: {'tsne', 'umap', 'mds'}, algorithm to embed high-D to 2D
        min_dist, n_neighbors, verbose, random_state: forwarded to the embedding
        kwargs: the extra parameters for the corresponding method; a given
                'n_components' is dropped because the embedding is always 2D
        """
        kwargs.pop('n_components', None)

        assert method in ['tsne', 'umap', 'mds'], 'no support such method!'
        self.method = method

        # 2D embedding first
        self._fit_embedding(method=method,
                            n_neighbors=n_neighbors,
                            random_state=random_state,
                            min_dist=min_dist,
                            verbose=verbose,
                            n_components=2, **kwargs)

        # both feature-map types are fitted by the same call; only the
        # progress message differs
        if self.fmap_type == 'scatter':
            start_msg = 'Applying naive scatter feature map...'
        else:
            start_msg = 'Applying grid feature map(assignment), this may take several minutes(1~30 min)'
        print_info(start_msg)
        self._S.fit(self.df_embedding, self.split_channels, channel_col='Channels')
        print_info('Finished')

        # fit flag
        self.isfit = True
        self.fmap_shape = self._S.fmap_shape
Ejemplo n.º 11
0
def plot_grid(molmap, htmlpath='./', htmlname=None):
    '''
    Draw the grid assignment of the features as a Highcharts heatmap.

    molmap: the object of molmap
    htmlpath: the figure path, created when it does not exist
    htmlname: optional prefix for the generated file name

    returns (df, H): the table of grid cells (x, y, feature, subtype, color)
    and the Highchart object that was saved
    '''
    if not os.path.exists(htmlpath):
        os.makedirs(htmlpath)

    title = 'Assignment of %s by %s emmbedding result' % (molmap.ftype,
                                                          molmap.method)
    subtitle = 'number of %s: %s, metric method: %s' % (
        molmap.ftype, len(molmap.flist), molmap.metric)
    name = '%s_%s_%s_%s_%s' % (molmap.ftype, len(
        molmap.flist), molmap.metric, molmap.method, 'molmap')
    if htmlname:
        name = htmlname + '_' + name

    filename = os.path.join(htmlpath, name)
    print_info('generate file: %s' % filename)

    # put every feature into its assigned cell; cells left unassigned keep 0
    m, n = molmap.fmap_shape
    colormaps = molmap.extract.colormaps
    flat = np.zeros(molmap.fmap_shape, dtype='O').reshape(m * n, )
    flat[molmap._S.col_asses] = molmap.flist
    grid = flat.reshape(m, n)

    # column-major walk over the grid: x is the column, y the row
    x = [col for col in range(n) for _ in range(m)]
    y = [row for _ in range(n) for row in range(m)]
    v = grid.reshape(m * n, order='f')

    df = pd.DataFrame(list(zip(x, y, v)), columns=['x', 'y', 'v'])
    bitsinfo = molmap.extract.bitsinfo
    subtypedict = bitsinfo.set_index('IDs')['Subtypes'].to_dict()
    subtypedict[0] = 'NaN'  # the 0 placeholder of unassigned cells
    df['Subtypes'] = df.v.map(subtypedict)
    df['colors'] = df['Subtypes'].map(colormaps)

    x_axis = {
        'title': None,
        'min': 0,
        'max': molmap.fmap_shape[1],
        'startOnTick': False,
        'endOnTick': False,
        'allowDecimals': False,
        'labels': {'style': {'fontSize': 20}},
    }
    y_axis = {
        'title': {'text': ' ', 'style': {'fontSize': 20}},
        'startOnTick': False,
        'endOnTick': False,
        'gridLineWidth': 0,
        'reversed': True,
        'min': 0,
        'max': molmap.fmap_shape[0],
        'allowDecimals': False,
        'labels': {'style': {'fontSize': 20}},
    }
    legend = {
        'align': 'right',
        'layout': 'vertical',
        'margin': 1,
        'verticalAlign': 'top',
        'y': 60,
        'symbolHeight': 12,
        'floating': False,
    }
    tooltip = {
        'headerFormat': '<b>{series.name}</b><br>',
        'pointFormat': '{point.v}',
    }

    H = Highchart(width=1000, height=850)
    H.set_options('chart', {'type': 'heatmap', 'zoomType': 'xy'})
    H.set_options('title', {'text': title})
    H.set_options('subtitle', {'text': subtitle})
    H.set_options('xAxis', x_axis)
    H.set_options('yAxis', y_axis)
    H.set_options('legend', legend)
    H.set_options('tooltip', tooltip)
    H.set_options('plotOptions', {'series': {'turboThreshold': 5000}})

    # one heatmap series per subtype that actually occurs on the grid
    for subtype, color in colormaps.items():
        dfi = df[df['Subtypes'] == subtype]
        if len(dfi) > 0:
            H.add_data_set(
                dfi.to_dict('records'),
                'heatmap',
                name=subtype,
                color=color,
            )
    H.save_file(filename)
    print_info('save html file to %s' % filename)

    return df, H
Ejemplo n.º 12
0
def plot_scatter(molmap, htmlpath='./', htmlname=None, radius=3):
    '''
    Draw the 2D embedding of the features as a Highcharts scatter plot.

    molmap: the object of molmap
    htmlpath: the figure path, not include the prefix of 'html'; created
              when it does not exist
    htmlname: optional prefix for the generated file name
    radius: int, default: 3, the radius of scatter dot

    returns (df, H): the table of embedded points joined with their feature
    info and colors, and the Highchart object that was saved
    '''
    title = '2D emmbedding of %s based on %s method' % (molmap.ftype,
                                                        molmap.method)
    subtitle = 'number of %s: %s, metric method: %s' % (
        molmap.ftype, len(molmap.flist), molmap.metric)
    name = '%s_%s_%s_%s_%s' % (molmap.ftype, len(
        molmap.flist), molmap.metric, molmap.method, 'scatter')

    if not os.path.exists(htmlpath):
        os.makedirs(htmlpath)
    if htmlname:
        name = htmlname + '_' + name

    filename = os.path.join(htmlpath, name)
    print_info('generate file: %s' % filename)

    xy = molmap.embedded.embedding_
    colormaps = molmap.extract.colormaps

    # embedding coordinates joined row by row with the info of molmap.flist
    df = pd.DataFrame(xy, columns=['x', 'y'])
    bitsinfo = molmap.extract.bitsinfo.set_index('IDs')
    df = df.join(bitsinfo.loc[molmap.flist].reset_index())
    df['colors'] = df['Subtypes'].map(colormaps)

    x_axis = {
        'title': {'enabled': True, 'text': 'X', 'style': {'fontSize': 20}},
        'labels': {'style': {'fontSize': 20}},
        'gridLineWidth': 1,
        'startOnTick': True,
        'endOnTick': True,
        'showLastLabel': True,
    }
    y_axis = {
        'title': {'text': 'Y', 'style': {'fontSize': 20}},
        'labels': {'style': {'fontSize': 20}},
        'gridLineWidth': 1,
    }
    legend = {
        'align': 'right',
        'layout': 'vertical',
        'margin': 1,
        'verticalAlign': 'top',
        'y': 40,
        'symbolHeight': 12,
        'floating': False,
    }
    plot_options = {
        'scatter': {
            'marker': {
                'radius': radius,
                'states': {
                    'hover': {
                        'enabled': True,
                        'lineColor': 'rgb(100,100,100)'
                    }
                }
            },
            'states': {'hover': {'marker': {'enabled': False}}},
            'tooltip': {
                'headerFormat': '<b>{series.name}</b><br>',
                'pointFormat': '{point.IDs}'
            }
        },
        'series': {'turboThreshold': 5000},
    }

    H = Highchart(width=1000, height=850)
    H.set_options('chart', {'type': 'scatter', 'zoomType': 'xy'})
    H.set_options('title', {'text': title})
    H.set_options('subtitle', {'text': subtitle})
    H.set_options('xAxis', x_axis)
    H.set_options('yAxis', y_axis)
    H.set_options('legend', legend)
    H.set_options('plotOptions', plot_options)

    # one scatter series per subtype that actually occurs in the data
    for subtype, color in colormaps.items():
        dfi = df[df['Subtypes'] == subtype]
        if len(dfi) > 0:
            H.add_data_set(dfi.to_dict('records'), 'scatter', subtype,
                           color=color)
    H.save_file(filename)
    print_info('save html file to %s' % filename)
    return df, H
Ejemplo n.º 13
0
    def fit(self,
            feature_group_list = [],
            cluster_channels = 3,
            var_thr = -1,
            split_channels = True,
            fmap_type = 'grid',
            fmap_shape = None,
            emb_method = 'umap',
            min_dist = 0.1,
            n_neighbors = 15,
            verbose = 2,
            random_state = 32,
            group_color_dict  = {},
            lnk_method = 'complete',
            **kwargs):
        """
        Group the features into channels, embed them into 2D and fit the
        feature map (grid assignment or plain scatter).

        parameters
        -----------------
        feature_group_list: list of the group name for each feature point
        cluster_channels: int, number of the channels(clusters) if feature_group_list is empty
        var_thr: float, default is -1, meaning that feature will be included only if the corresponding variance larger than this value. Since some of the feature has pretty low variances, we can remove them by increasing this threshold
        split_channels: bool, if True, outputs will split into various channels using the types of feature
        fmap_type:{'scatter', 'grid'}, default: 'grid', if 'scatter', will return a scatter mol map without an assignment to a grid
        fmap_shape: None or tuple, size of molmap, only works when fmap_type is 'scatter', if None, the size of feature map will be calculated automatically
        emb_method: {'tsne', 'umap', 'mds'}, algorithm to embed high-D to 2D
        group_color_dict: dict of the group colors, keys are the group names, values are the colors
        lnk_method: {'complete', 'average', 'single', 'weighted', 'centroid'}, linkage method
        kwargs: the extra parameters for the corresponding embedding method

        returns
        -----------------
        self

        notes
        -----------------
        The list/dict defaults in the signature are never modified in this
        method; they are only read or rebound.
        """

        # the embedding is always 2D, a user supplied value is dropped
        kwargs.pop('n_components', None)

        ## embedding  into a 2d
        assert emb_method in ['tsne', 'umap', 'mds'], 'No Such Method Supported: %s' % emb_method
        assert fmap_type in ['scatter', 'grid'], 'No Such Feature Map Type Supported: %s'   % fmap_type
        self.var_thr = var_thr
        self.split_channels = split_channels
        self.fmap_type = fmap_type
        self.fmap_shape = fmap_shape
        self.emb_method = emb_method
        self.lnk_method = lnk_method
        if fmap_shape is not None:
            assert len(fmap_shape) == 2, "fmap_shape must be a tuple with two elements!"

        # flist and distance: keep only the features above the variance threshold
        flist = self.info_scale[self.info_scale['var'] > self.var_thr].index.tolist()

        dfd = pd.DataFrame(squareform(self.info_distance),
                           index=self.alist,
                           columns=self.alist)
        dist_matrix = dfd.loc[flist][flist]
        self.flist = flist

        self.x_mean = self.info_scale['mean'].values
        self.x_std =  self.info_scale['std'].values

        self.x_min = self.info_scale['min'].values
        self.x_max = self.info_scale['max'].values

        #bitsinfo
        dfb = pd.DataFrame(self.alist, columns = ['IDs'])
        # length based test so that tuples / arrays work as well as lists
        has_groups = feature_group_list is not None and len(feature_group_list) > 0
        if has_groups:

            self.cluster_flag = False

            assert len(feature_group_list) == len(self.alist), "the length of the input group list is not equal to length of the feature list"
            self.cluster_channels = len(set(feature_group_list))
            self.feature_group_list = feature_group_list

            dfb['Subtypes'] = feature_group_list

            if set(feature_group_list).issubset(set(group_color_dict.keys())):
                self.group_color_dict = group_color_dict
                dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
            else:
                # some group has no color: generate a palette for all groups
                unique_types = dfb['Subtypes'].unique()
                color_list = sns.color_palette("hsv", len(unique_types)).as_hex()
                group_color_dict = dict(zip(unique_types, color_list))
                dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
                self.group_color_dict = group_color_dict
        else:

            self.cluster_channels = cluster_channels
            print_info('applying hierarchical clustering to obtain group information ...')
            self.cluster_flag = True

            # the clustering uses all features, not only the ones in flist
            Z = linkage(squareform(dfd.values),  lnk_method)
            labels = fcluster(Z, cluster_channels, criterion='maxclust')

            feature_group_list = ['cluster_%s' % str(i).zfill(2) for i in labels]
            dfb['Subtypes'] = feature_group_list
            dfb = dfb.sort_values('Subtypes')
            unique_types = dfb['Subtypes'].unique()

            if not set(unique_types).issubset(set(group_color_dict.keys())):
                color_list = sns.color_palette("hsv", len(unique_types)).as_hex()
                group_color_dict = dict(zip(unique_types, color_list))

            dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
            self.group_color_dict = group_color_dict
            self.Z = Z
            self.feature_group_list = feature_group_list

        self.bitsinfo = dfb
        colormaps = dfb.set_index('Subtypes')['colors'].to_dict()
        colormaps.update({'NaN': '#000000'})
        self.colormaps = colormaps

        if fmap_type == 'grid':
            S = Scatter2Grid()
        else:
            if fmap_shape is None:
                N = len(self.flist)
                # builtin int: the np.int alias was removed in NumPy 1.24
                l = int(np.sqrt(N)) * 2
                fmap_shape = (l, l)
            S = Scatter2Array(fmap_shape)

        self._S = S

        ## 2d embedding first
        embedded = self._fit_embedding(dist_matrix,
                                       method = emb_method,
                                       n_neighbors = n_neighbors,
                                       random_state = random_state,
                                       min_dist = min_dist,
                                       verbose = verbose,
                                       n_components = 2, **kwargs)

        self.embedded = embedded

        df = pd.DataFrame(embedded.embedding_, index = self.flist,columns=['x', 'y'])
        typemap = self.bitsinfo.set_index('IDs')
        df = df.join(typemap)
        df['Channels'] = df['Subtypes']
        self.df_embedding = df

        # both feature-map types are fitted by the same call; only the
        # progress message differs
        if self.fmap_type == 'scatter':
            ## naive scatter algorithm
            print_info('Applying naive scatter feature map...')
        else:
            ## linear assignment algorithm
            print_info('Applying grid feature map(assignment), this may take several minutes(1~30 min)')
        self._S.fit(self.df_embedding, self.split_channels, channel_col = 'Channels')
        print_info('Finished')

        ## fit flag
        self.isfit = True
        if self.fmap_shape is None:
            self.fmap_shape = self._S.fmap_shape

        else:
            # a user supplied shape must be able to hold the fitted map
            m, n = self.fmap_shape
            p, q = self._S.fmap_shape
            assert (m >= p) and (n >= q), "fmap_shape's width must >= %s, height >= %s " % (p, q)

        return self