    def test_syntheticData(self):
        data = {}
        with open('../exampleFiles/syntheticDataset.csv', 'r') as input_file:
            for line in csv.DictReader(input_file):
                gene = line['Gene']
                if gene not in data:
                    data[gene] = {}
                for n in range(8):
                    data[gene][str(n)] = (
                        float(line['{0}_mean'.format(n)]),
                        float(line['{0}_std'.format(n)])
                    )
        if not os.path.exists( WORKING_DIRECTORY ):
            os.mkdir( WORKING_DIRECTORY )
        else:
            print('''
[ INFO ] Result folder {0} already exists.
[ INFO ] Old results might not be overwritten, which can lead to confusion ;)
[ INFO ] Check the file creation date!'''.format( WORKING_DIRECTORY )
            )
        # if os.path.exists( os.path.join( WORKING_DIRECTORY, PICKLE_FILENAME ) ):
        #     _ = input('Pickle already')
        #     os.remove( )
        print(
            '[ INFO ] The results of the example script are saved into folder {0}.'.format(
                WORKING_DIRECTORY
            )
        )
        TestCluster = pyGCluster.Cluster(
            data = data,
            working_directory = WORKING_DIRECTORY,
            verbosity_level = 2
        )
        distance_metrices = [ 'euclidean' ]
        linkage_methods = [ 'complete' ]
        # cpus_2_use = 1
        # if multiprocessing.cpu_count() < cpus_2_use:
        #     cpus_2_use = multiprocessing.cpu_count()
        # print()
        TestCluster.resample(
            distances = distance_metrices,
            linkages = linkage_methods,
            iter_max = 5000,
            pickle_filename = PICKLE_FILENAME,
            # cpus_2_use = cpus_2_use,
            iter_till_the_end = True
        )
        mostfreq = TestCluster._get_most_frequent_clusters( top_X_clusters = 1 )
        realIdsSet = set()
        for cluster in mostfreq:
            for index in cluster:
                try:
                    realIdsSet.add( TestCluster['Identifiers'][ index ] )
                except (IndexError, KeyError):
                    # print debugging information, then fail on the same lookup
                    print( TestCluster['Identifiers'] )
                    print( cluster )
                    print( index )
                    realIdsSet.add( TestCluster['Identifiers'][ index ] )
        self.assertEqual( realIdsSet, self.testSet )
        TestCluster.build_nodemap(
            min_cluster_size = 4,
            threshold_4_the_lowest_max_freq = 0.01
        )
        TestCluster.draw_community_expression_maps(
            min_value_4_expression_map = -40,
            max_value_4_expression_map = 40,
            color_gradient = 'Spectral'
        )
        TestCluster.draw_expression_profiles(
            min_value_4_expression_map = -40,
            max_value_4_expression_map = 40
        )
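
# A minimal sketch of a syntheticDataset.csv that the parser in
# test_syntheticData() above would accept; only the column names ('Gene',
# '{n}_mean' and '{n}_std' for n in 0..7) are taken from the test, the gene
# name and the values below are made-up:
def write_synthetic_csv_sketch(filename='syntheticDataset_sketch.csv'):
    import csv  # repeated here so the sketch stands alone
    fieldnames = ['Gene']
    for n in range(8):
        fieldnames += ['{0}_mean'.format(n), '{0}_std'.format(n)]
    with open(filename, 'w', newline='') as csv_out:
        writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
        writer.writeheader()
        row = {'Gene': 'exampleGene1'}
        for n in range(8):
            row['{0}_mean'.format(n)] = 1.0  # hypothetical mean
            row['{0}_std'.format(n)] = 0.1   # hypothetical standard deviation
        writer.writerow(row)
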
def main():
    threshold_4_the_lowest_max_freq = 0.005
    top_X_clusters = None
    for n in sys.argv[1:]:
        if "threshold_4_the_lowest_max_freq" in n:
            threshold_4_the_lowest_max_freq = float(n.split("=")[1])
        elif "top_X_clusters" in n:
            top_X_clusters = int(n.split("=")[1])
            threshold_4_the_lowest_max_freq = 0.0
    cluster = pyGCluster.Cluster()
    cluster.load(sys.argv[1])
    cluster['Working directory'] = os.path.dirname(sys.argv[1])
    cluster.build_nodemap(
        min_cluster_size=4,
        top_X_clusters=top_X_clusters,
        threshold_4_the_lowest_max_freq=threshold_4_the_lowest_max_freq
    )
    cluster.info()
    # print( cluster.keys() )
    cluster.draw_community_expression_maps(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )
    cluster.draw_expression_profiles(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )
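
# Hypothetical command lines for the regrouping script above (the script and
# pickle file names are made-up examples): the first argument is the result
# pickle written by resample(), further "key=value" tokens override the
# defaults set in main():
#
#     python regroup_communities.py example.pkl threshold_4_the_lowest_max_freq=0.01
#     python regroup_communities.py example.pkl top_X_clusters=25
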
def main():
    pyGCluster_dir = os.path.split( sys.argv[0] )[0]

    ## parse data
    data = dict()
    with open( sys.argv[1] ) as fin:
        reader = csv.DictReader( fin, delimiter = ',' )
        conditions = set()
        for row in reader:
            if not conditions:
                conditions = set(
                    [ _.split( '__' )[0] for _ in row.keys() ]
                ) - set( [ 'identifier' ] )
            data[ row['identifier'] ] = dict()
            for condition in conditions:
                mean = float( row[ '{0}__MEAN'.format( condition ) ] )
                std = float( row[ '{0}__STD'.format( condition ) ] )
                data[ row['identifier'] ][ condition ] = ( mean, std )

    working_dir = os.path.join( pyGCluster_dir, 'hoehner_example_run/' )
    if not os.path.exists( working_dir ):
        os.mkdir( working_dir )
    print( '[ INFO ] ... the results of the example script are saved in "{0}".\n'.format( working_dir ) )

    cpus_2_use = 4
    if multiprocessing.cpu_count() < cpus_2_use:
        print(
            '[ INFO ] 4 threads are not available -> re-sampling is performed with only {0} thread(s) (this increases calculation time approximately proportionally).'.format(
                multiprocessing.cpu_count()
            )
        )
        cpus_2_use = multiprocessing.cpu_count()

    cluster = pyGCluster.Cluster(
        data = data,
        working_directory = working_dir,
        verbosity_level = 2
    )
    print( "[ INFO ] pyGCluster will evoke 4 threads (if possible), each requiring approx. 1.5 GB RAM. Please make sure you have enough RAM available (4 threads require approx. 6 GB RAM in total)." )
    print( "[ INFO ] It will take approx. 2 hours to complete 250,000 iterations on 4 threads." )

    cluster.do_it_all(
        distances = [ 'euclidean', 'correlation' ],
        linkages = [ 'complete', 'average', 'ward' ],
        iter_max = 10000,
        cpus_2_use = cpus_2_use,
        min_value_4_expression_map = -3,
        max_value_4_expression_map = 3,
        threshold_4_the_lowest_max_freq = 0.005
    )
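
# A minimal sketch of the nested "data" dict that the parser above builds and
# that pyGCluster.Cluster( data = ... ) expects: identifier -> condition ->
# (mean, std). The identifiers, condition names and values are made-up examples:
example_data = {
    'protein_A': { 'condition_1': (2.0, 0.3), 'condition_2': (-1.5, 0.2) },
    'protein_B': { 'condition_1': (0.5, 0.1), 'condition_2': (1.0, 0.4) },
}
# the corresponding input csv would carry the columns:
# identifier, condition_1__MEAN, condition_1__STD, condition_2__MEAN, condition_2__STD
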
    'fastaID3' : { '1': (3.0, 0.2), '2': (-3.0, 0.4), '3': (-2.0, 0.4), '4': (3.0, 0.3), '5': (-2.0, 0.3) },
    'fastaID4' : { '1': (3.0, 0.4), '2': (-3.0, 0.4), '3': (-1.0, 0.4), '4': (4.0, 0.4), '5': (-2.5, 0.4) },
    'fastaID5' : { '1': (3.0, 1.0), '2': (-3.0, 0.4), '3': ( 0.0, 0.4), '4': (0.0, 0.1), '5': (-3.5, 0.4) },
    'fastaID6' : { '1': (3.0, 2.0), '2': (-3.0, 0.4), '3': ( 1.0, 0.4), '4': (1.5, 0.2), '5': (-4.0, 0.4) },
    'fastaID7' : { '1': (3.0, 1.3), '2': (-3.0, 0.4), '3': ( 2.0, 0.4), '4': (2.5, 0.3), '5': (-4.0, 0.5) },
}

if __name__ == '__main__':
    # print( __doc__ )
    working_dir = './simpleExpressionMaps/'
    if not os.path.exists( working_dir ):
        os.mkdir( working_dir )
    print( '[ INFO ] ... the results of the example script are saved in "{0}"'.format( working_dir ) )
    cluster = pyGCluster.Cluster()
    for hm in cluster['Heat map']['Color Gradients'].keys():
        cluster.draw_expression_map(
            data = data,
            # additional_labels = None,
            min_value_4_expression_map = -4,
            max_value_4_expression_map = +4,
            expression_map_filename = os.path.join( working_dir, 'simpleExpressionMap_{0}.svg'.format( hm ) ),
            legend_filename = os.path.join( working_dir, 'legend_hm_{0}.svg'.format( hm ) ),
            color_gradient = hm,
            box_style = 'classic'
        )
    for bs in cluster['Heat map']['SVG box styles'].keys():
        cluster.draw_expression_map(
            data = data,
            # additional_labels = None,
            min_value_4_expression_map = -4,
            max_value_4_expression_map = +4,
            expression_map_filename = os.path.join( working_dir, 'simpleExpressionMap_boxstyle_{0}.svg'.format( bs ) ),
            legend_filename = os.path.join( working_dir, 'legend_bs_{0}.svg'.format( bs ) ),
            color_gradient = 'Spectral',
            box_style = bs
        )
def main(input_file=None, output_file=None, params=None):
    '''
    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        params (dict): params as passed by ursgal

    Please visit the pyGCluster documentation for more information on this
    plotting function::

        * http://pygcluster.github.io/usage.html#clustered-data-visualization
        * color gradients
        * box styles

    Available parameters for the heatmap::

        * heatmap_identifier_field_name defines the field name in the csv that
          appears directly right of the heatmap rows. Typically the gene or
          protein name (Default: 'Protein')
        * heatmap_annotation_field_name defines the field name of an additional
          annotation in the csv that appears directly right of the object name
          in the heatmap rows (Default: 'map to uniprot')
        * heatmap_max_value defines the maximum value for the color scale
          (Default: 3)
        * heatmap_min_value defines the minimum value for the color scale
          (Default: -3)
        * heatmap_color_gradient defines the color gradient for the
          visualization (Default: 'Spectral')
        * heatmap_box_style defines the box style for the visualization
          (Default: 'classic')
        * heatmap_value_suffix is the suffix of the column name for columns
          holding a value (Default: '_mean')
        * heatmap_error_suffix is the suffix of the column name for columns
          holding the error of the value (Default: '_std')
        * heatmap_column_order defines the order of the columns for plotting

    Please do not forget to cite pyGCluster and Ursgal when using this node.
    '''
    csv_reader = csv.DictReader(open(input_file, 'r'))
    params['additional_labels'] = {}
    if params['heatmap_column_order'] == []:
        params['all_conditions'] = set()
        for fieldname in csv_reader.fieldnames:
            if fieldname.endswith(params['heatmap_value_suffix']):
                params['all_conditions'].add(
                    fieldname.replace(params['heatmap_value_suffix'], '')
                    # this tag could also go into params
                )
        params['all_conditions'] = sorted(list(params['all_conditions']))
    else:
        params['all_conditions'] = params['heatmap_column_order']
    plot_collector = {}
    identifiers = []
    forbidden_character_list = ['>', '<']
    for line_dict in csv_reader:
        line_name = line_dict[params['heatmap_identifier_field_name']]
        for character in forbidden_character_list:
            line_name = line_name.replace(character, '__')
        plot_collector[line_name] = {}
        for condition in params['all_conditions']:
            try:
                ratio = float(line_dict['{0}{1}'.format(
                    condition, params['heatmap_value_suffix'])])
                sd = float(line_dict['{0}{1}'.format(
                    condition, params['heatmap_error_suffix'])])
                plot_collector[line_name][condition] = (ratio, sd)
            except (KeyError, ValueError):
                continue
        identifiers.append(line_name)
        try:
            params['additional_labels'][line_name] = [
                ' ',
                line_dict[params['heatmap_annotation_field_name']]
            ]
        except KeyError:
            pass
    cluster = pyGCluster.Cluster()
    folder = os.path.dirname(output_file)
    cluster['Working directory'] = folder
    cluster.draw_expression_map(
        data=plot_collector,
        identifiers=identifiers,
        conditions=params['all_conditions'],
        additional_labels=params['additional_labels'],
        min_value_4_expression_map=params['heatmap_min_value'],
        max_value_4_expression_map=params['heatmap_max_value'],
        expression_map_filename=output_file,
        box_style=params['heatmap_box_style'],
        color_gradient=params['heatmap_color_gradient'],
    )
    return
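
# A minimal, hypothetical example of the params dict this node expects, using
# only the keys documented in the docstring above; the file names and values
# are made-up, so the actual call is left commented out:
example_params = {
    'heatmap_identifier_field_name': 'Protein',
    'heatmap_annotation_field_name': 'map to uniprot',
    'heatmap_max_value': 3,
    'heatmap_min_value': -3,
    'heatmap_color_gradient': 'Spectral',
    'heatmap_box_style': 'classic',
    'heatmap_value_suffix': '_mean',
    'heatmap_error_suffix': '_std',
    'heatmap_column_order': [],  # [] -> condition order is derived from the csv header
}
# main(
#     input_file='quantified_proteins.csv',   # hypothetical csv with '<cond>_mean' / '<cond>_std' columns
#     output_file='quantified_proteins.svg',  # expression map written here
#     params=example_params,
# )
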
def main(input_file=None, output_file=None, params=None):
    """
    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        params (dict): params as passed by ursgal

    Note:
        Please do not forget to cite pyGCluster AND Ursgal when using this
        node. Thank you in advance!

    Please visit the pyGCluster documentation for more information on this
    plotting function::

        * http://pygcluster.github.io/usage.html#clustered-data-visualization
        * color gradients
        * box styles

    Available parameters for the heatmap::

        * heatmap_identifier_field_name defines the field name in the csv that
          appears directly right of the heatmap rows. Typically the gene or
          protein name (Default: 'Protein')
        * heatmap_annotation_field_name defines the field name of an additional
          annotation in the csv that appears directly right of the object name
          in the heatmap rows (Default: 'map to uniprot')
        * heatmap_max_value defines the maximum value for the color scale
          (Default: 3)
        * heatmap_min_value defines the minimum value for the color scale
          (Default: -3)
        * heatmap_color_gradient defines the color gradient for the
          visualization (Default: 'Spectral')
        * heatmap_box_style defines the box style for the visualization
          (Default: 'classic')
        * heatmap_value_suffix is the suffix of the column name for columns
          holding a value (Default: '_mean')
        * heatmap_error_suffix is the suffix of the column name for columns
          holding the error of the value (Default: '_std')
        * heatmap_column_positions defines the order of the columns for
          plotting

    Note:
        Use of force=True is recommended to cover changes in the csv input
        file. The default value suffix of the column name is '_mean', and
        '_std' for the error estimate. Please refer to the documentation for
        further details on the parameters.
""" csv_reader = csv.DictReader(open(input_file, "r")) # pprint.pprint(params) params["additional_labels"] = {} if params["heatmap_column_positions"] == {}: params["all_conditions"] = set() for fieldname in csv_reader.fieldnames: if fieldname.endswith(params["heatmap_value_suffix"]): params["all_conditions"].add( fieldname.replace( params["heatmap_value_suffix"], "" ) # this tag could also go into params ) params["all_conditions"] = sorted(list(params["all_conditions"])) else: params["all_conditions"] = [ params["heatmap_column_positions"][k] for k in sorted(params["heatmap_column_positions"].keys()) ] plot_collector = {} identifiers = [] forbidden_character_list = [">", "<"] for line_dict in csv_reader: line_name = line_dict[params["heatmap_identifier_field_name"]] for character in forbidden_character_list: line_name = line_name.replace(character, "__") plot_collector[line_name] = {} for condition in params["all_conditions"]: try: ratio = float( line_dict[ "{0}{1}".format(condition, params["heatmap_value_suffix"]) ] ) sd = float( line_dict[ "{0}{1}".format(condition, params["heatmap_error_suffix"]) ] ) plot_collector[line_name][condition] = (ratio, sd) except: continue identifiers.append(line_name) if params["heatmap_annotation_field_name"] in line_dict.keys(): annotation = line_dict[params["heatmap_annotation_field_name"]] for character in forbidden_character_list: annotation = annotation.replace(character, "__") params["additional_labels"][line_name] = [" ", annotation] cluster = pyGCluster.Cluster() folder = os.path.dirname(output_file) cluster["Working directory"] = folder cluster.draw_expression_map( data=plot_collector, identifiers=identifiers, conditions=params["all_conditions"], additional_labels=params["additional_labels"], min_value_4_expression_map=params["heatmap_min_value"], max_value_4_expression_map=params["heatmap_max_value"], expression_map_filename=output_file, box_style=params["heatmap_box_style"], color_gradient=params["heatmap_color_gradient"], ) return
def main():
    pyGCluster_dir = os.path.split(sys.argv[0])[0]

    ## parse data
    data = dict()
    with open(sys.argv[1]) as fin:
        reader = csv.DictReader(fin, delimiter=',')
        conditions = set()
        for row in reader:
            if not conditions:
                conditions = set(
                    [_.split('__')[0] for _ in row.keys()]
                ) - set(['identifier'])
            data[row['identifier']] = dict()
            for condition in conditions:
                mean = float(row['{0}__MEAN'.format(condition)])
                std = float(row['{0}__STD'.format(condition)])
                data[row['identifier']][condition] = (mean, std)

    working_dir = os.path.join(pyGCluster_dir, 'hoehner_example_run/')
    print(
        '[ INFO ] ... the results of the example script are saved in "{0}".\n'.format(
            working_dir
        )
    )

    ## initialize pyGCluster
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    TestCluster = pyGCluster.Cluster(
        data=data,
        working_directory=working_dir,
        verbosity_level=2
    )
    print(
        "[ INFO ] pyGCluster will evoke 4 threads (if possible), each requiring approx. 1.5 GB RAM. Please make sure you have enough RAM available (4 threads require approx. 6 GB RAM in total)."
    )
    print(
        "[ INFO ] It will take approx. 2 hours to complete 250,000 iterations on 4 threads."
    )

    ## start the re-sampling process ... if 4 threads are available, this may take X hours and Y GB RAM.
    distance_metrices = ['correlation', 'euclidean']
    linkage_methods = ['complete', 'average', 'ward']
    print('[ INFO ] performing re-sampling ...')
    cpus_2_use = 4
    if multiprocessing.cpu_count() < cpus_2_use:
        print(
            '[ INFO ] 4 threads are not available -> re-sampling is performed with only {0} thread(s) (this increases calculation time approximately proportionally).'.format(
                multiprocessing.cpu_count()
            )
        )
        cpus_2_use = multiprocessing.cpu_count()
    print()
    TestCluster.resample(
        distances=distance_metrices,
        linkages=linkage_methods,
        iter_max=250000,
        pickle_filename='example.pkl',
        cpus_2_use=cpus_2_use
    )  # after re-sampling, the results are saved in the file given by "pickle_filename"

    ## plot a heat map showing the frequencies among the distance-linkage combinations (DLCs) of the top 33 clusters:
    TestCluster.plot_clusterfreqs(min_cluster_size=4, top_X_clusters=33)

    ## create and plot communities
    TestCluster.build_nodemap(
        min_cluster_size=4,
        threshold_4_the_lowest_max_freq=0.005
    )  # create communities from the set of clusters with a frequency of 1 promille or more
    TestCluster.write_dot(
        filename='hoehner_1promilleclusters_minsize4.dot',
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3,
        color_gradient='1337'
    )  # create a DOT file of the node map showing the cluster composition of the communities
    TestCluster.draw_community_expression_maps(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3,
        color_gradient='1337'
    )  # draw a heat map showing the protein composition of each community
    TestCluster.draw_expression_profiles(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )  # draw a plot showing the expression patterns of the proteins (with standard deviation) inside each community

    ## save to be able to continue the analysis at a later time point
    TestCluster.save(filename='example_1promille_communities.pkl')
    # TestCluster.load( 'example_1percent_communities.pkl' )

    ## create a CSV containing the protein composition of the communities
    ## => two columns: community ID -> identifier
    with open(
        os.path.join(TestCluster['Working directory'], 'community2protein.csv'), 'w'
    ) as fout:
        writer = csv.DictWriter(fout, fieldnames=['community ID', 'identifier'])
        writer.writeheader()
        _max_level = max(
            [_communityID[1] for _communityID in TestCluster['Communities']]
        )
        for cluster in TestCluster._get_levelX_clusters(level=_max_level):
            _communityID = (cluster, _max_level)
            for protein_index in TestCluster['Communities'][_communityID]['index 2 obCoFreq dict']:
                protein = TestCluster['Identifiers'][protein_index]
                name = '{0}-{1}'.format(
                    TestCluster['Communities'][_communityID]['cluster ID'],
                    _max_level
                )
                writer.writerow({'community ID': name, 'identifier': protein})
    print('[ INFO ] test script successfully executed.')
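
# A minimal sketch of how the analysis could be resumed later from the pickle
# written by TestCluster.save() above, following the load / build_nodemap
# pattern used elsewhere in this repository; the path is a made-up example:
def resume_from_pickle_sketch():
    cluster = pyGCluster.Cluster()
    cluster.load('hoehner_example_run/example_1promille_communities.pkl')  # hypothetical path
    cluster['Working directory'] = 'hoehner_example_run/'
    cluster.build_nodemap(
        min_cluster_size=4,
        threshold_4_the_lowest_max_freq=0.005
    )
    cluster.draw_community_expression_maps(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )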