    def test_syntheticData(self):
        data = {}
        with open('../exampleFiles/syntheticDataset.csv', 'r') as input_file:
            for line in csv.DictReader(input_file):
                gene = line['Gene']
                if gene not in data:
                    data[gene] = {}
                for n in range(8):
                    data[gene][str(n)] = (
                        float(line['{0}_mean'.format(n)]),
                        float(line['{0}_std'.format(n)])
                    )
        if not os.path.exists( WORKING_DIRECTORY ):
            os.mkdir( WORKING_DIRECTORY )
        else:
            print('''
[ INFO ] Result folder {0} already exists.
[ INFO ] Old results might not be overwritten, which can lead to confusion ;)
[ INFO ] Check the file creation date!'''.format( WORKING_DIRECTORY )
            )
        # if os.path.exists( os.path.join( WORKING_DIRECTORY, PICKLE_FILENAME ) ):
        #     _ = input('Pickle already')
        #     os.remove( )
        print(
            '[ INFO ] The results of the example script are saved into folder {0}.'.format(
                WORKING_DIRECTORY
            )
        )
        TestCluster = pyGCluster.Cluster(
            data = data,
            working_directory = WORKING_DIRECTORY,
            verbosity_level = 2
        )
        distance_metrices = [ 'euclidean' ]
        linkage_methods = [ 'complete' ]
        # cpus_2_use = 1
        # if multiprocessing.cpu_count() < cpus_2_use:
        #     cpus_2_use = multiprocessing.cpu_count()
        # print()
        TestCluster.resample(
            distances = distance_metrices,
            linkages = linkage_methods,
            iter_max = 5000,
            pickle_filename = PICKLE_FILENAME,
            # cpus_2_use = cpus_2_use,
            iter_till_the_end = True
        )
        mostfreq = TestCluster._get_most_frequent_clusters( top_X_clusters = 1 )
        realIdsSet = set()
        for cluster in mostfreq:
            for index in cluster:
                try:
                    realIdsSet.add( TestCluster['Identifiers'][ index ] )
                except (IndexError, KeyError):
                    # print debugging information, then fail on the same lookup
                    print( TestCluster['Identifiers'] )
                    print( cluster )
                    print( index )
                    realIdsSet.add( TestCluster['Identifiers'][ index ] )
        self.assertEqual( realIdsSet, self.testSet )
        TestCluster.build_nodemap(
            min_cluster_size = 4,
            threshold_4_the_lowest_max_freq = 0.01
        )
        TestCluster.draw_community_expression_maps(
            min_value_4_expression_map = -40,
            max_value_4_expression_map = 40,
            color_gradient = 'Spectral'
        )
        TestCluster.draw_expression_profiles(
            min_value_4_expression_map = -40,
            max_value_4_expression_map = 40
        )
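
# A minimal sketch of a syntheticDataset.csv that the parser in
# test_syntheticData() above would accept; only the column names ('Gene',
# '{n}_mean' and '{n}_std' for n in 0..7) are taken from the test, the gene
# name and the values below are made-up:
def write_synthetic_csv_sketch(filename='syntheticDataset_sketch.csv'):
    import csv  # repeated here so the sketch stands alone
    fieldnames = ['Gene']
    for n in range(8):
        fieldnames += ['{0}_mean'.format(n), '{0}_std'.format(n)]
    with open(filename, 'w', newline='') as csv_out:
        writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
        writer.writeheader()
        row = {'Gene': 'exampleGene1'}
        for n in range(8):
            row['{0}_mean'.format(n)] = 1.0  # hypothetical mean
            row['{0}_std'.format(n)] = 0.1   # hypothetical standard deviation
        writer.writerow(row)
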
def main():
    threshold_4_the_lowest_max_freq = 0.005
    top_X_clusters = None
    for n in sys.argv[1:]:
        if "threshold_4_the_lowest_max_freq" in n:
            threshold_4_the_lowest_max_freq = float(n.split("=")[1])
        elif "top_X_clusters" in n:
            top_X_clusters = int(n.split("=")[1])
            threshold_4_the_lowest_max_freq = 0.0
    cluster = pyGCluster.Cluster()
    cluster.load(sys.argv[1])
    cluster['Working directory'] = os.path.dirname(sys.argv[1])
    cluster.build_nodemap(
        min_cluster_size=4,
        top_X_clusters=top_X_clusters,
        threshold_4_the_lowest_max_freq=threshold_4_the_lowest_max_freq
    )
    cluster.info()
    # print( cluster.keys() )
    cluster.draw_community_expression_maps(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )
    cluster.draw_expression_profiles(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )
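
# Hypothetical command lines for the regrouping script above (the script and
# pickle file names are made-up examples): the first argument is the result
# pickle written by resample(), further "key=value" tokens override the
# defaults set in main():
#
#     python regroup_communities.py example.pkl threshold_4_the_lowest_max_freq=0.01
#     python regroup_communities.py example.pkl top_X_clusters=25
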
def main():
    pyGCluster_dir = os.path.split( sys.argv[0] )[0]

    ## parse data
    data = dict()
    with open( sys.argv[1] ) as fin:
        reader = csv.DictReader( fin, delimiter = ',' )
        conditions = set()
        for row in reader:
            if not conditions:
                conditions = set(
                    [ _.split( '__' )[0] for _ in row.keys() ]
                ) - set( [ 'identifier' ] )
            data[ row['identifier'] ] = dict()
            for condition in conditions:
                mean = float( row[ '{0}__MEAN'.format( condition ) ] )
                std = float( row[ '{0}__STD'.format( condition ) ] )
                data[ row['identifier'] ][ condition ] = ( mean, std )

    working_dir = os.path.join( pyGCluster_dir, 'hoehner_example_run/' )
    if not os.path.exists( working_dir ):
        os.mkdir( working_dir )
    print( '[ INFO ] ... the results of the example script are saved in "{0}".\n'.format( working_dir ) )

    cpus_2_use = 4
    if multiprocessing.cpu_count() < cpus_2_use:
        print(
            '[ INFO ] 4 threads are not available -> re-sampling is performed with only {0} thread(s) (this increases calculation time approximately proportionally).'.format(
                multiprocessing.cpu_count()
            )
        )
        cpus_2_use = multiprocessing.cpu_count()

    cluster = pyGCluster.Cluster(
        data = data,
        working_directory = working_dir,
        verbosity_level = 2
    )
    print( "[ INFO ] pyGCluster will evoke 4 threads (if possible), each requiring approx. 1.5 GB RAM. Please make sure you have enough RAM available (4 threads require approx. 6 GB RAM in total)." )
    print( "[ INFO ] It will take approx. 2 hours to complete 250,000 iterations on 4 threads." )

    cluster.do_it_all(
        distances = [ 'euclidean', 'correlation' ],
        linkages = [ 'complete', 'average', 'ward' ],
        iter_max = 10000,
        cpus_2_use = cpus_2_use,
        min_value_4_expression_map = -3,
        max_value_4_expression_map = 3,
        threshold_4_the_lowest_max_freq = 0.005
    )
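
# A minimal sketch of the nested "data" dict that the parser above builds and
# that pyGCluster.Cluster( data = ... ) expects: identifier -> condition ->
# (mean, std). The identifiers, condition names and values are made-up examples:
example_data = {
    'protein_A': { 'condition_1': (2.0, 0.3), 'condition_2': (-1.5, 0.2) },
    'protein_B': { 'condition_1': (0.5, 0.1), 'condition_2': (1.0, 0.4) },
}
# the corresponding input csv would carry the columns:
# identifier, condition_1__MEAN, condition_1__STD, condition_2__MEAN, condition_2__STD
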
    'fastaID3' : { '1': (3.0, 0.2), '2': (-3.0, 0.4), '3': (-2.0, 0.4), '4': (3.0, 0.3), '5': (-2.0, 0.3) },
    'fastaID4' : { '1': (3.0, 0.4), '2': (-3.0, 0.4), '3': (-1.0, 0.4), '4': (4.0, 0.4), '5': (-2.5, 0.4) },
    'fastaID5' : { '1': (3.0, 1.0), '2': (-3.0, 0.4), '3': ( 0.0, 0.4), '4': (0.0, 0.1), '5': (-3.5, 0.4) },
    'fastaID6' : { '1': (3.0, 2.0), '2': (-3.0, 0.4), '3': ( 1.0, 0.4), '4': (1.5, 0.2), '5': (-4.0, 0.4) },
    'fastaID7' : { '1': (3.0, 1.3), '2': (-3.0, 0.4), '3': ( 2.0, 0.4), '4': (2.5, 0.3), '5': (-4.0, 0.5) },
}

if __name__ == '__main__':
    # print( __doc__ )
    working_dir = './simpleExpressionMaps/'
    if not os.path.exists( working_dir ):
        os.mkdir( working_dir )
    print( '[ INFO ] ... the results of the example script are saved in "{0}"'.format( working_dir ) )
    cluster = pyGCluster.Cluster()
    for hm in cluster['Heat map']['Color Gradients'].keys():
        cluster.draw_expression_map(
            data = data,
            # additional_labels = None,
            min_value_4_expression_map = -4,
            max_value_4_expression_map = +4,
            expression_map_filename = os.path.join( working_dir, 'simpleExpressionMap_{0}.svg'.format( hm ) ),
            legend_filename = os.path.join( working_dir, 'legend_hm_{0}.svg'.format( hm ) ),
            color_gradient = hm,
            box_style = 'classic'
        )
    for bs in cluster['Heat map']['SVG box styles'].keys():
        cluster.draw_expression_map(
            data = data,
            # additional_labels = None,
            min_value_4_expression_map = -4,
            max_value_4_expression_map = +4,
            expression_map_filename = os.path.join( working_dir, 'simpleExpressionMap_boxstyle_{0}.svg'.format( bs ) ),
            legend_filename = os.path.join( working_dir, 'legend_bs_{0}.svg'.format( bs ) ),
            color_gradient = 'Spectral',
            box_style = bs
        )
def main(input_file=None, output_file=None, params=None):
    '''
    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        params (dict): params as passed by ursgal

    Please visit the pyGCluster documentation for more information on this
    plotting function::

        * http://pygcluster.github.io/usage.html#clustered-data-visualization
        * color gradients
        * box styles

    Available parameters for the heatmap::

        * heatmap_identifier_field_name defines the field name in the csv that
          appears directly right of the heatmap rows. Typically the gene or
          protein name (Default: 'Protein')
        * heatmap_annotation_field_name defines the field name of an additional
          annotation in the csv that appears directly right of the object name
          in the heatmap rows (Default: 'map to uniprot')
        * heatmap_max_value defines the maximum value for the color scale
          (Default: 3)
        * heatmap_min_value defines the minimum value for the color scale
          (Default: -3)
        * heatmap_color_gradient defines the color gradient for the
          visualization (Default: 'Spectral')
        * heatmap_box_style defines the box style for the visualization
          (Default: 'classic')
        * heatmap_value_suffix is the suffix of the column name for columns
          holding a value (Default: '_mean')
        * heatmap_error_suffix is the suffix of the column name for columns
          holding the error of the value (Default: '_std')
        * heatmap_column_order defines the order of the columns for plotting

    Please do not forget to cite pyGCluster and Ursgal when using this node.
    '''
    csv_reader = csv.DictReader(open(input_file, 'r'))
    params['additional_labels'] = {}
    if params['heatmap_column_order'] == []:
        params['all_conditions'] = set()
        for fieldname in csv_reader.fieldnames:
            if fieldname.endswith(params['heatmap_value_suffix']):
                params['all_conditions'].add(
                    fieldname.replace(params['heatmap_value_suffix'], '')
                    # this tag could also go into params
                )
        params['all_conditions'] = sorted(list(params['all_conditions']))
    else:
        params['all_conditions'] = params['heatmap_column_order']
    plot_collector = {}
    identifiers = []
    forbidden_character_list = ['>', '<']
    for line_dict in csv_reader:
        line_name = line_dict[params['heatmap_identifier_field_name']]
        for character in forbidden_character_list:
            line_name = line_name.replace(character, '__')
        plot_collector[line_name] = {}
        for condition in params['all_conditions']:
            try:
                ratio = float(line_dict['{0}{1}'.format(
                    condition, params['heatmap_value_suffix'])])
                sd = float(line_dict['{0}{1}'.format(
                    condition, params['heatmap_error_suffix'])])
                plot_collector[line_name][condition] = (ratio, sd)
            except (KeyError, ValueError):
                continue
        identifiers.append(line_name)
        try:
            params['additional_labels'][line_name] = [
                ' ',
                line_dict[params['heatmap_annotation_field_name']]
            ]
        except KeyError:
            pass
    cluster = pyGCluster.Cluster()
    folder = os.path.dirname(output_file)
    cluster['Working directory'] = folder
    cluster.draw_expression_map(
        data=plot_collector,
        identifiers=identifiers,
        conditions=params['all_conditions'],
        additional_labels=params['additional_labels'],
        min_value_4_expression_map=params['heatmap_min_value'],
        max_value_4_expression_map=params['heatmap_max_value'],
        expression_map_filename=output_file,
        box_style=params['heatmap_box_style'],
        color_gradient=params['heatmap_color_gradient'],
    )
    return
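
# A minimal, hypothetical example of the params dict this node expects, using
# only the keys documented in the docstring above; the file names and values
# are made-up, so the actual call is left commented out:
example_params = {
    'heatmap_identifier_field_name': 'Protein',
    'heatmap_annotation_field_name': 'map to uniprot',
    'heatmap_max_value': 3,
    'heatmap_min_value': -3,
    'heatmap_color_gradient': 'Spectral',
    'heatmap_box_style': 'classic',
    'heatmap_value_suffix': '_mean',
    'heatmap_error_suffix': '_std',
    'heatmap_column_order': [],  # [] -> condition order is derived from the csv header
}
# main(
#     input_file='quantified_proteins.csv',   # hypothetical csv with '<cond>_mean' / '<cond>_std' columns
#     output_file='quantified_proteins.svg',  # expression map written here
#     params=example_params,
# )
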
def main(input_file=None, output_file=None, params=None):
    """
    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        params (dict): params as passed by ursgal

    Note:
        Please do not forget to cite pyGCluster AND Ursgal when using this
        node. Thank you in advance!

    Please visit the pyGCluster documentation for more information on this
    plotting function::

        * http://pygcluster.github.io/usage.html#clustered-data-visualization
        * color gradients
        * box styles

    Available parameters for the heatmap::

        * heatmap_identifier_field_name defines the field name in the csv that
          appears directly right of the heatmap rows. Typically the gene or
          protein name (Default: 'Protein')
        * heatmap_annotation_field_name defines the field name of an additional
          annotation in the csv that appears directly right of the object name
          in the heatmap rows (Default: 'map to uniprot')
        * heatmap_max_value defines the maximum value for the color scale
          (Default: 3)
        * heatmap_min_value defines the minimum value for the color scale
          (Default: -3)
        * heatmap_color_gradient defines the color gradient for the
          visualization (Default: 'Spectral')
        * heatmap_box_style defines the box style for the visualization
          (Default: 'classic')
        * heatmap_value_suffix is the suffix of the column name for columns
          holding a value (Default: '_mean')
        * heatmap_error_suffix is the suffix of the column name for columns
          holding the error of the value (Default: '_std')
        * heatmap_column_positions defines the order of the columns for
          plotting

    Note:
        Use of force=True is recommended to cover changes in the csv input
        file. The default value suffix of the column name is '_mean', and
        '_std' for the error estimate. Please refer to the documentation for
        further details on the parameters.
""" csv_reader = csv.DictReader(open(input_file, "r")) # pprint.pprint(params) params["additional_labels"] = {} if params["heatmap_column_positions"] == {}: params["all_conditions"] = set() for fieldname in csv_reader.fieldnames: if fieldname.endswith(params["heatmap_value_suffix"]): params["all_conditions"].add( fieldname.replace( params["heatmap_value_suffix"], "" ) # this tag could also go into params ) params["all_conditions"] = sorted(list(params["all_conditions"])) else: params["all_conditions"] = [ params["heatmap_column_positions"][k] for k in sorted(params["heatmap_column_positions"].keys()) ] plot_collector = {} identifiers = [] forbidden_character_list = [">", "<"] for line_dict in csv_reader: line_name = line_dict[params["heatmap_identifier_field_name"]] for character in forbidden_character_list: line_name = line_name.replace(character, "__") plot_collector[line_name] = {} for condition in params["all_conditions"]: try: ratio = float( line_dict[ "{0}{1}".format(condition, params["heatmap_value_suffix"]) ] ) sd = float( line_dict[ "{0}{1}".format(condition, params["heatmap_error_suffix"]) ] ) plot_collector[line_name][condition] = (ratio, sd) except: continue identifiers.append(line_name) if params["heatmap_annotation_field_name"] in line_dict.keys(): annotation = line_dict[params["heatmap_annotation_field_name"]] for character in forbidden_character_list: annotation = annotation.replace(character, "__") params["additional_labels"][line_name] = [" ", annotation] cluster = pyGCluster.Cluster() folder = os.path.dirname(output_file) cluster["Working directory"] = folder cluster.draw_expression_map( data=plot_collector, identifiers=identifiers, conditions=params["all_conditions"], additional_labels=params["additional_labels"], min_value_4_expression_map=params["heatmap_min_value"], max_value_4_expression_map=params["heatmap_max_value"], expression_map_filename=output_file, box_style=params["heatmap_box_style"], color_gradient=params["heatmap_color_gradient"], ) return
def main():
    pyGCluster_dir = os.path.split(sys.argv[0])[0]

    ## parse data
    data = dict()
    with open(sys.argv[1]) as fin:
        reader = csv.DictReader(fin, delimiter=',')
        conditions = set()
        for row in reader:
            if not conditions:
                conditions = set(
                    [_.split('__')[0] for _ in row.keys()]
                ) - set(['identifier'])
            data[row['identifier']] = dict()
            for condition in conditions:
                mean = float(row['{0}__MEAN'.format(condition)])
                std = float(row['{0}__STD'.format(condition)])
                data[row['identifier']][condition] = (mean, std)

    working_dir = os.path.join(pyGCluster_dir, 'hoehner_example_run/')
    print(
        '[ INFO ] ... the results of the example script are saved in "{0}".\n'.format(
            working_dir
        )
    )

    ## initialize pyGCluster
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    TestCluster = pyGCluster.Cluster(
        data=data,
        working_directory=working_dir,
        verbosity_level=2
    )
    print(
        "[ INFO ] pyGCluster will evoke 4 threads (if possible), each requiring approx. 1.5 GB RAM. Please make sure you have enough RAM available (4 threads require approx. 6 GB RAM in total)."
    )
    print(
        "[ INFO ] It will take approx. 2 hours to complete 250,000 iterations on 4 threads."
    )

    ## start the re-sampling process ... if 4 threads are available, this may take X hours and Y GB RAM.
    distance_metrices = ['correlation', 'euclidean']
    linkage_methods = ['complete', 'average', 'ward']
    print('[ INFO ] performing re-sampling ...')
    cpus_2_use = 4
    if multiprocessing.cpu_count() < cpus_2_use:
        print(
            '[ INFO ] 4 threads are not available -> re-sampling is performed with only {0} thread(s) (this increases calculation time approximately proportionally).'.format(
                multiprocessing.cpu_count()
            )
        )
        cpus_2_use = multiprocessing.cpu_count()
    print()
    TestCluster.resample(
        distances=distance_metrices,
        linkages=linkage_methods,
        iter_max=250000,
        pickle_filename='example.pkl',
        cpus_2_use=cpus_2_use
    )  # after re-sampling, the results are saved in the file given by "pickle_filename"

    ## plot a heat map showing the frequencies among the distance-linkage combinations (DLCs) of the top 33 clusters:
    TestCluster.plot_clusterfreqs(min_cluster_size=4, top_X_clusters=33)

    ## create and plot communities
    TestCluster.build_nodemap(
        min_cluster_size=4,
        threshold_4_the_lowest_max_freq=0.005
    )  # create communities from the set of clusters with a frequency of 1 promille or more
    TestCluster.write_dot(
        filename='hoehner_1promilleclusters_minsize4.dot',
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3,
        color_gradient='1337'
    )  # create a DOT file of the node map showing the cluster composition of the communities
    TestCluster.draw_community_expression_maps(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3,
        color_gradient='1337'
    )  # draw a heat map showing the protein composition of each community
    TestCluster.draw_expression_profiles(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )  # draw a plot showing the expression patterns of the proteins (with standard deviation) inside each community

    ## save to be able to continue the analysis at a later time point
    TestCluster.save(filename='example_1promille_communities.pkl')
    # TestCluster.load( 'example_1percent_communities.pkl' )

    ## create a CSV containing the protein composition of the communities
    ## => two columns: community ID -> identifier
    with open(
        os.path.join(TestCluster['Working directory'], 'community2protein.csv'), 'w'
    ) as fout:
        writer = csv.DictWriter(fout, fieldnames=['community ID', 'identifier'])
        writer.writeheader()
        _max_level = max(
            [_communityID[1] for _communityID in TestCluster['Communities']]
        )
        for cluster in TestCluster._get_levelX_clusters(level=_max_level):
            _communityID = (cluster, _max_level)
            for protein_index in TestCluster['Communities'][_communityID]['index 2 obCoFreq dict']:
                protein = TestCluster['Identifiers'][protein_index]
                name = '{0}-{1}'.format(
                    TestCluster['Communities'][_communityID]['cluster ID'],
                    _max_level
                )
                writer.writerow({'community ID': name, 'identifier': protein})
    print('[ INFO ] test script successfully executed.')
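
# A minimal sketch of how the analysis could be resumed later from the pickle
# written by TestCluster.save() above, following the load / build_nodemap
# pattern used elsewhere in this repository; the path is a made-up example:
def resume_from_pickle_sketch():
    cluster = pyGCluster.Cluster()
    cluster.load('hoehner_example_run/example_1promille_communities.pkl')  # hypothetical path
    cluster['Working directory'] = 'hoehner_example_run/'
    cluster.build_nodemap(
        min_cluster_size=4,
        threshold_4_the_lowest_max_freq=0.005
    )
    cluster.draw_community_expression_maps(
        min_value_4_expression_map=-3,
        max_value_4_expression_map=3
    )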