def corpus_properties(dataset, context): """ Identify and pickle to file various properties of the given dataset. These can alter be converted to pretty tables using :func:`~experiments.print_network_props`. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset+'_text' (documents, labels) = data.read_files(corpus_path) props = {} #~ giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i%10==0: print ' ',str(i)+'/'+str(len(documents)) g = graph_representation.construct_cooccurrence_network(text,context=context) #~ giant.add_edges_from(g.edges()) p = graph.network_properties(g) for k,v in p.iteritems(): if i==0: props[k] = [] props[k].append(v) g = None # just to make sure.. print '> Calculating means and deviations..' props_total = {} for key in props: print ' ',key props_total[key+'_mean'] = numpy.mean(props[key]) props_total[key+'_std'] = numpy.std(props[key]) data_name = dataset.replace('/','.') #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name) data.pickle_to_file(props, 'output/properties/cooccurrence/stats_'+data_name) data.pickle_to_file(props_total, 'output/properties/cooccurrence/stats_tot_'+data_name)
def corpus_dependency_properties(dataset = 'air/problem_descriptions'): """ Identify and pickle to file various properties of the given dataset. These can alter be converted to pretty tables using :func:`~experiments.print_network_props`. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset+'_dependencies' (documents, labels) = data.read_files(corpus_path) props = {} giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i%10==0: print ' ',str(i)+'/'+str(len(documents)) g = graph_representation.construct_dependency_network(text,remove_stop_words=True) giant.add_edges_from(g.edges()) p = graph.network_properties(g) for k,v in p.iteritems(): if i==0: props[k] = [] props[k].append(v) g = None # just to make sure.. print '> Calculating means and deviations..' props_total = {} for key in props: props_total[key+'_mean'] = numpy.mean(props[key]) props_total[key+'_std'] = numpy.std(props[key]) data.pickle_to_file(giant, 'output/properties/dependency/corpus_network_air_all_no_stop_words') data.pickle_to_file(props, 'output/properties/dependency/docs_air_all_no_stop_words') data.pickle_to_file(props_total, 'output/properties/dependency/docs_air_all_no_stop_words_total')
def compare_stats_to_random(dataset):
    """
    Print properties of a random directed G(n,p) graph sized to match the
    stored co-occurrence statistics for *dataset*.
    """
    data_name = dataset.replace('/','.')
    stats = data.pickle_from_file('output/properties/cooccurrence/stats_tot_'+data_name)
    num_nodes = stats['# nodes_mean']
    # Expected total degree of directed G(n,p) is ~2pn, so invert for p.
    edge_prob = stats['mean degree_mean']/(2*num_nodes)
    random_graph = nx.directed_gnp_random_graph(int(num_nodes), edge_prob)
    pp.pprint(graph.network_properties(random_graph))
def compare_stats_to_random(dataset):
    """
    Build a random directed G(n,p) graph matched to the pickled
    co-occurrence statistics for *dataset* and pretty-print its properties.
    """
    pickled = 'output/properties/cooccurrence/stats_tot_' + dataset.replace('/', '.')
    stats = data.pickle_from_file(pickled)
    n_mean = stats['# nodes_mean']
    # A directed G(n,p) graph has expected total degree ~2pn; solve for p.
    p_match = stats['mean degree_mean'] / (2 * n_mean)
    result = graph.network_properties(nx.directed_gnp_random_graph(int(n_mean), p_match))
    pp.pprint(result)
def corpus_dependency_properties(dataset='air/problem_descriptions'): """ Identify and pickle to file various properties of the given dataset. These can alter be converted to pretty tables using :func:`~experiments.print_network_props`. """ print '> Reading data..', dataset corpus_path = '../data/' + dataset + '_dependencies' (documents, labels) = data.read_files(corpus_path) props = {} giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i % 10 == 0: print ' ', str(i) + '/' + str(len(documents)) g = graph_representation.construct_dependency_network( text, remove_stop_words=True) giant.add_edges_from(g.edges()) p = graph.network_properties(g) for k, v in p.iteritems(): if i == 0: props[k] = [] props[k].append(v) g = None # just to make sure.. print '> Calculating means and deviations..' props_total = {} for key in props: props_total[key + '_mean'] = numpy.mean(props[key]) props_total[key + '_std'] = numpy.std(props[key]) data.pickle_to_file( giant, 'output/properties/dependency/corpus_network_air_all_no_stop_words') data.pickle_to_file( props, 'output/properties/dependency/docs_air_all_no_stop_words') data.pickle_to_file( props_total, 'output/properties/dependency/docs_air_all_no_stop_words_total')
def corpus_properties(dataset, context): """ Identify and pickle to file various properties of the given dataset. These can alter be converted to pretty tables using :func:`~experiments.print_network_props`. """ print '> Reading data..', dataset corpus_path = '../data/' + dataset + '_text' (documents, labels) = data.read_files(corpus_path) props = {} #~ giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i % 10 == 0: print ' ', str(i) + '/' + str(len(documents)) g = graph_representation.construct_cooccurrence_network( text, context=context) #~ giant.add_edges_from(g.edges()) p = graph.network_properties(g) for k, v in p.iteritems(): if i == 0: props[k] = [] props[k].append(v) g = None # just to make sure.. print '> Calculating means and deviations..' props_total = {} for key in props: print ' ', key props_total[key + '_mean'] = numpy.mean(props[key]) props_total[key + '_std'] = numpy.std(props[key]) data_name = dataset.replace('/', '.') #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name) data.pickle_to_file(props, 'output/properties/cooccurrence/stats_' + data_name) data.pickle_to_file( props_total, 'output/properties/cooccurrence/stats_tot_' + data_name)