コード例 #1
0
def corpus_properties(dataset, context):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset+'_text'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    #~ giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i%10==0: print '   ',str(i)+'/'+str(len(documents))
        g = graph_representation.construct_cooccurrence_network(text,context=context)
        #~ giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k,v in p.iteritems():
            if i==0: props[k] = []
            props[k].append(v)
        g = None # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        print '   ',key
        props_total[key+'_mean'] = numpy.mean(props[key])
        props_total[key+'_std'] = numpy.std(props[key])

    data_name = dataset.replace('/','.')
    #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name)
    data.pickle_to_file(props, 'output/properties/cooccurrence/stats_'+data_name)
    data.pickle_to_file(props_total, 'output/properties/cooccurrence/stats_tot_'+data_name)
コード例 #2
0
def corpus_dependency_properties(dataset = 'air/problem_descriptions'):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset+'_dependencies'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i%10==0: print '   ',str(i)+'/'+str(len(documents))
        g = graph_representation.construct_dependency_network(text,remove_stop_words=True)
        giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k,v in p.iteritems():
            if i==0: props[k] = []
            props[k].append(v)
        g = None # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        props_total[key+'_mean'] = numpy.mean(props[key])
        props_total[key+'_std'] = numpy.std(props[key])

    data.pickle_to_file(giant, 'output/properties/dependency/corpus_network_air_all_no_stop_words')
    data.pickle_to_file(props, 'output/properties/dependency/docs_air_all_no_stop_words')
    data.pickle_to_file(props_total, 'output/properties/dependency/docs_air_all_no_stop_words_total')
コード例 #3
0
def compare_stats_to_random(dataset):
    """
    Pretty-print the network properties of a random directed graph whose
    size is derived from the stored co-occurrence statistics of ``dataset``.

    The statistics are loaded from
    ``output/properties/cooccurrence/stats_tot_<name>``, where ``<name>`` is
    ``dataset`` with every ``'/'`` replaced by ``'.'``.
    """
    stats_file = 'output/properties/cooccurrence/stats_tot_' + dataset.replace('/', '.')
    corpus_stats = data.pickle_from_file(stats_file)

    # Node count and edge probability for the G(n, p) graph both come from
    # the corpus-wide means.
    node_count = corpus_stats['# nodes_mean']
    edge_prob = corpus_stats['mean degree_mean'] / (2 * node_count)

    random_graph = nx.directed_gnp_random_graph(int(node_count), edge_prob)
    pp.pprint(graph.network_properties(random_graph))
コード例 #4
0
def compare_stats_to_random(dataset):
    """
    Build a random directed graph matching the mean size of the corpus
    networks and pretty-print its properties.

    The mean node count and mean degree are read from the pickle
    ``output/properties/cooccurrence/stats_tot_<name>``; ``<name>`` is
    ``dataset`` with ``'/'`` replaced by ``'.'``.
    """
    name = dataset.replace('/', '.')
    totals = data.pickle_from_file(
        'output/properties/cooccurrence/stats_tot_' + name)

    mean_nodes = totals['# nodes_mean']
    mean_degree = totals['mean degree_mean']
    # Edge probability handed to the G(n, p) generator.
    probability = mean_degree / (2 * mean_nodes)

    random_net = nx.directed_gnp_random_graph(int(mean_nodes), probability)
    random_props = graph.network_properties(random_net)
    pp.pprint(random_props)
コード例 #5
0
def corpus_dependency_properties(dataset='air/problem_descriptions'):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_dependencies'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=True)
        giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k, v in p.iteritems():
            if i == 0: props[k] = []
            props[k].append(v)
        g = None  # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        props_total[key + '_mean'] = numpy.mean(props[key])
        props_total[key + '_std'] = numpy.std(props[key])

    data.pickle_to_file(
        giant,
        'output/properties/dependency/corpus_network_air_all_no_stop_words')
    data.pickle_to_file(
        props, 'output/properties/dependency/docs_air_all_no_stop_words')
    data.pickle_to_file(
        props_total,
        'output/properties/dependency/docs_air_all_no_stop_words_total')
コード例 #6
0
def corpus_properties(dataset, context):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_text'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    #~ giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        #~ giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k, v in p.iteritems():
            if i == 0: props[k] = []
            props[k].append(v)
        g = None  # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        print '   ', key
        props_total[key + '_mean'] = numpy.mean(props[key])
        props_total[key + '_std'] = numpy.std(props[key])

    data_name = dataset.replace('/', '.')
    #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name)
    data.pickle_to_file(props,
                        'output/properties/cooccurrence/stats_' + data_name)
    data.pickle_to_file(
        props_total, 'output/properties/cooccurrence/stats_tot_' + data_name)