"""Construct graph representations from text.
The module contains functions from creating networks based on text documents,
and for converting the networks into feature-vectors.
Feature vectors are created based on node centrality in the text networks.
The following text representations are supported:
:random:
Will create a network with all distinct terms in the provided document
as nodes. Edges are created at random between the nodes, based on provided
probabilities.
:co-occurrence:
Distinct terms in the document are used as nodes. Edges are created
between any terms that occurs closely together in the text.
:dependency:
Words as nodes. Edges represent dependencies extracted from the text
using the stanford dependency parser (see the 'stanford_parser' module).
The module makes heavy use of the :mod:`graph` module.
:Author: Kjetil Valle <kjetilva@stud.ntnu.no>"""
from random import random as rand
import numpy as np
import networkx as nx
import pickle
from scipy import sparse
import os.path
import math
import graph
import preprocess
import util
import data
######
##
## Create graph representations
##
######
def create_graphs(documents, graph_type='co-occurrence', verbose=False):
    """Create text networks of given type, using their default parameters"""
    graphs = []
    for i, text in enumerate(documents):
        if verbose and i%100==0:
            print ' ',i,'of',len(documents)
        if graph_type=='co-occurrence':
            g = construct_cooccurrence_network(text)
        elif graph_type=='dependency':
            g = construct_dependency_network(text)
        elif graph_type=='random':
            g = construct_random_network(text)
        else:
            raise ValueError("unrecognized graph type: "+graph_type)
        graphs.append(g)
    return graphs
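
# Illustrative usage sketch (not part of the original module): build default
# co-occurrence networks for two made-up documents. Assumes plain,
# unpreprocessed text input, which the default 'sentence' context expects.
def example_create_graphs():
    docs = ['The cat sat on the mat. The mat was flat.',
            'Dogs chase cats. Cats chase mice.']
    graphs = create_graphs(docs, graph_type='co-occurrence')
    for g in graphs:
        print len(g.nodes()), 'nodes,', len(g.edges()), 'edges'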
def _cooccurrence_preprocessing(doc, context, already_preprocessed):
    """Preprocess document as needed for co-occurrence network creation"""
    if context=='window':
        if already_preprocessed:
            doc = doc.split(' ')
        else:
            doc = preprocess.preprocess_text(doc)
    elif context=='sentence':
        doc = preprocess.tokenize_sentences(doc)
        for i, sentence in enumerate(doc):
            doc[i] = preprocess.preprocess_text(sentence)
    return doc
def _window_cooccurrence_matrix(doc, direction='undirected', window_size=2, verbose=False):
    """Create co-occurrence matrix for *doc* using a sliding context window"""
    term_list = np.array(list(set(doc)))
    A = sparse.lil_matrix((len(term_list), len(term_list)))
    for i, word in enumerate(doc):
        if verbose:
            if i < 10 or i%1000==0: print ' word ',str(i)+' of '+str(len(doc))
        # count co-occurrences between *word* and the *window_size* words after it
        context = doc[i+1:i+1+window_size]
        for context_word in context:
            if word == context_word: continue
            x = np.where(term_list==word)[0][0]
            y = np.where(term_list==context_word)[0][0]
            if direction == 'forward' or direction == 'undirected':
                A[x,y] += 1
            if direction == 'backward' or direction == 'undirected':
                A[y,x] += 1
    return A, term_list
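
# A minimal worked example (hypothetical input): with the token sequence
# ['a','b','a'] and window_size=1, each adjacent pair contributes one count
# in both directions, so the undirected matrix ends up with A[a,b] = A[b,a] = 2.
def example_window_matrix():
    A, terms = _window_cooccurrence_matrix(['a','b','a'], direction='undirected', window_size=1)
    print terms
    print A.todense()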
def _sentence_cooccurrence_matrix(doc, direction='undirected', verbose=False):
    """Create co-occurrence matrix for *doc* using sentence contexts"""
    term_list = np.array(list(set(util.flatten(doc))))
    A = sparse.lil_matrix((len(term_list), len(term_list)))
    for i, sentence in enumerate(doc):
        if verbose:
            if i < 10 or i%1000==0: print ' sentence ',str(i)+' of '+str(len(doc))
        for w, word in enumerate(sentence):
            for c, context_word in enumerate(sentence):
                if word == context_word: continue
                if direction=='forward' and w > c: continue
                if direction=='backward' and w < c: continue
                x = np.where(term_list==word)[0][0]
                y = np.where(term_list==context_word)[0][0]
                A[x,y] += 1
    return A, term_list
def construct_cooccurrence_network(doc, window_size=2, direction='undirected',
        context='sentence', already_preprocessed=False, orders=[],
        order_weights=[1.0,1.0,1.0], doc_id=None, verbose=False):
    """Construct co-occurrence network from text.

    *direction* must be 'forward', 'backward' or 'undirected', while *context*
    can be 'window' or 'sentence'.

    If *context* is 'window', *already_preprocessed* indicates whether *doc*
    has already been preprocessed. Sentence contexts require unpreprocessed *doc*s.

    Any value for *window_size* is ignored if *context* is 'sentence'.

    A DiGraph is created regardless of the direction parameter, but with
    'undirected', edges are created in both directions.
    """
    doc = _cooccurrence_preprocessing(doc, context, already_preprocessed)
    if context == 'sentence':
        matrix, term_list = _sentence_cooccurrence_matrix(doc, direction, verbose)
    elif context == 'window':
        matrix, term_list = _window_cooccurrence_matrix(doc, direction, window_size, verbose)
    g = nx.DiGraph()
    g.add_nodes_from(term_list)
    if len(orders)==0:
        graph.add_edges_from_matrix(g, matrix, term_list)
    else:
        # reuse cached higher-order matrices when a *doc_id* path is given
        if doc_id is not None and os.path.exists(doc_id):
            first, second, third = data.pickle_from_file(doc_id)
        else:
            first, second, third = _higher_order_matrix(matrix.todense())
            if doc_id is not None:
                data.pickle_to_file((first,second,third), doc_id)
        if 1 in orders:
            graph.add_edges_from_matrix(g, first, term_list, rel_weight=order_weights[0])
        if 2 in orders:
            graph.add_edges_from_matrix(g, second, term_list, rel_weight=order_weights[1])
        if 3 in orders:
            graph.add_edges_from_matrix(g, third, term_list, rel_weight=order_weights[2])
    return g
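
# Sketch of the higher-order option (illustrative only): include first- and
# second-order co-occurrence edges, down-weighting the second order. The
# 'window' context with already_preprocessed=True expects a space-separated,
# preprocessed string; the text here is made up.
def example_higher_order_network():
    doc = 'foo bar baz cake'
    g = construct_cooccurrence_network(doc, context='window', already_preprocessed=True,
                                       window_size=1, orders=[1,2], order_weights=[1.0,0.5,0.5])
    print g.edges(data=True)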
def _higher_order_matrix(matrix, do_clip=False):
    """Construct higher order matrices from first order *matrix*.

    Values of *matrix* are clipped to the range [0,1] depending on *do_clip*"""
    dim = matrix.shape[0]
    if do_clip: matrix = np.clip(matrix, 0, 1)
    first_order = util.fill_matrix_diagonal(matrix, 0.0)
    second_order = first_order**2
    third_order = first_order**3
    # discount third-order counts along existing first-order edges
    # (see test_higher_order for reference values); plain int avoids the
    # overflow risk of np.int8 for large degree sums
    col_sums = np.sum(first_order, axis=0)
    discount_matrix = np.matrix(np.zeros((dim,dim), int))
    for i in range(dim):
        for j in range(dim):
            if i!=j:
                discount_matrix[i,j] = col_sums[0,i] + col_sums[0,j] - 1
    third_order = third_order - np.multiply(discount_matrix, first_order)
    return first_order, second_order, third_order
def construct_random_network(doc, p=0.2):
    """Construct random network for use as a baseline.

    Create a random network based on *doc*, with words used as nodes.
    Edges are created between any given pair of nodes (a,b) with probability *p*.
    All edges have weight 1.0.
    """
    doc = preprocess.preprocess_text(doc)
    words = list(set(doc)) # list of unique words
    # create graph ('g' rather than 'graph', to avoid shadowing the graph module)
    g = nx.DiGraph()
    g.add_nodes_from(words)
    # add edges
    for word_a in g.nodes():
        for word_b in g.nodes():
            if word_a != word_b and rand() < p:
                _update_edge_weight(g, word_a, word_b)
    return g
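
# Baseline sketch (illustrative): with p=0.5, roughly half of all ordered
# word pairs in this made-up text receive an edge.
def example_random_network():
    g = construct_random_network('some short example text', p=0.5)
    print g.nodes()
    print len(g.edges())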
def construct_dependency_network(doc, weighted=False, direction='undirected',
        remove_stop_words=False, exclude=['agent','advcl','parataxis'],
        verbose=False, unpickle=True):
    """Construct a dependency network from *doc*.

    Creates a network from *doc* with distinct words used as nodes, and
    all dependency types defined by the Stanford parser, except those listed
    in *exclude*, used as edges.

    *direction* must be 'undirected', 'forward' or 'backward'.
    Forward direction means head-dependent, while backward gives dependent-head relations.
    """
    g = nx.DiGraph()
    if unpickle:
        deps = pickle.loads(doc)
        doc = None
    else:
        deps = doc
    for dep_type, dep in deps.iteritems():
        if verbose: print ' dep:',dep_type
        if dep_type in exclude:
            continue
        for tup in dep:
            head = preprocess.preprocess_token(tup[0][0], do_stop_word_removal=remove_stop_words)
            dependent = preprocess.preprocess_token(tup[1][0], do_stop_word_removal=remove_stop_words)
            if head is not None and dependent is not None:
                if direction=='forward':
                    _update_edge_weight(g, head, dependent, labels=[dep_type], inc_weight=weighted)
                else:
                    _update_edge_weight(g, dependent, head, labels=[dep_type], inc_weight=weighted)
    if direction=='undirected':
        # mirror every edge so the result is effectively undirected
        g = g.to_undirected().to_directed()
    return g
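
# Sketch of the expected dependency structure (inferred from the parsing loop
# above, so treat it as an assumption): a dict mapping dependency type to a
# list of ((head_token, index), (dependent_token, index)) pairs. With
# unpickle=False the dict is passed in directly instead of as a pickle string.
def example_dependency_network():
    deps = {'nsubj': [(('chased', 2), ('dog', 1))],
            'dobj': [(('chased', 2), ('cat', 4))]}
    g = construct_dependency_network(deps, unpickle=False)
    print g.edges()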
def _update_edge_weight(graph, node1, node2, labels=None, inc_weight=True):
    """Update or create weighted edge between two nodes"""
    if labels is None: labels = []
    if graph.has_edge(node1, node2):
        graph[node1][node2]['label'] += labels
        if inc_weight:
            graph[node1][node2]['weight'] += 1.0
    else:
        # copy *labels* so later '+=' updates cannot mutate a caller's list
        graph.add_edge(node1, node2, weight=1.0, label=list(labels))
def similarity_matrix_to_graph(distM):
    """Convert a similarity (distance) matrix to a weighted graph.

    Edge lengths are taken directly from *distM*; edge weights are the
    reciprocals of the lengths. Entries <= 0 yield no edge.

    :Author: Gleb Sizov <sizov@idi.ntnu.no>
    """
    G = nx.Graph()
    for node1 in range(len(distM)):
        G.add_node(node1)
        for node2 in range(node1 + 1, len(distM)):
            if distM[node1, node2] > 0:
                # compute each weight per edge, so zero entries of *distM*
                # are never divided by
                G.add_edge(node1, node2, len=distM[node1, node2], weight=1.0/distM[node1, node2])
    return G
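
# Illustrative call (assumes a symmetric matrix of positive off-diagonal
# distances, as the zero-check above implies): a distance of 2.0 becomes an
# edge with len=2.0 and weight=0.5.
def example_similarity_graph():
    distM = np.matrix([[0.0, 2.0],
                       [2.0, 0.0]])
    G = similarity_matrix_to_graph(distM)
    print G.edges(data=True)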
######
##
## Create vector representations
##
######
def graphs_to_vectors(graphs, metric, verbose=False):
    """Create centrality-based feature vectors from graph representations.

    Takes a list of graphs and returns a numpy nd-matrix of feature vectors,
    based on the provided *metric*.
    """
    all_tokens = graph.node_set(graphs)
    features = np.zeros((len(all_tokens), len(graphs)))
    for i, g in enumerate(graphs):
        if verbose and i%50==0: print str(i)+'/'+str(len(graphs))
        features[:,i] = graph_to_vector(g, metric, all_tokens)
    return features
def graph_to_vector(g, metric, all_tokens):
    """Create feature vector from a single graph.

    The list of *all_tokens* is used as the basis for the feature vector,
    and the value for each word in graph *g* is calculated according to *metric*.
    """
    cents = graph.centralities(g, metric)
    return [cents.get(token, 0.0) for token in all_tokens]
def graph_to_dict(g, metric, icc=None):
    """Return node values as a dictionary.

    If `icc` is provided, TC-ICC values are calculated; otherwise plain TC.
    """
    centralities = graph.centralities(g, metric)
    if icc:
        for term in centralities:
            try:
                centralities[term] = centralities[term] * icc[term]
            except KeyError as ke:
                # catch this to detect possible mismatch between icc and doc network
                # TODO: should be cleaned up once tc-icc eval exp is done
                if term not in centralities and term in icc:
                    print ' !', str(ke), 'found in icc but not in graph centralities'
                else:
                    print ' !', str(ke), 'found in graph centralities but not in icc'
                centralities[term] = 0.0
    return centralities
def dicts_to_vectors(dicts, explicit_keys=None, remove_stop_words=False):
    """Convert a list of dictionaries to feature vectors"""
    if not explicit_keys:
        node_set = set()
        for d in dicts:
            node_set.update(d.keys())
        all_tokens = list(node_set)
    else:
        all_tokens = explicit_keys
    if remove_stop_words:
        all_tokens = preprocess.remove_stop_words(all_tokens)
    features = np.zeros((len(all_tokens), len(dicts)))
    for i, d in enumerate(dicts):
        if i%100==0: print ' vector',str(i)+'/'+str(len(dicts))
        features[:,i] = [d.get(token, 0.0) for token in all_tokens]
    return features
def calculate_icc_dict(centralities):
    """Invert *centralities* into ICC weights: icc = 1/(1 + centrality)"""
    icc = {}
    for term in centralities:
        icc[term] = 1.0/(1.0 + centralities[term])
    return icc
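
# TC-ICC sketch (illustrative; the collection-level graph argument is a
# hypothetical input): derive ICC weights from centralities of a collection
# graph, then re-weight a document's term centralities (TC) via graph_to_dict.
def example_tc_icc(collection_graph, doc_graph, metric='PageRank'):
    icc = calculate_icc_dict(graph.centralities(collection_graph, metric))
    return graph_to_dict(doc_graph, metric, icc=icc)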
######
##
## Term weighting metrics
##
######
from graph import GraphMetrics
def get_metrics(weighted=None, exclude_flow=False):
    """Return list of graph node evaluation metrics.

    If *weighted* is not specified, or `None`, all metrics are returned.
    Otherwise metrics suited for (un)*weighted* networks are returned."""
    if weighted is None:
        metrics = graph.mapping.keys()
    elif weighted:
        metrics = [GraphMetrics.WEIGHTED_DEGREE, GraphMetrics.WEIGHTED_IN_DEGREE, GraphMetrics.WEIGHTED_OUT_DEGREE,
                   GraphMetrics.WEIGHTED_CLOSENESS, GraphMetrics.CURRENT_FLOW_CLOSENESS,
                   GraphMetrics.WEIGHTED_BETWEENNESS, GraphMetrics.CURRENT_FLOW_BETWEENNESS, GraphMetrics.WEIGHTED_LOAD,
                   GraphMetrics.EIGENVECTOR, GraphMetrics.PAGERANK, GraphMetrics.HITS_HUBS, GraphMetrics.HITS_AUTHORITIES]
    else:
        metrics = [GraphMetrics.DEGREE, GraphMetrics.IN_DEGREE, GraphMetrics.OUT_DEGREE,
                   GraphMetrics.CLOSENESS, GraphMetrics.CURRENT_FLOW_CLOSENESS,
                   GraphMetrics.BETWEENNESS, GraphMetrics.CURRENT_FLOW_BETWEENNESS, GraphMetrics.LOAD,
                   GraphMetrics.EIGENVECTOR, GraphMetrics.PAGERANK, GraphMetrics.HITS_HUBS, GraphMetrics.HITS_AUTHORITIES]
    if exclude_flow:
        metrics.remove(GraphMetrics.CURRENT_FLOW_BETWEENNESS)
        metrics.remove(GraphMetrics.CURRENT_FLOW_CLOSENESS)
    return metrics
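
# Illustrative sweep (assumes each metric name is accepted by
# graph.centralities, as graph_to_vector assumes): evaluate every unweighted,
# non-flow metric over the same list of graphs.
def example_metric_sweep(graphs):
    for metric in get_metrics(weighted=False, exclude_flow=True):
        features = graphs_to_vectors(graphs, metric)
        print metric, features.shape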
######
##
## Tests
##
######
def test_dependency_graph():
    (docs, labels) = data.read_files('../data/tasa/TASA900_dependencies')
    graphs = []
    for i, text in enumerate(docs):
        print i
        graphs.append(construct_dependency_network(text))
    g = graphs[0]
    print g.nodes()
    print g.edges()
    print '#graphs:', len(graphs)
    pos = nx.spring_layout(g)
    graph.draw_with_centrality(g, layout=pos)
def test_graph_to_dict():
    import pprint as pp
    g = nx.DiGraph()
    g.add_nodes_from(range(0,7))
    edge_list = [(0,1),(0,6),(0,5),(1,2),(1,6),(2,0),(2,1),(2,3),(3,4),(4,5),(4,6),(5,0),(5,3),(5,4)]
    g.add_edges_from(edge_list)
    pp.pprint(graph_to_dict(g,'PageRank'))
def test_co_occurrences():
    doc1 = data.read_file('../data/tasa/TASATest/Science/Agatha09.07.03.txt')
    doc2 = data.read_file('../data/tasa/TASATest_preprocessed/Science/Agatha09.07.03.txt')
    g0 = construct_cooccurrence_network(doc1, context='window', already_preprocessed=False)
    g1 = construct_cooccurrence_network(doc2, context='window', already_preprocessed=True)
    g2 = construct_cooccurrence_network(doc1, context='sentence', already_preprocessed=False)
    graphs = data.pickle_from_file('output/testdata/co-occurrence-graphs.pkl')
    assert(graph.equal(g0,graphs[0]))
    assert(graph.equal(g1,graphs[1]))
    assert(graph.equal(g2,graphs[2]))
    doc = data.read_file('output/testdata/higher.order.testdoc.preprocessed.txt')
    g1 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1])
    g12 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,2])
    g123 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,2,3])
    g13 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,3])
    assert(('foo','bar') in g1.edges())
    assert(('foo','baz') not in g1.edges())
    assert(('foo','cake') not in g1.edges())
    assert(('foo','bar') in g12.edges())
    assert(('foo','baz') in g12.edges())
    assert(('foo','cake') not in g12.edges())
    assert(('foo','bar') in g123.edges())
    assert(('foo','baz') in g123.edges())
    assert(('foo','cake') in g123.edges())
    assert(('foo','baz') not in g13.edges())
    print 'ok'
def test_higher_order():
    matrix = np.matrix([[3,1,1,2],
                        [1,2,0,2],
                        [2,0,2,1],
                        [2,2,1,3]])
    first, second, third = _higher_order_matrix(matrix)
    ref_1 = np.matrix([[0,1,1,1],
                       [1,0,0,1],
                       [1,0,0,1],
                       [1,1,1,0]])
    assert(np.equal(first,ref_1).all())
    ref_2 = np.matrix([[3,1,1,2],
                       [1,2,2,1],
                       [1,2,2,1],
                       [2,1,1,3]])
    assert(np.equal(second,ref_2).all())
    ref_3 = np.matrix([[4,1,1,0],
                       [1,2,2,1],
                       [1,2,2,1],
                       [0,1,1,4]])
    assert(np.equal(third,ref_3).all())
    print 'ok'
def run_tests():
    test_co_occurrences()
    test_higher_order()

if __name__=="__main__":
    run_tests()