/
calculate-jaccard-similarity.py
97 lines (74 loc) · 3.17 KB
/
calculate-jaccard-similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/local/bin/python
import os
import json
import re
import cPickle
import numpy as np
import utils as tech
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Route all matplotlib text through LaTeX.
rcParams['text.usetex'] = True

# File modes, named so the open() calls below read more clearly.
READ = 'r'
WRITE = 'w'

# Data files are structured as:
#     ./data/<source>/<keyword>
base = os.path.join(os.getcwd(),'data')
sources = ['harrison','wikipedia','braunwald','twitter']

# One keyword per line in the file 'keywords'; spaces become underscores so each
# keyword can be used as a directory name under ./data/<source>/.
# The file is opened in a with-block so the handle is closed deterministically
# (the original left it for the garbage collector).
with open('keywords') as keyword_file:
    keywords = [line.replace(' ','_') for line in keyword_file.read().splitlines()]

# Cache of the cleansed corpus, written on the first run and reused afterwards.
CORPUS_FILENAME = 'corpus.json'
#---LOAD DATA
def _read_flat_directory(path):
    """Return the text of every non-directory entry in *path*, joined by single spaces.

    Sub-directories are skipped. Each file is opened in a with-block so its handle
    is closed as soon as it has been read.
    """
    chunks = []
    for filename in os.listdir(path):
        full_path = os.path.join(path,filename)
        if os.path.isdir(full_path):
            continue
        with open(full_path,READ) as handle:
            chunks.append(handle.read())
    return ' '.join(chunks)


# corpus[source][disease] -> list of the tokens produced by tech.cleanse for all
# text found under ./data/<source>/<disease>. Built once and cached as JSON.
if not os.path.isfile(CORPUS_FILENAME):
    #More expressive than itertools.product, small loops --> no important speed or memory difference
    corpus = {}
    for source in sources:
        corpus[source] = {}
        for disease in keywords:
            path = os.path.join(base,source,disease)
            text = _read_flat_directory(path)
            # Periods and newlines become spaces before cleansing.
            text = text.replace('.',' ').replace("\n"," ")
            text = re.sub(r"[^\x00-\x7F]","",text) #Regexp faster than iterating through string to remove non-ASCII
            #Cleanse returns type set. Type set is not JSON serializable. Type list is.
            corpus[source][disease] = list(tech.cleanse(text))
    # Close the cache file explicitly so the JSON is fully flushed to disk.
    with open(CORPUS_FILENAME,WRITE) as handle:
        json.dump(corpus,handle)
else:
    with open(CORPUS_FILENAME,READ) as handle:
        corpus = json.load(handle)
#--- CALCULATE JACCARD SIMILARITY
# Square grid holding one copy of the source list per source. Nothing else in this
# script reads it; it is kept so the module-level name stays available.
source_rubric = [list(sources) for _ in sources]

# Output names are defined once so the cache check and the writers cannot drift apart.
# Previously the check looked for 'jaccard-similarity-<disease>' while the figures were
# saved as 'jaccard-similarity-<disease>-w-twitter' (matplotlib then appends its default
# extension), so the check could never succeed and everything was recomputed each run.
# The extension is now explicit; '.png' is matplotlib's default savefig format, so the
# files written are the same unless savefig.format has been overridden.
JACCARD_FIGURE_TEMPLATE = 'jaccard-similarity-%s-w-twitter.png'
# NOTE(review): despite the '.json' suffix this file is written with cPickle, not json.
# The name is kept because other code may already load it under this name.
JACCARD_MATRICES_FILENAME = 'jaccard-similarities.json'

filenames = [JACCARD_FIGURE_TEMPLATE%disease for disease in keywords]
filenames += [JACCARD_MATRICES_FILENAME]

if not all(os.path.isfile(filename) for filename in filenames):
    # disease -> len(sources) x len(sources) array of pairwise similarities.
    jaccard_matrices = {}
    for disease in keywords:
        # Rows follow the outer loop and columns the inner loop; the argument order
        # passed to tech.jaccard matches the original (column source first).
        jaccard_matrices[disease] = np.array(
            [[tech.jaccard(corpus[column_source][disease],corpus[row_source][disease])
              for column_source in sources]
             for row_source in sources])

        # One heat map per disease, colour scale pinned to [0, 1].
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.imshow(jaccard_matrices[disease],interpolation='nearest',aspect='equal',vmin=0,vmax=1)
        ax.set_xticks(range(len(sources)))
        ax.set_yticks(range(len(sources)))
        ax.set_xticklabels([tech.format(name) for name in sources])
        ax.set_yticklabels([tech.format(name) for name in sources])
        cbar = plt.colorbar(cax)
        cbar.set_label(tech.format('Jaccard Similarity'))
        fig.tight_layout()
        fig.savefig(JACCARD_FIGURE_TEMPLATE%disease)
        # Release the figure; otherwise one figure per keyword stays alive in pyplot.
        plt.close(fig)

    # Written once, after every matrix has been computed.
    with open(JACCARD_MATRICES_FILENAME,WRITE) as handle:
        cPickle.dump(jaccard_matrices,handle)
#--- BOOTSTRAPPING
# Partition sizes for the resampling: the number of corpus entries each source has
# for one reference disease.
# The original read the name `disease`, which was only defined because Python 2
# leaks loop / list-comprehension variables from the section above; that leftover
# value is the last keyword, so it is now selected explicitly.
# NOTE(review): confirm that sizing the partitions from the last keyword alone
# (rather than from every disease) is what is intended.
reference_disease = keywords[-1]
lens = [len(corpus[source][reference_disease]) for source in sources] #Does order matter?

# Every corpus entry of every disease and source, joined into one space-separated string.
#N.B. Don't deduplicate -- must preserve original word frequencies for resampling
# NOTE(review): the corpus entries were stored as list(tech.cleanse(text)) and the
# comment there says cleanse returns a set, so words are presumably already unique
# within a single source/disease -- verify that this matches the intent above.
amalgamated_corpus = ' '.join(' '.join(corpus[source][disease])
                              for disease in keywords
                              for source in sources)

# Presumably the distribution of Jaccard similarities over random repartitions of
# the pooled corpus -- see tech.resample for the exact procedure.
jaccard_distributions = tech.resample(amalgamated_corpus,n_partitions=len(lens),partition_sizes=lens,
                                      repetitions=10000,monitor=True,save=True)

# Histogram of the resampled similarities.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(jaccard_distributions,color='k')
tech.adjust_spines(ax)
ax.set_xlabel(tech.format('Jaccard Similarity'))
# NOTE(review): the label text is misspelled ('occurence'); left as-is so the
# rendered figure is unchanged.
ax.set_ylabel(tech.format('No. of occurence'))
plt.tight_layout()
plt.savefig('distribution-jaccard-similarities-w-twitter.tiff')