forked from benjaminaschultz/shakespeare
/
shakespeare.py
executable file
·266 lines (217 loc) · 11.3 KB
/
shakespeare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python
import re, glob, sys,os
import argparse
import itertools as it
import numpy as np
import scipy.sparse
import cPickle as pickle
from sklearn.naive_bayes import MultinomialNB
from content_sources import arxiv, bibtex, rss
def find_keywords(text):
    """Return the upper-cased keywords of *text*, minus punctuation and prepositions.

    Punctuation characters are deleted outright (not replaced by a space), the
    result is upper-cased, and every whole-word occurrence of a preposition
    listed in ``data/prepositions.dat`` is replaced by a space. The path is
    relative to the current working directory and the file is re-read on
    every call.

    Args:
        text: the string to extract keywords from.

    Returns:
        The whitespace-separated tokens that remain, after non-ASCII
        characters have been dropped with ``encode('ascii', 'ignore')``.

    Raises:
        IOError: if ``data/prepositions.dat`` cannot be opened.
    """
    # Same character class as before, written as a raw string so the regex
    # escapes are not (invalid) string escapes.
    keywords = re.sub(r'''[{}:?!@#$%^&*\(\)_.\/,'"]''', '', text).upper()

    # Close the file deterministically instead of leaking the handle.
    with open('data/prepositions.dat') as handle:
        prepositions = handle.read().upper().split()

    for preposition in prepositions:
        # Escape the word so that an entry containing regex metacharacters is
        # matched literally rather than interpreted as a pattern.
        pattern = r"\b{!s}\b".format(re.escape(preposition))
        keywords = re.sub(pattern, ' ', keywords)

    return keywords.encode('ascii', 'ignore').split()
def filter_content(content,
                   method,
                   naive_bayes,
                   keywords):
    """Return the entries of *content* that the classifier labels ``'good'``.

    Each entry is reduced to its keyword list via ``find_keywords`` applied to
    ``entry[method]``. A (samples x keywords) count matrix is then built and
    handed to ``naive_bayes.predict``.

    Args:
        content: sequence of entries, each subscriptable by *method*.
        method: key of the entry field to classify on.
        naive_bayes: object exposing ``predict(X)`` that returns one label per
            row (presumably the MultinomialNB produced by ``train``).
        keywords: ordered keyword list; column ``j`` of the matrix counts
            occurrences of ``keywords[j]``.

    Returns:
        A list of the entries whose predicted label equals ``'good'``, in
        their original order. An empty *content* yields an empty list without
        touching the classifier.
    """
    from collections import Counter

    # Nothing to classify: avoid calling predict() on a zero-row matrix.
    if not content:
        return []

    new_samples = [find_keywords(entry[method]) for entry in content]

    # Compute the keyword-count vector for each new entry.
    X = scipy.sparse.lil_matrix((len(new_samples), len(keywords)))
    for i, sample in enumerate(new_samples):
        # Count each sample once instead of rescanning it for every keyword.
        counts = Counter(sample)
        for j, kw in enumerate(keywords):
            occurrences = counts.get(kw, 0)
            if occurrences:
                # Zero cells are the sparse matrix's default; only store hits.
                X[i, j] = occurrences

    categories = naive_bayes.predict(X)
    return [e for c, e in zip(categories, content) if c == 'good']
def get_content(sources):
    """Gather content from all sources (BibTex files, arXiv, journal RSS feeds, etc).

    For every source, ``fetch()`` is called and then ``parse()``. A failure in
    either step is reported on stdout and does not abort the run. ``parse()``
    is still attempted after a failed ``fetch()``.

    Args:
        sources: iterable of objects exposing ``fetch()`` and ``parse()``.

    Returns:
        A single list with the entries of every source whose ``parse()``
        returned a non-empty result, in source order.
    """
    all_content = []
    for src in sources:
        try:
            src.fetch()
        except Exception:
            # Best effort per source, but let KeyboardInterrupt/SystemExit
            # through so the user can still abort a long fetch.
            print("Fetch of content from {!r} has failed".format(src))

        content = None
        try:
            print('parsing {!r}'.format(src))
            content = src.parse()
        except Exception:
            print("parsing of content from {!r} has failed".format(src))

        if content:
            all_content.extend(content)
    return all_content
def review_content(good_content, content, method, review_all=False, ask=None):
    """Ask a human to confirm or reject the classification of entries.

    You can review all the content, or only the entries the classifier thought
    were good. The answers are used by the caller to retrain the classifier.

    Args:
        good_content: entries the classifier labelled good.
        content: every entry that was classified.
        method: key of the entry field shown to the reviewer.
        review_all: when true review *content*, otherwise *good_content*.
        ask: optional callable taking the prompt string and returning the
            reviewer's answer. Defaults to the interactive ``raw_input``.

    Returns:
        A ``(human_class, to_review)`` tuple: a list of ``'good'``/``'bad'``
        labels and the list of entries they refer to, in the same order.
    """
    if ask is None:
        # Python 2 builtin, consistent with the rest of this script.
        ask = raw_input

    to_review = content if review_all else good_content

    human_class = []
    for entry in to_review:
        print("Is \"{}\" a good entry?".format(entry[method].encode('ascii','ignore')))
        # Tolerate stray whitespace and a spelled-out "yes"; previously such
        # answers were silently recorded as 'bad'.
        decision = ask('Y/n?').strip().lower()
        # NOTE(review): an empty answer still counts as 'bad' even though the
        # prompt's capital Y suggests a "yes" default - confirm intent.
        human_class.append('good' if decision in ('y', 'yes') else 'bad')
    return human_class, to_review
def load_knowledge(knowledge):
    """Load a trained naive_bayes object and its keyword list.

    Args:
        knowledge: path of the directory holding ``nb.p`` (pickled classifier)
            and ``kw.p`` (pickled keyword list), or ``None`` to fall back to
            ``~/.shakespeare``.

    Returns:
        A ``(nb, kw, knowledge)`` tuple. ``nb`` is ``None`` and ``kw`` is an
        empty list when the default directory holds no pickles yet.
        ``knowledge`` is the directory that was actually used.

    Raises:
        SystemExit: with status 1 when an explicitly given *knowledge* is not
            a directory or lacks one of the two pickle files.
    """
    # Existing naive_bayes object and keyword list.
    nb = None
    kw = list()

    if knowledge is not None:
        if not os.path.isdir(knowledge):
            print("Knowledge must be a directory")
            sys.exit(1)
        # Check for the exact files that are opened below; a suffix match
        # would accept e.g. "foonb.p" and then fail on open().
        if not os.path.isfile(os.path.join(knowledge, 'nb.p')):
            print("Knowledge does not contain nb.p (pickled naive bayes object)")
            sys.exit(1)
        if not os.path.isfile(os.path.join(knowledge, 'kw.p')):
            print("Knowledge does not contain kw.p (pickled keyword list)")
            sys.exit(1)
    else:
        knowledge = os.path.expanduser('~/.shakespeare')

    nb_path = os.path.join(knowledge, 'nb.p')
    kw_path = os.path.join(knowledge, 'kw.p')
    if os.path.isfile(nb_path) and os.path.isfile(kw_path):
        # SECURITY: unpickling executes arbitrary code; only point this at
        # knowledge directories you created yourself.
        # Pickle streams are binary data, so open in 'rb' and close promptly.
        with open(nb_path, 'rb') as handle:
            nb = pickle.load(handle)
        with open(kw_path, 'rb') as handle:
            kw = pickle.load(handle)

    return (nb, kw, knowledge)
def train(good_sources, bad_sources, method, naive_bayes=None, keywords=None):
    """Train a naive_bayes object on a data set.

    The ``entry[method]`` text of all good entries is joined and turned into
    one keyword list via ``find_keywords``, and likewise for the bad entries.
    A 2 x n_keywords count matrix (row 0 = good, row 1 = bad) is fitted with a
    fresh ``MultinomialNB``. When an existing classifier is supplied, its
    accumulated per-class counts are carried over and the keyword list is
    extended with any keywords not seen before.

    Args:
        good_sources: entries considered relevant, subscriptable by *method*.
        bad_sources: entries considered irrelevant, subscriptable by *method*.
        method: key of the entry field to train on.
        naive_bayes: previously trained classifier to extend, or ``None``.
        keywords: keyword list matching the columns of
            ``naive_bayes.feature_count_``; ``None`` means no prior keywords.

    Returns:
        A ``(naive_bayes, all_kw)`` tuple: the newly fitted classifier and the
        keyword list defining its feature columns.

    Raises:
        ValueError: if *keywords* does not have one entry per column of
            ``naive_bayes.feature_count_``.
    """
    from collections import Counter

    keywords = [] if keywords is None else list(keywords)

    good_samples = find_keywords(' '.join([entry[method] for entry in good_sources]))
    bad_samples = find_keywords(' '.join([entry[method] for entry in bad_sources]))

    # One pass per sample list instead of list.count() for every keyword.
    good_counts = Counter(good_samples)
    bad_counts = Counter(bad_samples)
    sample_kws = set(good_counts) | set(bad_counts)

    # If we have an existing knowledge base to append this new information
    # to, do so.
    if naive_bayes is not None:
        print('Using old keywords as well')
        print("# old keywords = {}\n # new keywords = {}".format(len(keywords),len(sample_kws)))
        # Sorted so the column order of newly added keywords is reproducible.
        fresh_kws = sorted(sample_kws.difference(keywords))
        print("# fresh keywords = {}\n".format(len(fresh_kws)))

        old_counts = naive_bayes.feature_count_
        if old_counts.shape[1] != len(keywords):
            raise ValueError("keyword list does not match the classifier's feature count")

        all_kw = keywords + fresh_kws
        X = np.zeros((2, len(all_kw)))
        # feature_count_ rows follow naive_bayes.classes_, which is not
        # necessarily ['good', 'bad']. Copy each class's history into the row
        # that is labelled with that class below; copying rows positionally
        # would swap the accumulated good/bad counts.
        for row, label in enumerate(naive_bayes.classes_):
            if label == 'good':
                X[0, :len(keywords)] = old_counts[row]
            elif label == 'bad':
                X[1, :len(keywords)] = old_counts[row]
        # NOTE(review): the original carried a reminder to look into
        # naive_bayes.partial_fit here; left as a possible follow-up.
    else:
        print('Only using keywords from this content set')
        all_kw = sorted(sample_kws)
        X = np.zeros((2, len(all_kw)))

    for j, kw in enumerate(all_kw):
        X[0, j] += good_counts[kw]
        X[1, j] += bad_counts[kw]

    y = ['good', 'bad']
    naive_bayes = MultinomialNB()
    naive_bayes.fit(X, y)
    return naive_bayes, all_kw
def to_markdown(content, output_file):
    """Export content to a simple markdown file.

    Writes a ``# Relevant articles`` heading followed by one section per
    article holding its title, authors, abstract and link. Newlines inside a
    field are replaced by spaces and non-ASCII characters are dropped.

    Args:
        content: iterable of articles, each providing the keys ``'title'``,
            ``'author'``, ``'abstract'`` and ``'url'``.
        output_file: path of the markdown file to (over)write.

    Returns:
        None. Failures are reported on stdout rather than raised, so a bad
        output path does not abort the caller.
    """
    def _flatten(value):
        # Single-line, ASCII-only text. Decoding back keeps the value a text
        # string so it is formatted as-is instead of as a bytes repr.
        return re.sub(r'\n', ' ', value).encode('ascii', 'ignore').decode('ascii')

    try:
        with open(output_file, 'w') as outf:
            outf.write('# Relevant articles\n')
            for article in content:
                outf.write("## {}\n".format(_flatten(article['title'])))
                outf.write("* authors: {}\n".format(_flatten(article['author'])))
                outf.write("* abstract: {}\n".format(_flatten(article['abstract'])))
                outf.write("* [link]({})\n\n".format(_flatten(article['url'])))
    except Exception as err:
        # Still best effort, but say why, and do not trap KeyboardInterrupt.
        print("Failed to write markdown file: {}".format(err))
def _collect_sources(arxiv_cats, bibfiles, journals):
    """Build the source objects for the given arXiv categories, BibTex files and journals."""
    return ([arxiv.ArXiv(cat) for cat in arxiv_cats] +
            [bibtex.BibTex(bibfile) for bibfile in bibfiles] +
            [rss.JournalFeed(journal) for journal in journals])


def _save_knowledge(naive_bayes, keywords, knowledge):
    """Pickle the classifier and keyword list as nb.p / kw.p inside *knowledge*."""
    # The default ~/.shakespeare directory may not exist on a first run.
    if not os.path.isdir(knowledge):
        os.makedirs(knowledge)
    # Pickle streams are binary data; 'with' guarantees they are flushed.
    with open(os.path.join(knowledge, 'nb.p'), 'wb') as handle:
        pickle.dump(naive_bayes, handle)
    with open(os.path.join(knowledge, 'kw.p'), 'wb') as handle:
        pickle.dump(keywords, handle)


def main(argv):
    """Command-line entry point: train the classifier or filter new content.

    With ``--train`` the BibTex file given by ``--train_input_good`` supplies
    the good entries and every other selected source supplies bad entries.
    The resulting classifier and keyword list are pickled into the knowledge
    directory. Without ``--train`` the selected sources are fetched and
    filtered through the stored classifier, optionally written to markdown
    (``--output``) and optionally reviewed by the user (``--feedback``), in
    which case the classifier is retrained and saved.

    Args:
        argv: list of command-line arguments, without the program name.

    Raises:
        SystemExit: with status 1 on invalid arguments or missing knowledge
            (argparse itself also exits on malformed options).
    """
    # Command line options for sources, output prefs, database of "good" keywords.
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='output file name. only supports markdown right now.', dest='output', default=None)
    parser.add_argument('-b', '--bibtex', help='bibtex files to fetch', dest='bibfiles', nargs='*', default=list())
    parser.add_argument('-j', '--journals', help='journals to fetch. Currently supports {}.'.format(' '.join(rss.rss_feeds.keys())),
                        nargs='*', dest='journals', default=list())
    parser.add_argument('-a', '--arXiv', help='arXiv categories to fetch',
                        nargs='*', dest='arXiv', default=list())
    parser.add_argument('--all_sources', help='flag to search from all sources.', action='store_true')
    parser.add_argument('--all_good_sources', help='flag to search from good sources. Specfied in your config file.', action='store_true')
    parser.add_argument('--train', help='flag to train. All sources beside "--train-input-good" are treated as bad/irrelevant papers', action='store_true')
    parser.add_argument('-g', '--train_input_good', help='bibtex file containing relevant articles.', dest='good_source', default=None)
    parser.add_argument('-m', '--method', help='Methods to try to find relevent papers. Right now, only all, title, author, and abstract are valid fields',
                        dest='method', default='title')
    parser.add_argument('-k', '--knowledge',
                        help='path to database containing information about good and bad keywords. '
                             'If you are training, you must specifiy this, as it will be where your output is written ',
                        dest='knowledge', default=None)
    parser.add_argument('--overwrite-knowledge', help='flag to overwrite knowledge,if training', action='store_true', default=False, dest='overwrite_knowledge')
    parser.add_argument('--feedback', help='flag to give feedback after sorting content', action='store_true', default=False, dest='feedback')
    parser.add_argument('--review_all', help='review all the new selections. Otherwise, you will only review the good selections', action='store_true', default=False, dest='review_all')
    args = parser.parse_args(argv)

    # Four separate options; the previous list held the single string
    # 'author, all', so both 'author' and 'all' were rejected.
    # NOTE(review): whether entries actually carry an 'all' field depends on
    # the content_sources parsers - confirm.
    if args.method not in ('title', 'abstract', 'author', 'all'):
        print("Invalid method. Options are title, abstract, author, and all")
        sys.exit(1)
    method = args.method

    # Set up training if that's what we're doing.
    if args.train:
        # Check to make sure we have a good training input.
        if args.good_source is None:
            print("When training, you must specify one good source")
            sys.exit(1)
        if not os.path.exists(args.good_source):
            print("Specified training input does not exist")
            sys.exit(1)
        if not os.path.isfile(args.good_source):
            print("Specified training input is not a file")
            sys.exit(1)
        if os.path.splitext(args.good_source)[1] != '.bib':
            print("Training input must be in bibtex format")
            sys.exit(1)

        # Load the existing knowledge, unless asked to start over.
        nb, kw, knowledge = load_knowledge(args.knowledge)
        if args.overwrite_knowledge:
            nb = None
            kw = list()

        good_content = get_content([bibtex.BibTex(args.good_source)])
        if args.all_sources:
            bad_sources = _collect_sources(arxiv.arxiv_cats, [], rss.rss_feeds.keys())
        else:
            bad_sources = _collect_sources(args.arXiv, args.bibfiles, args.journals)
        bad_content = get_content(bad_sources)

        # Train, and write out knowledge (naive_bayes class and keywords).
        nb, kw = train(good_content, bad_content, method, naive_bayes=nb, keywords=kw)
        _save_knowledge(nb, kw, knowledge)

    # We are filtering new content through our existing knowledge.
    else:
        # Load the old knowledge.
        nb, kw, knowledge = load_knowledge(args.knowledge)
        if nb is None:
            # Without a classifier there is nothing to filter with.
            print("No trained knowledge found in {!r}; run with --train first".format(knowledge))
            sys.exit(1)

        if args.all_sources:
            sources = _collect_sources(arxiv.arxiv_cats, [], rss.rss_feeds.keys())
        elif args.all_good_sources:
            arxiv_cats = ['cond-mat', 'stat']
            journals = ['science', 'nature', 'small', 'prl', 'pnas',
                        'physreve', 'physrevx', 'acsnano',
                        'advmat', 'jchemphysb', 'natphys',
                        'natmat', 'natnano', 'langmuir']
            sources = _collect_sources(arxiv_cats, [], journals)
        else:
            sources = _collect_sources(args.arXiv, args.bibfiles, args.journals)

        new_content = get_content(sources)
        if new_content:
            good_content = filter_content(new_content, method, nb, kw)
            print("Fraction of good new content: {!r}".format(len(good_content)*1.0/len(new_content)))
        else:
            # Nothing fetched: skip classification and the 0/0 fraction.
            good_content = []
        print("total content parsed: {!r}".format(len(new_content)))

        if args.output:
            to_markdown(good_content, args.output)

        if args.feedback and new_content:
            human_class, reviewed_content = review_content(good_content, new_content, method, args.review_all)
            good_content = [entry for cat, entry in zip(human_class, reviewed_content) if cat == 'good']
            bad_content = [entry for cat, entry in zip(human_class, reviewed_content) if cat == 'bad']
            nb, kw = train(good_content, bad_content, method, naive_bayes=nb, keywords=kw)
            _save_knowledge(nb, kw, knowledge)
# Script entry point: hand the command-line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)