/
summarise.py
141 lines (108 loc) · 3.55 KB
/
summarise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
'''
Summarise URL(s) and other web resources.
'''
from newspaper import Article, Config
import sys
# Make the project-local ``config`` module importable before importing it.
CURR_PLATFORM = sys.platform
if CURR_PLATFORM == 'linux':
    pass  # fixme: for linux
else:
    # Raw string: '\D', '\P' and '\d' are not valid escape sequences, so the
    # plain literal only worked by accident and triggers a SyntaxWarning on
    # recent Python versions. The resulting path value is unchanged.
    sys.path.insert(0, r'U:\Documents\Project\demoapptwitter')
# Credentials (config.IBM, config.ALGORITHMIA) are read by the functions below.
import config
def get_top_img(url):
    '''
    Get just the top img from a URL.

    url -- address of the page to download and parse.

    Returns ``article.top_image`` after a successful download and parse.
    If either step raises, the url is printed and ``False`` is returned.
    '''
    article = Article(url)
    try:
        article.download()
        article.parse()
    except Exception:
        # Best-effort lookup: report which url failed and signal the failure
        # to the caller. ``Exception`` (not a bare except) lets Ctrl-C and
        # SystemExit propagate.
        print(url)
        return False
    return article.top_image
def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    '''
    Get url and return summary.

    url         -- address of the article to download.
    title       -- overwritten with ``article.title`` whatever value is
                   passed in.
    keywords    -- if truthy, replaced by ``article.keywords``.
    summary     -- if truthy, replaced by ``article.summary``.
    top_img_src -- if truthy, replaced by ``article.top_image``.

    Returns the tuple ``(title, keywords, summary, top_img_src)``. A flag
    that was falsy is returned unchanged. If ``nlp()`` raises, ``keywords``
    and ``summary`` keep whatever value they held at that point (possibly
    the flag that was passed in), so callers should check for a bool.

    Download/parse errors are not fatal: the url is printed and processing
    continues with whatever the Article object holds.
    '''
    # configuration for Newspaper to minimize processing time
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300
    configure.MAX_SUMMARY_SENT = 3
    # Hand the configuration to the Article; it used to be built and then
    # dropped, so none of the settings above had any effect.
    article = Article(url, config=configure)
    try:
        article.download()
        article.parse()
    except Exception:
        # Report the failing url but keep going (best-effort).
        print(url)
    title = article.title
    if keywords or summary:
        # nlp() is only needed for keywords and summary.
        try:
            article.nlp()
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except Exception:
            print('NEwspaper error with nlp() call')
    if top_img_src:
        top_img_src = article.top_image
    return title, keywords, summary, top_img_src
def summarise_watson_img(src, options=False):
    '''
    Classify an image through the IBM Bluemix (Watson) visual
    recognition service.

    src     -- image URL, passed to the service as ``images_url``.
    options -- accepted but not used by this function.

    The part of the response of interest is the default classifier:
    images.classifiers.classes

    The response is pretty-printed as JSON (debug aid) and then returned.
    '''
    import json
    from watson_developer_cloud import VisualRecognitionV3 as VisualRecognition

    recogniser = VisualRecognition(
        '2016-05-20',
        api_key=config.IBM['credentials']['api_key'],
    )
    response = recogniser.classify(images_url=src)
    # debug
    print(json.dumps(response, indent=2))
    return response
def summarise_img(src, options=False):
    '''
    Retrieve meta-data for an image web resource.
    Use algorithmia, openshift or similar cloud service.

    src     -- image reference, sent to the algorithm under the "image" key.
    options -- optional mapping of extra request fields (e.g. tags,
               threshold 0.3 etc); each pair is copied into the request.

    Returns whatever the Algorithmia ``pipe`` call returns.
    '''
    import Algorithmia

    tagger = Algorithmia.client(config.ALGORITHMIA['api_key']).algo(
        'deeplearning/IllustrationTagger/0.2.3')
    payload = {"image": src}
    if options:
        # tags (optional) required probs, e.g. threshold 0.3 etc
        payload.update(options.items())
    return tagger.pipe(payload)
def get_vec_img():
    '''
    Placeholder: does nothing yet and returns None.
    '''
    # Author's notes on the intended query:
    #   keywords = db.rawtweets.find_one({'img':{'$exists':True}},
    #                                    {'img.keywords':1, '_id':0})
    # which retrieves data in this format:
    #   { "img" : { "keywords" : [ { "photo" : 0.8888001441955565 },
    #                              { "formal" : 0.651865303516388 },
    return None
# for testing use:
if __name__ == '__main__':
    from urllib.parse import urlparse

    url = input('Enter URL: ').strip()
    # Decide image vs. article from the extension of the URL *path*, so that
    # upper-case extensions, '.jpeg' and query strings ('pic.jpg?w=100') are
    # recognised, and a path that merely ends in the letters 'jpg' is not.
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif')
    is_image = urlparse(url).path.lower().endswith(image_suffixes)
    if not is_image:
        output = summarise_one(url, False, False, True, True)
        title, keywords, summary, top_img_src = output
        print('title', title)
        print('keywords', keywords)
        # summarise_one can hand back the boolean flag instead of text
        # (e.g. when its nlp() step failed), so only print real summaries.
        if not isinstance(summary, bool):
            print('summary', summary.encode('utf-8'))
        print('top_img_src', top_img_src)
    else:
        # 'threshold':0.1,
        # options = {'tags':[]}
        # options = { 'tags':['tree','window','face','lips', 'sky']}
        output = summarise_watson_img(url)
        # print(output.result)