forked from madwind/flexget_qbittorrent_mod
/
html_rss.py
123 lines (112 loc) · 4.39 KB
/
html_rss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from urllib.parse import urljoin
import chardet
from flexget import plugin
from flexget.entry import Entry
from flexget.event import event
from flexget.utils.soup import get_soup
from loguru import logger
from requests import RequestException
try:
import brotli
except ImportError:
brotli = None
class PluginHtmlRss:
    """FlexGet input plugin that turns an HTML listing page into entries.

    The page at ``url`` is downloaded, every element matching
    ``root_element_selector`` is treated as one candidate entry, and each
    configured field is read from a sub-element of it using a CSS selector
    plus an attribute name (``textContent`` means the element's text).
    """

    # Config schema exposed to FlexGet; only ``url`` is mandatory at the top
    # level, and ``fields`` (when given) must contain ``title`` and ``url``.
    schema = {
        'type': 'object',
        'properties': {
            'url': {'type': 'string', 'format': 'url'},
            'headers': {
                'type': 'object',
                'properties': {
                    'cookie': {'type': 'string'},
                    'user-agent': {'type': 'string'},
                },
            },
            'params': {'type': 'string'},
            'root_element_selector': {'type': 'string'},
            'fields': {
                'type': 'object',
                'properties': {
                    'title': {
                        'type': 'object',
                        'properties': {
                            'element_selector': {'type': 'string'},
                            'attribute': {'type': 'string'},
                        },
                    },
                    'url': {
                        'type': 'object',
                        'properties': {
                            'element_selector': {'type': 'string'},
                            'attribute': {'type': 'string'},
                        },
                    },
                },
                'required': ['title', 'url'],
            },
        },
        'required': ['url'],
        'additionalProperties': False,
    }

    def prepare_config(self, config):
        """Fill missing optional keys of ``config`` in place and return it."""
        config.setdefault('url', '')
        config.setdefault('headers', {})
        config.setdefault('params', '')
        config.setdefault('root_element_selector', '')
        config.setdefault('fields', {})
        return config

    def on_task_input(self, task, config):
        """Download the configured page and build one entry per matched element.

        :param task: object providing ``requests.get`` and ``name``.
        :param config: plugin configuration matching ``schema``.
        :return: list of entries that have both a non-empty ``title`` and
            ``url``; empty when ``url`` or ``root_element_selector`` is unset
            or nothing matches.
        :raises plugin.PluginError: when the download fails with a
            ``RequestException``.
        """
        config = self.prepare_config(config)
        url = config.get('url')
        root_element_selector = config.get('root_element_selector')
        fields = config.get('fields')
        params = config.get('params')

        entries = []
        if not (url and root_element_selector):
            return entries

        # Work on a copy so the advertised encodings are not written back
        # into the user's configuration.
        headers = dict(config.get('headers') or {})
        if brotli:
            headers['accept-encoding'] = 'gzip, deflate, br'

        try:
            response = task.requests.get(url, headers=headers, timeout=60)
            content = self._decode(response)
        except RequestException as e:
            raise plugin.PluginError(
                'Unable to download the Html for task {} ({}): {}'.format(task.name, url, e)
            ) from e

        elements = get_soup(content).select(root_element_selector)
        if not elements:
            return entries

        for element in elements:
            logger.debug('element in element_selector: {}', element)
            entry = Entry()
            for key, value in fields.items():
                entry[key] = self._extract_field(element, value)
                logger.debug('key: {}, value: {}', key, entry[key])
            # ``get`` instead of indexing: ``fields`` may lack title/url when
            # the config omitted it, which must not abort the whole task.
            if entry.get('title') and entry.get('url'):
                entry['url'] = self._build_url(url, entry['url'], params)
                entry['original_url'] = entry['url']
                entries.append(entry)
        return entries

    @staticmethod
    def _extract_field(element, spec):
        """Return the value described by ``spec`` inside ``element``, or ``''``.

        ``spec`` may carry ``element_selector`` and ``attribute``. A missing
        selector yields ``''``; a missing attribute falls back to
        ``textContent`` (the element's text).
        """
        selector = spec.get('element_selector')
        if not selector:
            return ''
        sub_element = element.select_one(selector)
        if not sub_element:
            return ''
        attribute = spec.get('attribute', 'textContent')
        if attribute == 'textContent':
            return sub_element.get_text()
        return sub_element.get(attribute, '')

    @staticmethod
    def _build_url(page_url, href, params):
        """Resolve ``href`` against ``page_url`` and apply ``params``.

        ``params`` starting with ``&`` is appended verbatim; anything else is
        resolved with ``urljoin`` against the already-absolute link.
        """
        base_url = urljoin(page_url, href)
        if params.startswith('&'):
            return base_url + params
        return urljoin(base_url, params)

    def _decode(self, response):
        """Return the response body as text.

        Decompresses a ``br``-encoded body when the optional ``brotli``
        module is available, then decodes using the charset guessed by
        ``chardet`` (UTF-8 when no guess is available).
        """
        content = response.content
        content_encoding = response.headers.get('content-encoding')
        if content_encoding == 'br' and brotli is not None:
            try:
                content = brotli.decompress(content)
            except brotli.error:
                # NOTE(review): the HTTP stack may already have decoded the
                # body (urllib3 does so when a brotli module is installed --
                # confirm for the pinned version); keep the bytes as they are.
                pass
        charset_encoding = chardet.detect(content).get('encoding')
        if charset_encoding == 'ascii':
            # Pure-ASCII bodies are decoded with ``unicode_escape``;
            # presumably to expand ``\uXXXX`` sequences -- TODO confirm.
            charset_encoding = 'unicode_escape'
        try:
            return content.decode(charset_encoding or 'utf-8', 'ignore')
        except LookupError:
            # The guessed charset name is not a codec Python knows about.
            return content.decode('utf-8', 'ignore')
@event('plugin.register')
def register_plugin():
    """Register ``PluginHtmlRss`` under the name ``html_rss`` (API version 2).

    Hooked to the ``plugin.register`` event via the decorator.
    """
    plugin.register(PluginHtmlRss, 'html_rss', api_ver=2)