/
pleroma.py
92 lines (73 loc) · 2.75 KB
/
pleroma.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import sys

import shared
def find_user(username, domain_name):
    '''
    Returns user id (str).

    Requests https://{domain_name}/api/v1/accounts/{username} through
    shared.fetch, parses the response body with shared.load_json and returns
    the value stored under the 'id' key.

    Parameters:
        username (str): user name.
        domain_name (str): domain name.

    Returns:
        user_id (str): User id.

    In case of error:
        Returns -1 (int). A message is written to stderr when the response
        is not a JSON object or carries no 'id' field.
    '''
    url = f'https://{domain_name}/api/v1/accounts/{username}'
    response, _status_code = shared.fetch(url)

    # This module treats -1 in place of the response object as the failure
    # sentinel of shared.fetch. It has to be tested BEFORE reading `.text`:
    # an int has no such attribute, so the previous order crashed with
    # AttributeError instead of returning -1.
    if response == -1:
        return -1

    dict_data = shared.load_json(response.text)
    if not isinstance(dict_data, dict):
        sys.stderr.write('Error parsing JSON data.\n')
        return -1

    if 'id' not in dict_data:
        sys.stderr.write('No user id.\n')
        return -1
    return dict_data['id']
def scrape(url, domain_name, user_id):
    '''
    Returns list of posts (list) and URL for the next url to scrape (str) as a tuple.

    Each post in the returned list is a dict holding whichever of these keys
    were present in the API object: 'datetime' (copied from 'created_at'),
    'content' and 'url'. Entries of the response that are not dicts are
    skipped.

    Parameters:
        url (str): API URL.
        domain_name (str): domain name.
        user_id (str): User id.

    Returns:
        lst_out (list), url (str): List of posts and next url.
        The next url is an empty string when there is nothing further to
        fetch: the page was empty, or the last entry of the page has no
        usable 'id' to paginate from.

    In case of error:
        Returns -1, -1 (integers).
    '''
    # Example URLs:
    # See the most recent posts.
    # https://{domain_name}/api/v1/accounts/{user_id}/statuses?with_muted=true&limit=40&exclude_reblogs=true
    # {max_id}: Only posts older than this post's id will be shown.
    # https://{domain_name}/api/v1/accounts/{user_id}/statuses?max_id={max_id}&with_muted=true&limit=40&exclude_reblogs=true
    response, _status_code = shared.fetch(url)
    # -1 in place of the response object is treated as a failed fetch; check
    # it before touching `.text`.
    if response == -1:
        return -1, -1

    lst_posts = shared.load_json(response.text)
    if not isinstance(lst_posts, list):
        sys.stderr.write('Broken list of posts.\n')
        return -1, -1
    if not lst_posts:
        return lst_posts, ''

    # (key in the API object, key in the emitted dict), in output order.
    wanted_fields = (
        ('created_at', 'datetime'),
        ('content', 'content'),
        ('url', 'url'),
    )
    lst_out = [
        {
            out_key: post[api_key]
            for api_key, out_key in wanted_fields
            if api_key in post
        }
        for post in lst_posts
        if isinstance(post, dict)
    ]

    # Use the id of the last post to keep fetching older posts.
    last_post = lst_posts[-1]
    if isinstance(last_post, dict) and 'id' in last_post:
        max_id = last_post['id']
        next_url = f'https://{domain_name}/api/v1/accounts/{user_id}/statuses?max_id={max_id}&with_muted=true&limit=40&exclude_reblogs=true'
    else:
        # Without an id there is no cursor. Handing back the URL that was
        # just fetched would make a paginating caller request the same page
        # again and again, so signal "no more pages" the same way the
        # empty-page branch does.
        next_url = ''
    return lst_out, next_url