forked from itzmeanjan/tgnize
-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.py
156 lines (128 loc) · 4.57 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/python3
from __future__ import annotations
from os import listdir
from os.path import join, abspath
from typing import List, Tuple
from model.chat import Chat
from model.message import Message
from model.event import Event
from re import compile as reg_compile
from time import time
try:
from bs4 import BeautifulSoup
from bs4.element import Tag
except ImportError as e:
print('[!]Module Unavailable : {}'.format(str(e)))
exit(1)
def handleEvent(tag: Tag, chat: Chat):
    """Record one service entry of the export as an Event on *chat*.

    The numeric event id is the tag's ``id`` attribute with the
    ``message`` prefix removed; the description is the stripped text of
    the tag's ``body details`` div.
    """
    event_id = int(tag.get('id').replace('message', ''))
    details = tag.find('div', attrs={'class': 'body details'})
    description = details.getText().strip()
    chat.push(Event(event_id, description))
def _replyTargetId(reply_to: Tag):
    """Return the id of the message a reply block links to, or None.

    The link target may be a bare number or an anchor that merely ends in
    the id (e.g. ``#go_to_message123``), so only the trailing digits of
    the ``href`` are used. A missing block, a block without a link, or a
    link without trailing digits all yield None.
    """
    if not reply_to or reply_to.a is None:
        return None
    found = reg_compile(r'([0-9]+)\s*$').search(reply_to.a.get('href') or '')
    return int(found.group(1)) if found else None


def handleMessage(tag: Tag, chat: Chat, prev_tag: Tag = None):
    """Record one message of the export as a Message on *chat*.

    tag      -- the message div being processed; id, text and timestamp
                are always read from it.
    chat     -- Chat collecting the parsed activities.
    prev_tag -- when given, the earlier message div that carries the
                sender name for *tag* (a continuation message has no
                ``from_name`` div of its own). When omitted, the sender
                name is read from *tag* itself.

    After pushing the Message, the sender's records on *chat* are updated
    with the raw sender string and the message id.
    """
    msg_id = int(tag.get('id').replace('message', ''))

    # Only the sender is borrowed from prev_tag; everything else
    # belongs to the message being processed.
    author_tag = prev_tag if prev_tag else tag
    fromUser = author_tag.find(
        'div', attrs={'class': 'from_name'}).getText().strip()
    sender = (chat.extractUserAndBotNameFromMessage(fromUser)
              if chat.isAViaBotMessage(fromUser)
              else (fromUser, None))

    txt = tag.find('div', attrs={'class': 'text'})
    text = txt.getText().strip() if txt else None
    sent_at = tag.find(
        'div', attrs={'class': 'pull_right date details'}).get('title')

    if prev_tag:
        # Continuation messages keep the original four-argument form,
        # leaving the reply target at Message's own default.
        chat.push(Message(msg_id, sender, text, sent_at))
    else:
        reply_to = tag.find('div', attrs={'class': 'reply_to_details'})
        chat.push(
            Message(msg_id, sender, text, sent_at, _replyTargetId(reply_to)))

    chat.updateUserRecords(fromUser, msg_id)
'''
Passes an extracted tag to the one handler
function that can process it, by treating the
activity either as a service event or as a message
'''
def routeToProperHandler(tag: Tag, prev_tag: Tag, chat: Chat) -> bool:
    """Dispatch *tag* to the event or message handler based on its classes.

    Returns True only for a ``message default clearfix`` tag, i.e. a
    message handled without *prev_tag*; service entries and every other
    kind of tag return False. Tags of any other class are handled as
    messages whose sender is looked up on *prev_tag*.
    """
    kind = tag.get('class')

    if kind == ['message', 'service']:
        handleEvent(tag, chat)
        return False

    if kind == ['message', 'default', 'clearfix']:
        handleMessage(tag, chat)
        return True

    # Anything else is processed with the previously seen sender-bearing tag.
    handleMessage(tag, chat, prev_tag=prev_tag)
    return False
'''
Extracts all activities that happened in a chat,
including messages sent and, in case of a group chat,
people joining/ leaving etc.
Returns a list of the matching tags.
'''
def getAllActivities(tree: BeautifulSoup) -> List[Tag]:
    """Return every activity div found in *tree*, in document order.

    A div is kept when its class list is exactly
    ``message default clearfix``, exactly
    ``message default clearfix joined``, or exactly ``message service``
    with an ``id`` of the form ``message<digits>``.

    The divs are collected in a single pass so that they stay in the
    order they appear in the document; callers that pair a message with
    the preceding sender-bearing message depend on that order.
    """
    reg = reg_compile(r'^(message[0-9]{1,})$')
    message_classes = (
        ['message', 'default', 'clearfix'],
        ['message', 'default', 'clearfix', 'joined'],
    )

    activities = []
    # Searching for the single class 'message' matches every div that has
    # it among its classes; the exact class list is checked below.
    for div in tree.findAll('div', attrs={'class': 'message'}):
        classes = div.get('class')
        if classes in message_classes:
            activities.append(div)
        elif classes == ['message', 'service'] \
                and reg.match(div.get('id') or ''):
            # A service div without a usable id is skipped, not an error.
            activities.append(div)
    return activities
'''
Reads the whole content ( html file content )
of the file at the requested path & returns it
'''
def getFileContent(targetPath: str, encoding: str = 'utf-8') -> str:
    """Return the full text of the file at *targetPath*.

    targetPath -- path of the file to read.
    encoding   -- text encoding used to decode the file. Defaults to
                  UTF-8 instead of the platform's locale encoding, so
                  the result does not depend on the machine running the
                  parser (assumes the exported HTML is UTF-8).

    Raises OSError if the file cannot be opened and UnicodeDecodeError
    if its bytes are not valid in *encoding*.
    """
    with open(targetPath, mode='r', encoding=encoding) as fd:
        return fd.read()
'''
Returns a collection of file paths
which are holding exported telegram chat
( i.e. group or private ), messages*html files,
present under the given directory ( e.g. ./data )
'''
def getChatFiles(targetPath: str) -> List[str]:
    """Return absolute paths of the chat export files in *targetPath*.

    A directory entry qualifies when its name starts with ``messages``
    and ends with ``html``. The paths are returned in export order:
    ``messages.html`` first, then ``messages2.html``, ``messages3.html``
    ... compared by their numeric suffix, so ``messages10.html`` follows
    ``messages9.html``. Qualifying names that do not follow that pattern
    come last, sorted by name. ``os.listdir`` gives no ordering guarantee
    of its own, hence the explicit sort.

    Raises OSError if *targetPath* cannot be listed.
    """
    numbered = reg_compile(r'^messages([0-9]*)\.html$')

    def exportOrder(name: str):
        found = numbered.match(name)
        if not found:
            return (1, 0, name)
        # The first file of an export carries no number at all.
        return (0, int(found.group(1) or 1), name)

    base = abspath(targetPath)
    names = [entry for entry in listdir(targetPath)
             if entry.startswith('messages') and entry.endswith('html')]
    return [join(base, name) for name in sorted(names, key=exportOrder)]
'''
Parses all exported telegram chat
files ( html files ) and builds chat object,
holding organized information regarding whole chat ( private/ group )
'''
def parseChat(targetPath: str) -> Chat:
    """Build a Chat from every export file found under *targetPath*.

    Files are processed in the order getChatFiles returns them, and the
    activities of each file in the order getAllActivities returns them.
    The most recent tag for which routeToProperHandler returned True is
    remembered (also across files) and handed to the router as the
    previous tag for the activities that follow.
    """
    chat = Chat()
    authored_tag = None

    for path in getChatFiles(targetPath):
        soup = BeautifulSoup(getFileContent(path), features='lxml')
        for activity in getAllActivities(soup):
            starts_new_author = routeToProperHandler(
                activity, authored_tag, chat)
            if starts_new_author:
                authored_tag = activity

    return chat
if __name__ == '__main__':
    # This module only provides helpers; running it directly just prints
    # a notice. SystemExit is raised explicitly because the `exit`
    # builtin is injected by the `site` module for interactive sessions
    # and is not guaranteed to exist in every interpreter start-up mode.
    print('[!]This module is designed to be used as a backend handler')
    raise SystemExit(0)