def split_punctuation(s: str):
    """Break ``s`` apart around each of its non-alphabetic characters.

    Spaces and carriage returns are removed first. Every remaining character
    for which ``str.isalpha()`` is false contributes the index pair
    ``(i, i + 1)``; the pairs are combined by ``concat`` and passed to
    ``split_at`` as cut positions.

    Returns whatever ``split_at`` produces (presumably the pieces of the
    cleaned string with each non-alphabetic character isolated -- confirm
    against ``split_at``).
    """
    assert isinstance(s, str)
    # One pass that deletes both ' ' and '\r'.
    stripped = s.translate({ord(' '): None, ord('\r'): None})
    cut_points = concat(
        (pos, pos + 1)
        for pos, char in enumerate(stripped)
        if not char.isalpha()
    )
    return split_at(stripped, *cut_points)
def text_to_words(raw_data_text: str) -> List[str]:
    """Turn raw text into a flat list of non-blank word tokens.

    The text is first cut on newlines and spaces via ``split_by``; each chunk
    is then cut again around its non-alphabetic characters; finally empty and
    whitespace-only tokens are discarded.
    """

    def split_punctuation(s: str):
        # Cut ``s`` around every character for which isalpha() is false,
        # after dropping spaces and carriage returns.
        assert isinstance(s, str)
        stripped = s.translate({ord(' '): None, ord('\r'): None})
        cut_points = concat(
            (pos, pos + 1)
            for pos, char in enumerate(stripped)
            if not char.isalpha()
        )
        return split_at(stripped, *cut_points)

    def is_valid(s: str):
        # Keep a token only if it is non-empty and not purely whitespace.
        if not s:
            return False
        return not s.isspace()

    # "Word" is used loosely here: a token is a run of values (the values
    # represented on keys) rather than strictly a run of letters.
    chunks = split_by(raw_data_text, '\n', ' ')
    tokens = concat(split_punctuation(chunk) for chunk in chunks)
    return list(filter(is_valid, tokens))
async def __parseForward(self, message, new_message):
    """Copy forwarding details from ``message`` onto ``new_message``.

    Does nothing when ``message.forward`` is ``None``. Otherwise the origin
    is recorded in ``new_message.forward`` / ``new_message.forwardId``:

    * from ``forward.chat`` (username and id, falling back to
      ``forward.channel_id`` when the chat id is ``None``),
    * else from ``forward.sender`` (combined first/last name and id),
    * else both fields are set to the string ``"Unknown"``.

    When ``forward.original_fwd`` is present, its ``channel_post`` and the
    forward's ``date`` are stored as well.
    """
    fwd = message.forward
    if fwd is None:
        return

    if fwd.chat is not None:
        new_message.forward = fwd.chat.username
        new_message.forwardId = fwd.chat.id
        # Fall back to the channel id when the chat carries no id.
        if new_message.forwardId is None:
            new_message.forwardId = fwd.channel_id
    elif fwd.sender is not None:
        origin = fwd.sender
        new_message.forward = concat(origin.first_name, origin.last_name)
        new_message.forwardId = origin.id
    else:
        new_message.forward = new_message.forwardId = "Unknown"

    header = fwd.original_fwd
    if header is not None:
        new_message.forward_msg_id = header.channel_post
        new_message.forward_msg_date = fwd.date
def writeCsv(self):
    """Write the collected messages and users to two timestamped CSV files.

    Two files are created under ``self.path``:

    * ``chatlogs_<timestamp>.csv`` -- a header row, then one row per message
      in ``self.messages``, each followed directly by the rows of that
      message's comments.
    * ``users<timestamp>.csv`` -- a header row, then one row per entry in
      ``self.users``.

    As a side effect, ``urls`` is (re)assigned on every message and comment
    from ``extractUrls`` before its row is written.

    Raises:
        LookupError: if ``self.messages`` is empty.
    """
    if not self.messages:
        raise LookupError(
            "Nothing to write. You have to execute 'scrape' method first.")

    # Take the timestamp once: two separate now() calls could fall on
    # different seconds and give the two files of one run mismatched names.
    timestamp = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
    chatlogs_csv = self.path + "/chatlogs_" + timestamp + ".csv"
    users_csv = self.path + "/users" + timestamp + ".csv"

    # Messages and their comments.
    with open(chatlogs_csv, "w", encoding="utf-8", newline='') as chat_file:
        writer = csv.writer(chat_file)
        writer.writerow(Message.getMessageHeader())
        for message in self.messages:
            message.urls = extractUrls(message)
            writer.writerow(
                message.getMessageRow(self.username, self.member_count,
                                      self.isBroadcastingChannel))
            for comment in message.comments:
                comment.urls = extractUrls(comment)
                writer.writerow(
                    comment.getMessageRow(self.username, self.member_count,
                                          self.isBroadcastingChannel))

    # User table. The file handle gets its own name so it no longer shadows
    # the path variable.
    with open(users_csv, "w", encoding="utf-8", newline='') as users_file:
        writer = csv.writer(users_file)
        writer.writerow(Message.getUserHeader())
        for user in self.users:
            writer.writerow([
                self.username,
                user.id,
                user.first_name,
                user.last_name,
                concat(user.first_name, user.last_name),
                user.phone,
                user.bot,
                user.verified,
                user.username,
            ])
if match is not None and match != fail: response = random.choice(responses) return sub_list(match, response) return None def compose_single_sentene(tree_words): results = [] for e in tree_words: if isinstance(e, list): results += compose_single_sentene(e) else: results.append(e) return results if __name__ == '__main__': while True: sentence = input('USER >>>') match = eliza(sentence) if match: print('{:>40}<<<'.format(concat(compose_single_sentene(match)))) else: print('sorry I don\'t know it') sentence = '三个人怎么读?' match = eliza(sentence) if '读' in match: cut(sentence) print(match)
async def parseMessage(self, message):
    """Convert one incoming ``message`` into a ``Message`` and store it.

    Steps, in order:

    1. Await ``wait()`` before doing anything else.
    2. Copy ids, sender details, dates, text, views and the media *type*
       from ``message`` onto a fresh ``Message``.
    3. Update ``self.member_count`` from ``message.chat.participants_count``.
    4. Record forwarding information via ``__parseForward``.
    5. If the media is a ``MessageMediaPhoto`` and the ``"media_download"``
       config entry is truthy, download it unless a ``.jpg`` already exists
       at the target path.
    6. For messages that have buttons and are not forwards, look for a
       comments.bot / comments.app button and scrape the comments behind it.
    7. Attach the comments and append the result to ``self.messages``.
    """
    # Wait to prevent getting blocked
    await wait()
    new_message = Message()
    new_message.id = message.id
    new_message.sender = message.sender_id
    # message.sender may lack name attributes (or be None); fall back to
    # empty names in that case.
    try:
        first_name = message.sender.first_name
        last_name = message.sender.last_name
    except AttributeError:
        first_name = ""
        last_name = ""
    new_message.sender_name = concat(first_name, last_name)
    # Username is optional: new_message.username is left untouched when the
    # sender has none.
    try:
        new_message.username = message.sender.username
    except AttributeError:
        pass
    new_message.replyToMessageId = message.reply_to_msg_id
    new_message.edit_date = message.edit_date
    new_message.entities = message.entities
    new_message.post_author = message.post_author
    new_message.timestamp = message.date
    new_message.text = message.text
    new_message.views = message.views
    # Only the class of the media object is kept, not the media itself.
    new_message.media = type(message.media)
    self.member_count = message.chat.participants_count
    # Saves the channel from which the message was forwarded.
    # An AttributeError raised while reading the forward data is ignored,
    # leaving whatever fields were already set.
    try:
        await self.__parseForward(message, new_message)
    except AttributeError:
        pass
    # Photo download, only when enabled in the channel configuration.
    if type(message.media) == telethon.types.MessageMediaPhoto and Channel.config.get(
            "media_download"):
        mediapath = self.path + "/media/" + str(new_message.id)
        # Skip the download when a .jpg for this message id already exists.
        if not os.path.exists(mediapath + ".jpg"):
            try:
                await message.download_media(mediapath)
            except telethon.errors.FloodWaitError as e:
                # Sleep for the requested time; the download is not retried
                # afterwards.
                logging.info("Waiting " + str(e.seconds) +
                             " seconds: FloodWaitError")
                await asyncio.sleep(e.seconds)
            except telethon.errors.RpcCallFailError:
                pass
            # NOTE(review): presumably a short pause between downloads --
            # confirm the intended nesting level of this sleep.
            await asyncio.sleep(1)
    # Detect which comment bot (if any) is attached to the message and run
    # the matching scraper, collecting the results in `comments`. The list
    # stays empty when no comment-bot button is found.
    comments = list()
    if message.buttons is not None and message.forward is None:
        buttons = message.buttons
        for button in buttons:
            button_url = None
            # 21 == len('https://comments.bot/'); only the URL prefix is
            # compared. Entries without a URL button leave button_url None.
            try:
                button_url = button[0].button.url[:21]
            except AttributeError:
                pass
            if button_url == 'https://comments.bot/':
                logging.info("---> Found comments.bot...")
                new_message.hasComments = True
                new_message.bot_url = button[0].button.url
                # Scraper failures are printed and otherwise ignored so that
                # the message itself is still stored.
                # Note: this scraper is called without `await`, unlike the
                # comments.app one below.
                try:
                    comments.extend(
                        Bots.scrapeCommentsBot(new_message.bot_url,
                                               self.users, message.id))
                except Exception:
                    traceback.print_exc()
            elif button[0].text[-8:] == 'comments':
                # A button whose label ends in "comments" is treated as a
                # comments.app button.
                logging.info("---> Found comments.app...")
                new_message.hasComments = True
                new_message.bot_url = button[0].button.url
                try:
                    commentsAppComments, commentsAppUsers = \
                        await Bots.scrapeCommentsApp(new_message.bot_url, message.id,
                                                     Channel.config.get("query_users"))
                    comments.extend(commentsAppComments)
                    self.users.extend(commentsAppUsers)
                except Exception:
                    traceback.print_exc()
    new_message.comments = comments
    self.messages.append(new_message)
def __parseCommentFromApp(comment, commentList, userList, count, messageId,
                          queryUser):
    """Parse one comment element and append it to ``commentList``.

    Args:
        comment: parsed markup element of a single comment; it is queried
            with ``find`` / ``find_all`` (presumably a BeautifulSoup tag --
            confirm against the caller).
        commentList: list of already parsed comments; used to resolve the
            quoted comment and extended with the new one.
        userList: list that receives the user returned by ``Bots.getEntity``
            when the lookup is performed and succeeds.
        count: running index of the comment; forms the id
            ``"<messageId>.<count>"``.
        messageId: id of the message the comment belongs to.
        queryUser: when truthy, the author is looked up through
            ``Bots.getEntity`` if an identifier could be extracted.

    Errors while reading the comment text are logged and swallowed. Errors
    while reading the author name or the timestamp propagate to the caller.
    """
    # Save comment Message
    new_comment = Message()
    new_comment.isComment = True
    new_comment.parent = messageId
    new_comment.id = str(messageId) + "." + str(count)

    # Get all text elements of a comment
    textList = None
    try:
        textList = comment.find_all('div', class_='bc-comment-text')
    except Exception:
        logging.info("This thread has no comments")

    # Save comment text and reply id
    try:
        # Two text elements: the first is the quoted comment, the second is
        # the comment's own text.
        if len(textList) > 1:
            try:
                new_comment.text = textList[1].text
                # Resolve the quoted comment by matching its text against the
                # comments parsed so far. When none matches, next() yields
                # None and `.id` raises the AttributeError handled below.
                new_comment.replyToMessageId = next(
                    (earlier for earlier in commentList
                     if earlier.text == textList[0].text), None).id
            except AttributeError:
                logging.info("Could not find quoted comment" +
                             ", MessageId: " + str(new_comment.id))
        # Find only comment text
        else:
            try:
                new_comment.text = textList[0].text
            except IndexError:
                logging.info("Could not find comment text" +
                             ", MessageId: " + str(messageId))
    except Exception:
        # Also reached when textList is still None (len(None) raises).
        traceback.print_exc()
        logging.info("An error occurred reading comment text" +
                     ", MessageId: " + str(messageId))

    # Display name as shown in the comment; used when the user cannot be
    # identified.
    new_comment.sender_name = comment.find(
        'span', class_='bc-comment-author-name').contents[0].contents[0]

    # The last path segment of the author link is the user identifier; any
    # failure to read it means the author is not identified.
    try:
        identifierName = comment.find(
            'span',
            class_='bc-comment-author-name').contents[0]['href'].rsplit(
                '/', 1)[-1]
    except Exception:
        identifierName = ""

    user = None
    # Query user if identifier name was found
    if identifierName != "" and queryUser:
        try:
            user = Bots.getEntity(identifierName)
            userList.append(user)
        except ValueError:
            # User for some reason not found.
            pass

    new_comment.username = identifierName
    if user is not None:
        new_comment.sender_name = concat(user.first_name, user.last_name)
        new_comment.sender = user.id
    else:
        # NOTE(review): this placeholder user is built but never stored or
        # returned -- confirm whether it was meant to go into userList.
        commentUser = telethon.types.User(0)
        commentUser.first_name = new_comment.sender_name

    new_comment.timestamp = comment.find('time')['datetime']
    commentList.append(new_comment)