Example #1
0
    def split_punctuation(s: str):
        """Cut ``s`` around every character that is not a letter.

        Spaces and carriage returns are removed first.  For each remaining
        character for which ``str.isalpha`` is false, the index pair
        ``(i, i + 1)`` is produced; the pairs are combined by ``concat`` and
        passed as positional arguments to ``split_at`` (both defined
        elsewhere), whose result is returned unchanged.
        """
        assert isinstance(s, str)
        # Drop the characters that must not survive into any piece.
        cleaned = s.replace(' ', '')
        cleaned = cleaned.replace('\r', '')
        # One (start, end) index pair per non-letter character.
        boundaries = concat(
            (pos, pos + 1)
            for pos, char in enumerate(cleaned)
            if not char.isalpha()
        )
        return split_at(cleaned, *boundaries)
Example #2
0
def text_to_words(raw_data_text: str) -> List[str]:
    """Break raw text into a list of non-blank "words".

    A "word" here is broader than a run of letters: it is any string of
    values (values represented on keys), so punctuation pieces count too.

    The text is first divided with ``split_by`` on newlines and spaces, every
    chunk is then cut around its non-letter characters, and finally empty or
    whitespace-only pieces are discarded.
    """

    def split_punctuation(s: str):
        """Cut ``s`` around every character for which ``isalpha`` is false."""
        assert isinstance(s, str)
        cleaned = s.replace(' ', '')
        cleaned = cleaned.replace('\r', '')
        # One (start, end) index pair per non-letter character.
        boundaries = concat(
            (pos, pos + 1)
            for pos, char in enumerate(cleaned)
            if not char.isalpha()
        )
        return split_at(cleaned, *boundaries)

    def is_valid(s: str):
        """Return True unless ``s`` is empty or consists only of whitespace."""
        return not (len(s) == 0 or s.isspace())

    chunks = split_by(raw_data_text, '\n', ' ')
    pieces = concat(split_punctuation(chunk) for chunk in chunks)
    return [piece for piece in pieces if is_valid(piece)]
Example #3
0
    async def __parseForward(self, message, new_message):
        """Copy the forward origin of ``message`` onto ``new_message``.

        Nothing is written when ``message.forward`` is ``None``.  Otherwise
        ``forward`` / ``forwardId`` are taken from the forwarding chat if there
        is one, else from the forwarding sender, else set to ``"Unknown"``.
        When the forward carries ``original_fwd``, the original post id and
        the forward date are stored as well.
        """
        origin = message.forward
        if origin is None:
            return

        if origin.chat is not None:
            chat = origin.chat
            new_message.forward = chat.username
            new_message.forwardId = chat.id
            # Fall back to the channel id when the chat id came back empty.
            if new_message.forwardId is None:
                new_message.forwardId = origin.channel_id
        elif origin.sender is not None:
            sender = origin.sender
            new_message.forward = concat(sender.first_name, sender.last_name)
            new_message.forwardId = sender.id
        else:
            new_message.forward = new_message.forwardId = "Unknown"

        original = origin.original_fwd
        if original is not None:
            new_message.forward_msg_id = original.channel_post
            new_message.forward_msg_date = origin.date
Example #4
0
    def writeCsv(self):
        """Write the collected messages, their comments and the users to CSV.

        Two files are created below ``self.path``:

        * ``chatlogs_<timestamp>.csv`` - one row per message, each followed
          by one row per comment of that message.
        * ``users<timestamp>.csv`` - one row per entry of ``self.users``.

        Both file names use the same timestamp (``%Y-%m-%d--%H-%M-%S``).
        As a side effect ``urls`` is set on every message and comment from
        ``extractUrls`` before its row is written.

        Raises:
            LookupError: if ``self.messages`` is empty.
        """
        if not self.messages:
            raise LookupError(
                "Nothing to write. You have to execute 'scrape' method first.")

        # Take the timestamp once: two separate now() calls could straddle a
        # second boundary and give the two files of one export different
        # stamps.
        timestamp = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
        chatlogs_csv = self.path + "/chatlogs_" + timestamp + ".csv"
        users_csv = self.path + "/users" + timestamp + ".csv"

        # WRITE MESSAGES AND COMMENTS
        with open(chatlogs_csv, "w", encoding="utf-8", newline='') as chat_file:
            writer = csv.writer(chat_file)
            writer.writerow(Message.getMessageHeader())
            for message in self.messages:
                message.urls = extractUrls(message)
                writer.writerow(
                    message.getMessageRow(self.username, self.member_count,
                                          self.isBroadcastingChannel))

                for comment in message.comments:
                    comment.urls = extractUrls(comment)
                    writer.writerow(
                        comment.getMessageRow(self.username, self.member_count,
                                              self.isBroadcastingChannel))

        # WRITE USERS
        # The handle gets its own name so the path variable is not rebound.
        with open(users_csv, "w", encoding="utf-8", newline='') as users_file:
            writer = csv.writer(users_file)
            writer.writerow(Message.getUserHeader())
            for user in self.users:
                writer.writerow([
                    self.username, user.id, user.first_name, user.last_name,
                    concat(user.first_name, user.last_name), user.phone,
                    user.bot, user.verified, user.username
                ])
        if match is not None and match != fail:
            response = random.choice(responses)
            return sub_list(match, response)
    return None


def compose_single_sentene(tree_words):
    """Flatten arbitrarily nested lists into one flat list.

    Every element of ``tree_words`` that is a ``list`` is expanded
    recursively; any other element (including tuples and strings) is kept
    as-is.  Order is preserved and a new list is returned.
    """
    flattened = []
    for node in tree_words:
        if not isinstance(node, list):
            flattened.append(node)
            continue
        # Nested list: splice in its flattened content.
        flattened.extend(compose_single_sentene(node))
    return flattened


if __name__ == '__main__':
    # Interactive loop: read a sentence, ask eliza() for a reply and print it
    # right-aligned, or print a fallback when eliza() returns nothing usable.
    while True:
        sentence = input('USER >>>')
        match = eliza(sentence)
        if match:
            print('{:>40}<<<'.format(concat(compose_single_sentene(match))))
        else:
            print('sorry I don\'t know it')

# Fixed sample query; these statements are at module level and therefore also
# run when the module is imported.
sentence = '三个人怎么读?'
match = eliza(sentence)
# eliza() may return a falsy value when nothing matches (the loop above checks
# for exactly that), and `in` on None raises TypeError - so test truthiness
# before the membership check.
if match and '读' in match:
    cut(sentence)
    print(match)
Example #6
0
    async def parseMessage(self, message):
        """Convert ``message`` into a ``Message`` record and store it.

        The basic fields are copied, forward information is resolved, an
        attached photo is downloaded when the ``media_download`` config entry
        is set, and comments reachable through the message's buttons are
        scraped.  The record is finally appended to ``self.messages``;
        ``self.member_count`` is refreshed from the message's chat on the way.
        """
        # Wait before doing anything else to prevent getting blocked.
        await wait()

        parsed = Message()
        parsed.id = message.id
        parsed.sender = message.sender_id

        # When the sender's names cannot be read (AttributeError), both fall
        # back to empty strings.
        try:
            first_name, last_name = (message.sender.first_name,
                                     message.sender.last_name)
        except AttributeError:
            first_name = last_name = ""
        parsed.sender_name = concat(first_name, last_name)

        try:
            parsed.username = message.sender.username
        except AttributeError:
            pass

        # Plain one-to-one copies: (attribute on the record, attribute on the
        # source message).
        for target, source in (
                ("replyToMessageId", "reply_to_msg_id"),
                ("edit_date", "edit_date"),
                ("entities", "entities"),
                ("post_author", "post_author"),
                ("timestamp", "date"),
                ("text", "text"),
                ("views", "views"),
        ):
            setattr(parsed, target, getattr(message, source))
        parsed.media = type(message.media)
        self.member_count = message.chat.participants_count

        # Record where the message was forwarded from; an AttributeError
        # raised while doing so is ignored.
        try:
            await self.__parseForward(message, parsed)
        except AttributeError:
            pass

        # Download photos only when enabled and not already on disk.
        is_photo = type(message.media) == telethon.types.MessageMediaPhoto
        if is_photo and Channel.config.get("media_download"):
            media_target = self.path + "/media/" + str(parsed.id)
            already_saved = os.path.exists(media_target + ".jpg")
            if not already_saved:
                try:
                    await message.download_media(media_target)
                except telethon.errors.FloodWaitError as flood:
                    logging.info("Waiting " + str(flood.seconds) +
                                 " seconds: FloodWaitError")
                    await asyncio.sleep(flood.seconds)
                except telethon.errors.RpcCallFailError:
                    pass
                await asyncio.sleep(1)

        # Detect which comment bot the group uses from the first button of
        # each button row, run the matching scraper and attach the collected
        # comments to the record.
        collected = []
        if message.buttons is not None and message.forward is None:
            for row in message.buttons:
                # First 21 characters of the button URL, or None when the
                # button exposes no URL.
                try:
                    url_prefix = row[0].button.url[:21]
                except AttributeError:
                    url_prefix = None

                if url_prefix == 'https://comments.bot/':
                    logging.info("---> Found comments.bot...")
                    parsed.hasComments = True
                    parsed.bot_url = row[0].button.url
                    try:
                        collected.extend(
                            Bots.scrapeCommentsBot(parsed.bot_url, self.users,
                                                   message.id))
                    except Exception:
                        traceback.print_exc()
                elif row[0].text[-8:] == 'comments':
                    logging.info("---> Found comments.app...")
                    parsed.hasComments = True
                    parsed.bot_url = row[0].button.url
                    try:
                        app_comments, app_users = await Bots.scrapeCommentsApp(
                            parsed.bot_url, message.id,
                            Channel.config.get("query_users"))
                        collected.extend(app_comments)
                        self.users.extend(app_users)
                    except Exception:
                        traceback.print_exc()

            parsed.comments = collected
        self.messages.append(parsed)
Example #7
0
    def __parseCommentFromApp(comment, commentList, userList, count, messageId,
                              queryUser):
        """Parse one comment element into a ``Message`` and append it.

        Args:
            comment: parsed HTML element of a single comment; it is queried
                with ``find`` / ``find_all``.
            commentList: comments parsed so far.  It is searched for the
                quoted comment (matched by text) and receives the new comment.
            userList: list that receives the resolved user, if one is found.
            count: running index used as the suffix of the comment id.
            messageId: id of the message the comment belongs to.
            queryUser: when truthy, the author's identifier is resolved via
                ``Bots.getEntity``.
        """
        # Save comment Message
        new_comment = Message()
        new_comment.isComment = True
        new_comment.parent = messageId
        new_comment.id = str(messageId) + "." + str(count)

        # Get all text elements of a comment
        textList = None
        try:
            textList = comment.find_all('div', class_='bc-comment-text')
        except Exception:
            logging.info("This thread has no comments")

        # Save comment text and reply id
        try:
            if len(textList) > 1:
                # Two text elements: the first is the quoted comment, the
                # second the comment itself.
                try:
                    new_comment.text = textList[1].text
                    quoted_text = textList[0].text
                    # next(...) yields None when no earlier comment matches;
                    # the resulting AttributeError is handled just below.
                    new_comment.replyToMessageId = next(
                        (earlier for earlier in commentList
                         if earlier.text == quoted_text), None).id
                except AttributeError:
                    logging.info("Could not find quoted comment" +
                                 ", MessageId: " + str(new_comment.id))
            else:
                # Only the comment text itself is present.
                try:
                    new_comment.text = textList[0].text
                except IndexError:
                    logging.info("Could not find comment text" +
                                 ", MessageId: " + str(messageId))
        except Exception:
            # Best effort: log and carry on with whatever was parsed.
            # `Exception` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            traceback.print_exc()
            logging.info("An error occurred reading comment text" +
                         ", MessageId: " + str(messageId))

        # Display name as shown in the comment (user not identified yet).
        author_tag = comment.find('span', class_='bc-comment-author-name')
        new_comment.sender_name = author_tag.contents[0].contents[0]

        # The last path segment of the author link, when there is a link,
        # serves as the identifier of the user.
        try:
            identifierName = author_tag.contents[0]['href'].rsplit('/', 1)[-1]
        except Exception:
            identifierName = ""
        user = None

        # Query user if identifier name was found
        if identifierName != "" and queryUser:
            try:
                user = Bots.getEntity(identifierName)
                userList.append(user)
            except ValueError:
                # User for some reason not found.
                pass

        new_comment.username = identifierName
        if user is not None:
            new_comment.sender_name = concat(user.first_name, user.last_name)
            new_comment.sender = user.id
        else:
            # NOTE(review): this placeholder user is built but never stored or
            # returned - presumably it was meant to be appended to userList;
            # confirm before changing.
            commentUser = telethon.types.User(0)
            commentUser.first_name = new_comment.sender_name

        new_comment.timestamp = comment.find('time')['datetime']
        commentList.append(new_comment)