Beispiel #1
0
    def teste_d(self):
        nome = './samples/teste4.html'
        print('Testando o arquivo ' + nome)
        file = open(nome, 'r')
        html_content = file.read()
        file.close()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        temp_stdout = StringIO()
        with contextlib.redirect_stdout(temp_stdout):
            _ = Arvore(tokens=list(tokens), verbose=True)

        output = temp_stdout.getvalue().strip()

        expected_output = """[ < ABRE TAG: link > ] -> { }
[ < ABRE TAG: head > ] -> { [ < ABRE TAG: link > ]  }
[ < DADO: Teste > ] -> { }
[ < ABRE TAG: p > ] -> { [ < DADO: Teste > ]  }
[ < DADO: asdasd > ] -> { }
[ < ABRE TAG: h1 > ] -> { [ < DADO: asdasd > ]  }
[ < ABRE TAG: body > ] -> { [ < ABRE TAG: p > ] [ < ABRE TAG: h1 > ]  }
[ < ABRE TAG: html > ] -> { [ < ABRE TAG: head > ] [ < ABRE TAG: body > ]  }
[ documento ] -> { [ < ABRE TAG: html > ]  }"""

        self.assertEqual(output, expected_output)
Beispiel #2
0
 def __init__(self, **options):
     """Initialize the client: sampling temperature, tokenizer and caches."""
     super().__init__(**options)
     # Generation sampling temperature, seeded from the global config.
     self.temperature = config.temperature
     # Tokenizer with its vocabulary loaded from the configured file.
     self.tokenizer = Tokenizer()
     self.tokenizer.load_vocab_from_file(config.vocab_file)
     # Per-channel message history and the cached custom-emoji list.
     self.channel_deques = dict()
     self.custom_emoji_collection = list()
Beispiel #3
0
    def teste_i(self):
        nome = './samples/teste9.html'
        print('Testando o arquivo ' + nome)
        file = open(nome, 'r')
        html_content = file.read()
        file.close()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        temp_stdout = StringIO()
        with contextlib.redirect_stdout(temp_stdout):
            _ = Arvore(tokens=list(tokens), verbose=True)

        output = temp_stdout.getvalue().strip()

        expected_output = """[ < DECL TAG: DOCTYPE html > ] -> { }
[ < DADO: Titulo de nivel 1 > ] -> { }
[ < ABRE TAG: h1 > ] -> { [ < DADO: Titulo de nivel 1 > ]  }
[ < DADO: Titulo de nivel 2 > ] -> { }
[ < ABRE TAG: h2 > ] -> { [ < DADO: Titulo de nivel 2 > ]  }
[ < DADO: Titulo de nivel 3 > ] -> { }
[ < ABRE TAG: h3 > ] -> { [ < DADO: Titulo de nivel 3 > ]  }
[ < DADO: Titulo de nivel 4 > ] -> { }
[ < ABRE TAG: h4 > ] -> { [ < DADO: Titulo de nivel 4 > ]  }
[ < DADO: Titulo de nivel 5 > ] -> { }
[ < ABRE TAG: h5 > ] -> { [ < DADO: Titulo de nivel 5 > ]  }
[ < DADO: Titulo de nivel 6 > ] -> { }
[ < ABRE TAG: h6 > ] -> { [ < DADO: Titulo de nivel 6 > ]  }
[ < ABRE TAG: img | src = img_chania.jpg | alt = Flowers in Chania > ] -> { }
[ < ABRE TAG: html > ] -> { [ < ABRE TAG: h1 > ] [ < ABRE TAG: h2 > ] [ < ABRE TAG: h3 > ] [ < ABRE TAG: h4 > ] [ < ABRE TAG: h5 > ] [ < ABRE TAG: h6 > ] [ < ABRE TAG: img | src = img_chania.jpg | alt = Flowers in Chania > ]  }
[ documento ] -> { [ < DECL TAG: DOCTYPE html > ] [ < ABRE TAG: html > ]  }"""

        self.assertEqual(output, expected_output)
Beispiel #4
0
    def teste_k(self):
        """An empty input must produce the 'empty token list' message."""
        print('Testando um arquivo vazio')

        tokenizer = Tokenizer()
        tokenizer.feed('')
        tokens = tokenizer.get_fila()

        # Capture what Arvore prints for an empty token queue.
        captured = StringIO()
        with contextlib.redirect_stdout(captured):
            _ = Arvore(tokens=list(tokens), verbose=True)

        expected_output = """Lista de tokens vazia."""
        output = captured.getvalue().strip()

        self.assertEqual(output, expected_output)
Beispiel #5
0
    def teste_g(self):
        nome = './samples/teste7.html'
        print('Testando o arquivo ' + nome)
        file = open(nome, 'r')
        html_content = file.read()
        file.close()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        temp_stdout = StringIO()
        with contextlib.redirect_stdout(temp_stdout):
            _ = Arvore(tokens=list(tokens), verbose=True)

        output = temp_stdout.getvalue().strip()

        expected_output = """[ < DECL TAG: doctype html > ] -> { }
[ < ABRE TAG: meta | charset = utf-8 > ] -> { }
[ < ABRE TAG: link | href = https://fonts.googleapis.com/css2?family=Montserrat:ital,wght@0,100;0,200;0,300;0,400;0,500;0,600;0,700;0,800;0,900;1,100;1,200;1,300;1,400;1,500;1,600;1,700;1,800;1,900&display=swap | rel = stylesheet > ] -> { }
[ < ABRE TAG: link | href = https://fonts.googleapis.com/css2?family=Open+Sans:ital,wght@0,300;0,400;0,600;0,700;0,800;1,300;1,400;1,600;1,700;1,800&display=swap | rel = stylesheet > ] -> { }
[ < DADO: Leonamtv > ] -> { }
[ < ABRE TAG: title > ] -> { [ < DADO: Leonamtv > ]  }
[ < ABRE TAG: base | href = https://leonamtv.github.io/leonamtv/ > ] -> { }
[ < ABRE TAG: meta | name = viewport | content = width=device-width, initial-scale=1 > ] -> { }
[ < ABRE TAG: link | rel = icon | type = image/x-icon | href = assets/favicon/favicon.ico > ] -> { }
[ < ABRE TAG: link | rel = apple-touch-icon | sizes = 180x180 | href = assets/favicon/apple-touch-icon.png > ] -> { }
[ < ABRE TAG: link | rel = icon | type = image/png | sizes = 32x32 | href = assets/favicon/favicon-32x32.png > ] -> { }
[ < ABRE TAG: link | rel = icon | type = image/png | sizes = 16x16 | href = assets/favicon/favicon-16x16.png > ] -> { }
[ < ABRE TAG: link | rel = stylesheet | href = styles.1a2dd1f7fc237001a3e3.css > ] -> { }
[ < ABRE TAG: head > ] -> { [ < ABRE TAG: meta | charset = utf-8 > ] [ < ABRE TAG: link | href = https://fonts.googleapis.com/css2?family=Montserrat:ital,wght@0,100;0,200;0,300;0,400;0,500;0,600;0,700;0,800;0,900;1,100;1,200;1,300;1,400;1,500;1,600;1,700;1,800;1,900&display=swap | rel = stylesheet > ] [ < ABRE TAG: link | href = https://fonts.googleapis.com/css2?family=Open+Sans:ital,wght@0,300;0,400;0,600;0,700;0,800;1,300;1,400;1,600;1,700;1,800&display=swap | rel = stylesheet > ] [ < ABRE TAG: title > ] [ < ABRE TAG: base | href = https://leonamtv.github.io/leonamtv/ > ] [ < ABRE TAG: meta | name = viewport | content = width=device-width, initial-scale=1 > ] [ < ABRE TAG: link | rel = icon | type = image/x-icon | href = assets/favicon/favicon.ico > ] [ < ABRE TAG: link | rel = apple-touch-icon | sizes = 180x180 | href = assets/favicon/apple-touch-icon.png > ] [ < ABRE TAG: link | rel = icon | type = image/png | sizes = 32x32 | href = assets/favicon/favicon-32x32.png > ] [ < ABRE TAG: link | rel = icon | type = image/png | sizes = 16x16 | href = assets/favicon/favicon-16x16.png > ] [ < ABRE TAG: link | rel = stylesheet | href = styles.1a2dd1f7fc237001a3e3.css > ]  }
[ < DADO: Batata > ] -> { }
[ < ABRE TAG: p > ] -> { [ < DADO: Batata > ]  }
[ < ABRE TAG: img | src = asad > ] -> { }
[ < ABRE TAG: script | src = runtime-es2015.f8b979f66300b1e53384.js | type = module > ] -> { }
[ < ABRE TAG: script | src = runtime-es5.f8b979f66300b1e53384.js | nomodule | defer > ] -> { }
[ < ABRE TAG: script | src = polyfills-es5.854eca2125f3bf6856f8.js | nomodule | defer > ] -> { }
[ < ABRE TAG: script | src = polyfills-es2015.a2c1af2b1be41024173b.js | type = module > ] -> { }
[ < ABRE TAG: script | src = main-es2015.04368310fc69b15b5f08.js | type = module > ] -> { }
[ < ABRE TAG: script | src = main-es5.04368310fc69b15b5f08.js | nomodule | defer > ] -> { }
[ < ABRE TAG: body > ] -> { [ < ABRE TAG: p > ] [ < ABRE TAG: img | src = asad > ] [ < ABRE TAG: script | src = runtime-es2015.f8b979f66300b1e53384.js | type = module > ] [ < ABRE TAG: script | src = runtime-es5.f8b979f66300b1e53384.js | nomodule | defer > ] [ < ABRE TAG: script | src = polyfills-es5.854eca2125f3bf6856f8.js | nomodule | defer > ] [ < ABRE TAG: script | src = polyfills-es2015.a2c1af2b1be41024173b.js | type = module > ] [ < ABRE TAG: script | src = main-es2015.04368310fc69b15b5f08.js | type = module > ] [ < ABRE TAG: script | src = main-es5.04368310fc69b15b5f08.js | nomodule | defer > ]  }
[ < ABRE TAG: html | lang = en > ] -> { [ < ABRE TAG: head > ] [ < ABRE TAG: body > ]  }
[ documento ] -> { [ < DECL TAG: doctype html > ] [ < ABRE TAG: html | lang = en > ]  }"""

        self.assertEqual(output, expected_output)
Beispiel #6
0
    def teste_b(self):
        """Parse samples/teste2.html and expect a nesting error.

        The sample places an <li> outside of <ul>/<ol>, so Arvore must
        print the corresponding error message instead of a tree.
        """
        nome = './samples/teste2.html'
        print('Testando o arquivo ' + nome)
        # Context manager guarantees the handle is closed even if read() raises
        # (the previous open()/close() pair leaked on exception).
        with open(nome, 'r') as file:
            html_content = file.read()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        # Arvore prints as a side effect; redirect stdout to assert on it.
        temp_stdout = StringIO()
        with contextlib.redirect_stdout(temp_stdout):
            _ = Arvore(tokens=list(tokens), verbose=True)

        output = temp_stdout.getvalue().strip()

        expected_output = """Erro: Tag li deve ser declarada dentro de uma dessas tags: ['ul', 'ol']"""

        self.assertEqual(output, expected_output)
Beispiel #7
0
    def teste_j(self):
        """Parse samples/teste10.html and expect a duplicate-tag error.

        The sample opens <body> twice, so Arvore must print the
        'cannot be opened more than once' error instead of a tree.
        """
        nome = './samples/teste10.html'
        print('Testando o arquivo ' + nome)
        # Context manager guarantees the handle is closed even if read() raises
        # (the previous open()/close() pair leaked on exception).
        with open(nome, 'r') as file:
            html_content = file.read()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        # Arvore prints as a side effect; redirect stdout to assert on it.
        temp_stdout = StringIO()
        with contextlib.redirect_stdout(temp_stdout):
            _ = Arvore(tokens=list(tokens), verbose=True)

        output = temp_stdout.getvalue().strip()

        expected_output = """Erro: O token < ABRE TAG: body > não pode ser aberto mais de uma vez."""

        self.assertEqual(output, expected_output)
Beispiel #8
0
    def teste_f(self):
        """Parse samples/teste6.html and expect an unmatched-close error.

        The sample closes a <p> that was never opened, so Arvore must
        print the 'closed but not opened' error instead of a tree.
        """
        nome = './samples/teste6.html'
        print('Testando o arquivo ' + nome)
        # Context manager guarantees the handle is closed even if read() raises
        # (the previous open()/close() pair leaked on exception).
        with open(nome, 'r') as file:
            html_content = file.read()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        # Arvore prints as a side effect; redirect stdout to assert on it.
        temp_stdout = StringIO()
        with contextlib.redirect_stdout(temp_stdout):
            _ = Arvore(tokens=list(tokens), verbose=True)

        output = temp_stdout.getvalue().strip()

        expected_output = """Erro: Tag < FECHA TAG: p > fechada e não aberta"""

        self.assertEqual(output, expected_output)
Beispiel #9
0
    def teste_k(self):
        nome = './samples/teste11.html'
        print('Testando o arquivo ' + nome)
        file = open(nome, 'r')
        html_content = file.read()
        file.close()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        temp_stdout = StringIO()
        with contextlib.redirect_stdout(temp_stdout):
            _ = Arvore(tokens=list(tokens), verbose=True)

        output = temp_stdout.getvalue().strip()

        expected_output = """[ < DECL TAG: DOCTYPE html > ] -> { }
[ < ABRE TAG: meta | charset = UTF-8 > ] -> { }
[ < ABRE TAG: meta | name = viewport | content = width=device-width, initial-scale=1.0 > ] -> { }
[ < DADO: Document > ] -> { }
[ < ABRE TAG: title > ] -> { [ < DADO: Document > ]  }
[ < ABRE TAG: head > ] -> { [ < ABRE TAG: meta | charset = UTF-8 > ] [ < ABRE TAG: meta | name = viewport | content = width=device-width, initial-scale=1.0 > ] [ < ABRE TAG: title > ]  }
[ < DADO: Teste > ] -> { }
[ < ABRE TAG: h1 > ] -> { [ < DADO: Teste > ]  }
[ < DADO: Teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste
    teste teste teste teste teste teste teste teste teste teste > ] -> { }
[ < ABRE TAG: br > ] -> { }
[ < DADO: Teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste
    teste teste teste teste teste teste teste teste teste teste > ] -> { }
[ < ABRE TAG: body > ] -> { [ < ABRE TAG: h1 > ] [ < DADO: Teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste
    teste teste teste teste teste teste teste teste teste teste > ] [ < ABRE TAG: br > ] [ < DADO: Teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste teste
    teste teste teste teste teste teste teste teste teste teste > ]  }
[ < ABRE TAG: html | lang = en > ] -> { [ < ABRE TAG: head > ] [ < ABRE TAG: body > ]  }
[ documento ] -> { [ < DECL TAG: DOCTYPE html > ] [ < ABRE TAG: html | lang = en > ]  }"""

        self.assertEqual(output, expected_output)
Beispiel #10
0
class MainClient(discord.Client):
    """Discord client that generates replies with a seq2seq model.

    Keeps a bounded per-channel message history (deque) as model context,
    caches the custom emojis of all joined guilds, and replies to incoming
    messages with a configurable probability (a separate probability is
    used when the bot itself is mentioned).
    """

    def __init__(self, **options):
        super().__init__(**options)
        # Sampling temperature for generation; adjustable at runtime via
        # the temperature-change command (see handle_command).
        self.temperature = config.temperature
        self.tokenizer = Tokenizer()
        self.tokenizer.load_vocab_from_file(config.vocab_file)
        # Maps channel id -> deque of recent messages (model input context).
        self.channel_deques = {}
        # Flat list of custom emoji objects gathered from all guilds.
        self.custom_emoji_collection = []

    @staticmethod
    def decision(probability):
        """Return True with the given probability (expected in [0, 1])."""
        return random.random() < probability

    def load_custom_emoji_collection(self):
        """Rebuild the cached custom-emoji list from all joined guilds."""
        self.custom_emoji_collection.clear()
        guilds = list(self.guilds)
        for guild in guilds:
            self.custom_emoji_collection.extend(guild.emojis)
        log("Коллекция кастомных emoji обновлена.")

    def random_emoji(self):
        """Return a random cached custom emoji as str, or '' if none cached."""
        return (str(random.choice(self.custom_emoji_collection))
                if self.custom_emoji_collection else "")

    async def on_ready(self):
        # Gateway connection established: warm the emoji cache and set the
        # "playing ..." presence from config.
        log(f"Подключение к Discord успешно под пользователем @{self.user}.")
        self.load_custom_emoji_collection()
        game = discord.Game(config.discord_game_name)
        await self.change_presence(activity=game)

    async def on_guild_join(self, guild):
        # Joined a new guild: refresh the emoji cache to include its emojis.
        await self.wait_until_ready()
        log(f"Зашел на сервер {guild.name}.")
        self.load_custom_emoji_collection()

    async def on_guild_remove(self, guild):
        # Left a guild: rebuild the cache so its emojis are dropped.
        await self.wait_until_ready()
        log(f"Вышел с сервера {guild.name}.")
        self.load_custom_emoji_collection()

    async def on_guild_emojis_update(self, guild, before, after):
        # Rebuild only when the emoji count changed.
        # NOTE(review): rename-only updates (same count) are ignored here —
        # presumably intentional, but verify the cache stays valid for them.
        await self.wait_until_ready()
        log(f"На сервере {guild.name} изменилась коллекция emoji.")
        if len(before) != len(after):
            self.load_custom_emoji_collection()

    async def handle_command(self, message):
        """Handle the sampling-temperature change command.

        Returns True when the message was a recognized, successfully
        handled command (so on_message must not also reply to it),
        False otherwise. Only guild administrators may use it.
        """
        # (translated) We did not use discord.ext.commands because this is
        # the only command at the moment; we'll add it later if needed.
        if (message.author.guild_permissions.administrator
                and message.content.startswith(
                    config.command_temperature_change.lower())):
            mc_splitted = message.content.split()
            if len(mc_splitted) > 1:
                set_ = self.set_temperature(mc_splitted[1])
                if set_:
                    await message.channel.send(
                        f"`temperature` ➡️ `{mc_splitted[1]}`")
                    return True
        return False

    def set_temperature(self, value):
        """Parse *value* as a float and set it as the sampling temperature.

        Returns True on success; False for non-numeric or non-positive
        values (temperature must be > 0).
        """
        try:
            temperature = float(value)
        except ValueError:
            return False
        if temperature <= 0:
            return False
        self.temperature = temperature
        return True

    async def on_message(self, message):
        """Record the incoming message and possibly generate a reply."""
        await self.wait_until_ready()
        # Ignore non-text channels, other bots' messages (our own are kept
        # as context) and non-default message types (joins, pins, ...).
        if (not isinstance(message.channel, discord.TextChannel)
                or (message.author.bot and message.author != self.user)
                or message.type != discord.MessageType.default):
            return
        # No point processing channels we cannot answer in.
        if not message.channel.permissions_for(message.guild.me).send_messages:
            return
        # Lazily create the bounded per-channel history deque.
        if message.channel.id not in self.channel_deques:
            self.channel_deques[message.channel.id] = collections.deque(
                maxlen=config.deque_max_len)
        self.channel_deques[message.channel.id].append(message)
        command_used = await self.handle_command(message)
        if command_used:
            return
        # Never reply to our own messages.
        if message.author == self.user:
            return
        my_mention = self.user in message.mentions
        # Reply with the base probability, or the (presumably higher)
        # mention probability when the bot is mentioned.
        if self.decision(config.no_mention_prob) or (
                my_mention and self.decision(config.mention_prob)):
            async with message.channel.typing():
                input_messages = self.channel_deques[message.channel.id]
                input_tensor = self.tokenizer.encode_input(
                    input_messages, self.user)
                output_tensor = predictor.decode_sequence(
                    input_tensor, self.temperature)
                output_message, token_count = self.tokenizer.decode_output(
                    self, input_messages, output_tensor)
                # Optional delay proportional to reply length (simulates typing).
                if config.use_delay:
                    await asyncio.sleep(random.uniform(0.1, 0.2) * token_count)
                if output_message:
                    # Truncate to at most 2000 characters before sending.
                    await message.channel.send(output_message[:2000])
Beispiel #11
0
    action='store_true',
    help='Se passado, imprime a árvore gerada (caso não ocorram erros no html)'
)
parser.add_argument('-h',
                    '--help',
                    action='help',
                    help='Mostra essa mensagem e sai.')

args = parser.parse_args()

if args.f:
    # The verbose flag does not change per file; resolve it once.
    verbose = bool(args.verbose)

    for file in args.f:
        if not os.path.isfile(file):
            print('Arquivo ' + str(file) + ' não encontrado.')
            # Bug fix: previously execution fell through and tried to open
            # the missing file anyway, raising FileNotFoundError.
            continue

        # Context manager replaces the leaked, never-closed open() handle.
        with open(file, 'r') as handle:
            html_content = handle.read()

        tokenizer = Tokenizer()
        tokenizer.feed(html_content)
        tokens = tokenizer.get_fila()

        arvore = Arvore(tokens=list(tokens), verbose=verbose)

else:
    print(
        'Você precisa de fornecer os arquivos para fazer a análisa sintática')