Beispiel #1
0
    def endElement(self, tag):
        """
        Signals the end of an element in non-namespace mode.

        SAX callback: dispatches on the closing tag name to drive the
        document-indexing state machine.  Assumes character data has been
        accumulated into self.title / self.text / self.id by the
        characters() handler (not visible here) -- TODO confirm.
        """
        if tag == "title":
            # initialize a new document with title == self.title
            # NOTE(review): the title is passed to the constructor AND set
            # again via set_title -- looks redundant; verify against Tokenizer.
            self.tokenizer = Tokenizer(self.title)
            self.tokenizer.set_title(self.title)

        elif tag == "text":
            # By now the document title and id fields must have been extracted
            Helpers.docid_docname_map[
                self.tokenizer.get_doc_id()] = self.tokenizer.get_title()
            # add text body to that document    # TODO: use append
            termid_freq_map = self.tokenizer.tokenize(self.text)

            # print("term_termid_map", Helpers.term_termid_map)
            for term in termid_freq_map:
                # accumulate (termid: docid) pairs for the inverted index
                Indexer.termid_docid_list.append(
                    (term, self.tokenizer.get_doc_id()))

        elif tag == "id" and not self.insideRevision:
            # Do NOT set id if inside <revision> <id>XXX</id>
            self.tokenizer.set_doc_id(self.id)

        elif tag == "revision":
            self.insideRevision = False  # </revision> encountered

        # Clear the current-tag marker until the next startElement call.
        self.tag = None
Beispiel #2
0
 def __init__(self, text):
     """
     :param text: client input

     Tokenizes the raw client input and builds a parser over the resulting
     token stream.
     """
     self.text = text
     self.tokenizer = Tokenizer(self.text)
     self.parser = Parser(self.tokenizer.create_tokens())
     # Interpreter-global variable bindings, populated during evaluation.
     self.GLOBAL_VARS = dict()
def process(filename):
    """Read *filename*, tokenize and parse it, then run semantic analysis.

    :param filename: path to the source file to analyze
    :raises ValueError: if the file is empty.  ValueError subclasses
        Exception, so existing ``except Exception`` callers still work.
    """
    # Read inside the context manager, but release the file handle before
    # the (potentially slow) tokenize/parse/analyze phases.
    with open(filename) as file:
        text = file.read()
    if not text:
        raise ValueError("Cannot read text from file")
    tokenizer = Tokenizer(text)
    parser = Parser(tokenizer.create_tokens())
    tree = parser.parse()
    symbol_table_builder = SemanticAnalyzer()
    symbol_table_builder.visit(tree)
Beispiel #4
0
    def test_returns_tokens_until_exhausted(self):
        """The tokenizer yields the full expected stream for a valid file."""
        expected = [
            Token('class', 'keyword', 1, 1),
            Token('CorrectSyntax', 'identifier', 1, 7),
            Token('{', 'symbol', 1, 21),
            Token('field', 'keyword', 2, 3),
            Token('String', 'identifier', 2, 9),
            Token('bar', 'identifier', 2, 16),
            Token(';', 'symbol', 2, 19),
            Token('constructor', 'keyword', 4, 3),
            Token('CorrectSyntax', 'identifier', 4, 15),
            Token('new', 'identifier', 4, 29),
            Token('(', 'symbol', 4, 32),
            Token(')', 'symbol', 4, 33),
            Token('{', 'symbol', 4, 35),
            Token('let', 'keyword', 5, 5),
            Token('bar', 'identifier', 5, 9),
            Token('=', 'symbol', 5, 13),
            Token('Hello world!', 'stringConst', 5, 15),
            Token(';', 'symbol', 5, 29),
            Token('return', 'keyword', 6, 5),
            Token('this', 'keyword', 6, 12),
            Token(';', 'symbol', 6, 16),
            Token('}', 'symbol', 7, 3),
            Token('method', 'keyword', 9, 3),
            Token('void', 'keyword', 9, 10),
            Token('greetings', 'identifier', 9, 15),
            Token('(', 'symbol', 9, 24),
            Token(')', 'symbol', 9, 25),
            Token('{', 'symbol', 9, 27),
            Token('do', 'keyword', 10, 5),
            Token('Output', 'identifier', 10, 8),
            Token('.', 'symbol', 10, 14),
            Token('printString', 'identifier', 10, 15),
            Token('(', 'symbol', 10, 26),
            Token('bar', 'identifier', 10, 27),
            Token(')', 'symbol', 10, 30),
            Token(';', 'symbol', 10, 31),
            Token('return', 'keyword', 11, 5),
            Token(';', 'symbol', 11, 11),
            Token('}', 'symbol', 12, 3),
            Token('}', 'symbol', 13, 1)
        ]

        tokenizer = Tokenizer(TEST_FILES / 'CorrectSyntax.jack')
        actual = []
        while not tokenizer.finished():
            token = tokenizer.next()
            # A falsy token signals there is nothing more to collect.
            if token:
                actual.append(token)
            else:
                break

        self.assertEqual(actual, expected)
    def __init__(self, trec_dir, stopword_file):
        """
        Load every parseable file under *trec_dir* and build a
        stopword-aware tokenizer.

        :param trec_dir: directory containing TREC document files
        :param stopword_file: path to a newline-separated stopword list
        """
        parser = Parser()
        for filename in os.listdir(trec_dir):
            try:
                with open(os.path.join(trec_dir, filename), 'r') as f:
                    parser.load(f.read())
            except Exception as e:
                # Best-effort load: unreadable/unparseable files are skipped
                # silently.  NOTE(review): this also hides real bugs --
                # consider logging the exception.
                pass
        self.docs = parser.docs

        with open(stopword_file, 'r') as f:
            stopwords = f.readlines()
        # NOTE(review): readlines() keeps trailing newlines; presumably
        # Tokenizer strips them -- verify.
        self.tokenizer = Tokenizer(stopwords)
Beispiel #6
0
class DictionaryBuilder(BaseBuilder):
    """Builds a sorted vocabulary file from a directory of text files."""

    def __init__(self):
        super().__init__()
        # Unique tokens seen so far.
        self.dictionary = set()
        # Total (non-unique) token count across all files.
        self.count = 0
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        """Full pipeline: load files, build the dictionary, persist, report."""
        self.load_files(input_dir_path)
        self.build()
        self.save(output_path)
        self.print_counts()

    def build(self):
        """Tokenize every loaded file, accumulating tokens and a total count."""
        for path in self.files:
            with open(path, 'r') as handle:
                tokens = self.tokenizer.format_data(handle.readlines())
            self.count += len(tokens)
            self.dictionary.update(tokens)

    def save(self, output_path):
        """Write the dictionary to *output_path*, one sorted token per line."""
        with open(output_path, 'w') as handle:
            handle.write('\n'.join(sorted(self.dictionary)))

    def print_counts(self):
        """Report total token count and unique-token count to stdout."""
        print('Total count: ', self.count)
        print('Dictionary count: ', len(self.dictionary))
Beispiel #7
0
    def __init__(self,
                 tokenizer_path: str,
                 model: UnifiedTransformer,
                 max_turns: int = 64,
                 device: Optional[str] = None):
        """
        Wrap a trained UnifiedTransformer for seq2seq inference.

        :param tokenizer_path: path to the serialized tokenizer
        :param model: trained model to run inference with
        :param max_turns: maximum number of dialogue turns kept -- presumably
            as conversation context; TODO confirm
        :param device: torch device string; if None, the model's current
            device is used and the model is NOT moved
        """

        self.tokenizer = Tokenizer(tokenizer_path=tokenizer_path)

        # Batch assembly needs the tokenizer's special-token indices.
        self.batch_preparer = BatchPreparing(
            sep_index=self.tokenizer.sep_index,
            context_index=self.tokenizer.context_index,
            pad_index=self.tokenizer.pad_index)

        self.model = model

        self.max_turns = max_turns

        if device is None:
            # Infer the device from the model's own parameters.
            self.device = next(self.model.parameters()).device
        else:
            self.device = torch.device(device)
            self.model.to(self.device)

        # Inference-only setup: disable dropout etc. and switch the model
        # into its seq2seq decoding mode.
        self.model.eval()
        self.model.set_seq2seq()
 def _search_word(self, word) -> Set[int]:
     """Return the ids of documents whose reverse index contains *word*."""
     normalized = Tokenizer.normalize(word)
     term_id = self.term_map.get(normalized)
     # Unknown term: nothing matches.
     if term_id is None:
         return set()
     ri_offset = self.term_dic[term_id][2]
     return set(self.r_indices[ri_offset])
Beispiel #9
0
def main(args):
    """Train a small TransformerModel on the Arithmetic dataset.

    :param args: parsed CLI namespace (seed, model_dir, tokenizer, data,
        batch_size, epochs, save_epoch, summary_step, lr)
    """
    set_seed(args.seed)
    Path(args.model_dir).mkdir(parents=True, exist_ok=True)

    tk = Tokenizer(args.tokenizer)

    # Deliberately tiny model.  d_emb=-1 presumably means "tie embedding
    # size to d_model" -- TODO confirm against TransformerModel.
    model = TransformerModel(d_model=32,
                             d_ff=64,
                             dropout=.0,
                             layers=3,
                             heads=4,
                             d_emb=-1,
                             pad_token_id=tk.pad_id,
                             vocab_size=tk.vocab_size)

    ds = Arithmetic(args.data)

    print(
        f'model size = {sum(p.numel() for p in model.parameters() if p.requires_grad)/1024/1024:.2f} M trainable parameters'
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train(model=model,
          dataset=ds,
          batch_size=args.batch_size,
          device=device,
          tokenizer=tk,
          epochs=args.epochs,
          model_dir=args.model_dir,
          save_epoch=args.save_epoch,
          summary_step=args.summary_step,
          lr=args.lr)
def compile_all_jack_files(jack_files):
    """Compile each .jack file, exporting tokenizer and parser XML.

    :param jack_files: iterable of .jack file paths
    :raises SystemExit: if parsing any file fails (error is printed first)
    """
    # exist_ok=True replaces the original bare try/except around os.mkdir.
    os.makedirs("xml-export", exist_ok=True)

    for jack_file in jack_files:
        with open(jack_file) as file:
            code = remove_all_comments(file.read())
            tokenizer = Tokenizer(code)
            # Both exports share the same base name (path minus '.jack').
            base_name = jack_file.replace(".jack", "")
            tokenizer.export_xml(base_name)
            try:
                parser = Parser(tokenizer)
                parser.export_xml(base_name)
            except Exception as e:
                print(e)
                # Explicit non-zero status instead of the bare exit() builtin.
                raise SystemExit(1)
Beispiel #11
0
 def parse(self):
     """
     Replace sentiment with 2: Positive, 1: Neutral, 0: Negative

     Reads the TSV training file, tokenizes each tweet, truncates any
     sentiment label that leaked into the tweet text past position 50,
     and appends {'Sentiment', 'Tweet'} records to self.data.
     """
     tk = Tokenizer(preserve_case=False)
     with open(self.trainingPath) as training:
         tsvRead = csv.reader(training, delimiter="\t")
         enum = {'positive': 2, 'neutral': 1, 'negative': 0, 'unknown':3}
         for line in tsvRead:
             # Tokenize once (the original called tokenize() twice).
             phrase = tk.tokenize(line[1])
             if not phrase:
                 continue
             for i, word in enumerate(phrase):
                 # Trailing sentiment labels past position 50 are noise --
                 # cut the phrase there.
                 if i > 50 and word in ["neutral", "positive", "negative", "unknown"]:
                     phrase = phrase[:i]
                     break
             self.data.append({'Sentiment' : enum[line[0]], 'Tweet' : phrase})
Beispiel #12
0
    def test_skips_rest_of_line_after_line_comment(self):
        """Everything after a // line comment is dropped by the tokenizer."""
        expected = [
            Token('let', 'keyword', 1, 1),
            Token('foo', 'identifier', 1, 5),
            Token('=', 'symbol', 1, 9),
            Token('5', 'intConst', 1, 11),
            Token(';', 'symbol', 1, 12)
        ]

        tokenizer = Tokenizer(TEST_FILES / 'IgnoreLineComment.jack')
        actual = []
        while not tokenizer.finished():
            token = tokenizer.next()
            # A falsy token means the stream is exhausted.
            if token:
                actual.append(token)
            else:
                break

        self.assertEqual(actual, expected)
Beispiel #13
0
    def test_skips_everything_inbetween_multiline_comment(self):
        """All content inside a /* ... */ comment is dropped by the tokenizer."""
        expected = [
            Token('let', 'keyword', 5, 5),
            Token('foo', 'identifier', 5, 9),
            Token('=', 'symbol', 5, 13),
            Token('5', 'intConst', 5, 15),
            Token(';', 'symbol', 5, 16)
        ]

        tokenizer = Tokenizer(TEST_FILES / 'IgnoreMultilineComment.jack')
        actual = []
        while not tokenizer.finished():
            token = tokenizer.next()
            # A falsy token means the stream is exhausted.
            if token:
                actual.append(token)
            else:
                break

        self.assertEqual(actual, expected)
Beispiel #14
0
def build_index():
    """Build the global inverted Index (token -> {filename: count}) and the
    Header map from every webpage under config.RAW_WEBPAGES, then persist
    both via save_index()/save_header()."""
    global Index, Header
    tokenizer = Tokenizer()
    for subdir in os.listdir(config.RAW_WEBPAGES):
        full_subdir = os.path.join(config.RAW_WEBPAGES, subdir)
        # Guard clause: skip stray non-directory entries.
        if not os.path.isdir(full_subdir):
            continue
        to_parse = read_directory(full_subdir)
        print("Subdirectory: ", subdir)
        for _file in tqdm(to_parse):
            # Key documents by the path relative to the raw-webpages root.
            # NOTE(review): assumes '/' separators -- not portable to Windows.
            filename = "/".join(_file.split("/")[1:])
            header, txt = parse(_file)

            Header[filename] = header
            token_counter = tokenizer.counter_tokenize(txt)
            for tok, count in token_counter.items():
                # setdefault replaces the manual create-or-update branch.
                Index.setdefault(tok, {})[filename] = count
    save_index()
    save_header()
Beispiel #15
0
class IdentityMatrixBuilder(BaseBuilder):
    """Builds a term/document count matrix over a fixed dictionary and
    writes it out as CSV (one row per token, one column per file)."""

    def __init__(self, dict_path):
        super().__init__()
        # Path to the dictionary file (one token per line).
        self.dict_path = dict_path
        self.dictionary = set()
        # token -> list of per-file counts, aligned with self.files.
        self.matrix = {}
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        """Full pipeline: load dictionary and files, count, save CSV."""
        self.load_dictionary()
        self.load_files(input_dir_path)
        self.init_matrix()
        self.build()
        self.save(output_path)

    def load_dictionary(self):
        """Read the dictionary file into a sorted token list."""
        with open(self.dict_path, 'r') as f:
            lines = self.tokenizer.filter_new_lines(f.readlines())
            self.dictionary = sorted(lines)

    def init_matrix(self):
        # One zero-initialized count per loaded file, for each token.
        self.matrix = {x: [0] * len(self.files) for x in self.dictionary}

    def build(self):
        """Count dictionary-token occurrences per file.

        NOTE(review): raises KeyError if a file yields a token absent from
        the dictionary -- assumes the dictionary was built with the same
        tokenizer over the same corpus; verify.
        """
        for i, file in enumerate(self.files):
            with open(file, 'r') as f:
                lines = f.readlines()
                words = self.tokenizer.format_data(lines)
                for word in words:
                    self.matrix[word][i] += 1

    def save(self, output_path):
        """Write the matrix as CSV: a 'Token' column, then one column per
        file (basename only -- assumes '/' path separators)."""
        field_names = ['Token'] + list(
            map(lambda x: x.split('/')[-1], self.files))
        with open(output_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(field_names)

            for k, v in self.matrix.items():
                writer.writerow([k] + v)
Beispiel #16
0
def run(src: str) -> None:
    """Tokenize, parse, and interpret a Lox source string, updating the
    module-level `env` in place across calls (REPL-style persistence)."""
    global env

    # tokenization
    tkz = Tokenizer()
    tokens, err = tkz.tokenize(src)

    if tok_debug:
        for i in tokens:
            print(i)

    if display_errors(err, "LOX: SYNTAX ERROR"):
        return

    # don't send single EOF token to parser
    # this allows parser to make stricter assertions while generating the AST
    # NOTE(review): assumes tokenize() always returns at least one token
    # (the EOF); tokens[0] would raise IndexError otherwise -- verify.
    if tokens[0].type == TokenType.EOF:
        return

    # parsing
    prs = Parser()
    program, err = prs.parse(tokens)

    if parse_debug:
        for tree in program:
            print(tree)

    if display_errors(err, "LOX: GRAMMAR ERROR"):
        return

    # interpretation -- rebinds the global env with the interpreter's
    # updated environment; exit_status is currently unused.
    itr = Interpreter(env)
    exit_status, err, env = itr.interpret(program)
    display_errors(err, "LOX: RUNTIME ERROR")

    if env_debug:
        print(env.map)
Beispiel #17
0
def main():
    """CLI entry point: parse a propositional statement and optionally dump
    tokens, the AST, the symbol table, or a truth table."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("prop", help="Propositional statement")
    arg_parser.add_argument("--tokens", "-t", action="store_true")
    arg_parser.add_argument("--ast", "-a", action="store_true")
    arg_parser.add_argument("--symbols", "-s", action="store_true")
    arg_parser.add_argument("--truthtable", "-tt", nargs='?', const=1, type=int)
    args = arg_parser.parse_args()

    # NOTE(review): `prop` is a required positional, so this only triggers
    # for an explicitly empty string argument.
    if not args.prop:
        args.prop = " "

    error_collector = ErrorCollector(args.prop)
    symbol_table = SymbolTable()

    tokenizer = Tokenizer(error_collector, symbol_table)

    parser = Parser(tokenizer, error_collector)
    ast = parser.parse()

    # For debugging purposes :)
    if args.symbols:
        symbol_table.show_symbols()

    if error_collector.has_errors():
        error_collector.show_errors()
        return

    if args.tokens:
        tokenizer.show_tokens()

    if args.ast:
        print(ast)

    if args.truthtable:
        tt = TruthTable(ast, symbol_table, args.truthtable)
        tt.show()
Beispiel #18
0
    def test_throws_if_next_invoked_while_status_is_FINISHED(self):
        """next() must raise and must not touch the generator once FINISHED."""
        generator_spy = Mock()
        tokenizer = Tokenizer('')
        # Force the tokenizer into its terminal state with a spy generator.
        tokenizer._generator = generator_spy
        tokenizer._status = Tokenizer.EStatus.FINISHED

        with self.assertRaisesRegex(TokenizerError, '(f|F)inished'):
            tokenizer.next()

        self.assertFalse(generator_spy.called)
class TermProvider(object):
    """Iterable over terms extracted from a TREC document collection."""

    def __init__(self, trec_dir, stopword_file):
        """
        :param trec_dir: directory of TREC document files
        :param stopword_file: path to a newline-separated stopword list
        """
        parser = Parser()
        for filename in os.listdir(trec_dir):
            try:
                with open(os.path.join(trec_dir, filename), 'r') as f:
                    parser.load(f.read())
            except Exception as e:
                # Best-effort: unreadable/unparseable files are skipped
                # silently.  NOTE(review): this can mask real bugs; consider
                # logging the exception.
                pass
        self.docs = parser.docs

        with open(stopword_file, 'r') as f:
            stopwords = f.readlines()
        # NOTE(review): readlines() keeps trailing newlines; presumably
        # Tokenizer strips them -- verify.
        self.tokenizer = Tokenizer(stopwords)

    def __iter__(self):
        # Lazily yield terms from all loaded documents.
        return self.tokenizer.iter_terms(self.docs)
Beispiel #20
0
    def test_throws_if_string_constant_has_missing_end_double_quote(self):
        """An unterminated string constant raises with its line/character."""
        tokenizer = Tokenizer(TEST_FILES / 'MalformedString.jack')
        line = 5
        character = 15
        # Consume the 16 valid tokens preceding the malformed string.
        for _ in range(16):
            tokenizer.next()

        with self.assertRaisesRegex(
                TokenizerError,
                f'(L|l)ine.*{line}.*(C|c)haracter.*{character}'):
            tokenizer.next()
Beispiel #21
0
    def test_throws_if_string_constant_has_unescaped_double_quote_within(self):
        """An unescaped quote inside a string raises with its line/character."""
        tokenizer = Tokenizer(TEST_FILES / 'UnescapedDoubleQuotesInString.jack')
        line = 5
        character = 15
        # Consume the 16 valid tokens preceding the malformed string.
        for _ in range(16):
            tokenizer.next()

        with self.assertRaisesRegex(
                TokenizerError,
                f'(L|l)ine.*{line}.*(C|c)haracter.*{character}'):
            tokenizer.next()
    def run(self):
        """Execute the configured pipeline stages: download, BPE training,
        tokenizer construction, and data collection."""

        if self.config.download:
            logger.info('Download')
            collector.download()

        if self.config.train_bpe:
            logger.info('Train BPE')
            collector.train_bpe()

        # NOTE(review): `collector` is not an attribute -- presumably a
        # module-level object; verify it is initialized before run() runs.
        self.tokenizer = Tokenizer(tokenizer_path=self.bpe_model_path,
                                   need_bos=True,
                                   need_eos=True,
                                   sep_token=self.config.sep_token,
                                   context_token=self.config.context_token)

        if self.config.collect_data:
            logger.info('Parse data')
            # override=True wipes any previous train/validation output dirs.
            self.make_dir(self.train_dir, override=True)
            self.make_dir(self.validation_dir, override=True)
            collector.collect()
Beispiel #23
0
def main(args):
    """Train a TransformerModel on NewsDataset per the CLI arguments.

    :param args: namespace with seed, model_dir, tokenizer, dropout, layer,
        heads, data, alpha, beta, inplace, sample, batch_size, epochs,
        save_epoch, summary_step, lr and warnup fields
    """
    set_seed(args.seed)
    Path(args.model_dir).mkdir(parents=True, exist_ok=True)

    tk = Tokenizer(args.tokenizer)

    # d_emb=-1 presumably means "tie embedding size to d_model" -- TODO
    # confirm against TransformerModel.
    model = TransformerModel(
        d_model=768,
        d_ff=1024,
        dropout=args.dropout,
        layers=args.layer,
        heads=args.heads,
        d_emb=-1,
        pad_token_id=tk.pad_id,
        vocab_size=tk.vocab_size
    )

    ds = NewsDataset(args.data, args.alpha, args.beta, inplace=args.inplace,
                     sample=args.sample, seed=args.seed)

    print(
        f'model size = {sum(p.numel() for p in model.parameters() if p.requires_grad)/1024/1024:.2f} M trainable parameters')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE(review): 'warnup' looks like a typo for 'warmup', but it must
    # match the keyword accepted by train() -- do not rename unilaterally.
    train(
        model=model,
        dataset=ds,
        batch_size=args.batch_size,
        device=device,
        tokenizer=tk,
        epochs=args.epochs,
        model_dir=args.model_dir,
        save_epoch=args.save_epoch,
        summary_step=args.summary_step,
        lr=args.lr,
        warnup=args.warnup)

    # Release cached GPU memory once training is done.
    torch.cuda.empty_cache()
Beispiel #24
0
def main(args):
    """Evaluate a trained checkpoint on the Arithmetic dataset with beam
    search, printing wall-clock time and exact-match accuracy."""
    set_seed(args.seed)

    tk = Tokenizer(args.tokenizer)

    ds = Arithmetic(args.data)

    dl = torch.utils.data.DataLoader(dataset=ds,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     collate_fn=ds.get_collate_fn(tk))

    # Architecture must mirror training time for the state dict to load.
    model = TransformerModel(d_model=32,
                             d_ff=64,
                             dropout=.0,
                             layers=3,
                             heads=4,
                             d_emb=-1,
                             pad_token_id=tk.pad_id,
                             vocab_size=tk.vocab_size)

    device = torch.device(args.device)

    model.load_state_dict(torch.load(args.ckpt, map_location=device)['model'])
    model.to(device)

    start = timeit.default_timer()
    total = 0
    tp = 0  # exact-match predictions
    for x, y in tqdm(dl):
        # The lambda appears to be a beam-expansion budget based on
        # (nx + ny) * b -- TODO confirm beam_search_v2's callback contract.
        p = beam_search_v2(model, x, tk, lambda b, nx, ny:
                           (nx + ny) * b > 4096 * 6 * 64, 1, device, 10)
        r = list(map(lambda i: i[0] == i[1], zip(p, y)))
        total += len(r)
        tp += np.count_nonzero(r)

    print((timeit.default_timer() - start))
    # NOTE(review): raises ZeroDivisionError if the dataloader is empty.
    print(tp / total)
Beispiel #25
0
class TestTokenizer(unittest.TestCase):
    """Tests the Tokenizer utility class methods."""

    def setUp(self):
        # Small multi-sentence fixture shared by the tests below.
        self.text_input = [
            'Hi! , How are you?', 'How everything is going?', 'how old are you'
        ]
        self.tokenizer = Tokenizer()

    def test_tokenizer_dictionary_contains_special_tokens(self):
        """test that on creation the
         dictionary contains the <sos>, <eos>, <pad> and <unk> tokens"""
        expected_num_dictionary_items = len(
            ['<sos>', '<eos>', '<pad>', '<unk>'])
        self.assertEqual(expected_num_dictionary_items,
                         self.tokenizer.dictionary_size)

    def test_text_to_number_works_without_trimming(self):
        """tests the methods properly splits the input sequence."""
        # min_keep_frequency=0 keeps every token.
        self.tokenizer.fit_on_text(self.text_input, min_keep_frequency=0)
        expected_dictionary_size = 14
        self.assertEqual(expected_dictionary_size,
                         self.tokenizer.dictionary_size)

    def test_text_to_number_works_with_trimming(self):
        """tests the methods properly splits the input sequence."""
        # Only tokens seen >= 3 times survive, plus the special tokens.
        self.tokenizer.fit_on_text(self.text_input, min_keep_frequency=3)
        expected_dictionary_size = 5
        self.assertEqual(expected_dictionary_size,
                         self.tokenizer.dictionary_size)

    def test_text_to_numbers(self):
        """tests tokenizer converts text into numbers"""
        input_text = ['how are you?']
        self.tokenizer.fit_on_text(input_text)
        text_indexes = self.tokenizer.convert_text_to_number(input_text)
        expected = [[4, 5, 6, 7, 2]]
        # NOTE(review): indexing the == result assumes numpy-style
        # elementwise comparison (a plain list == yields a single bool) --
        # verify the return type of convert_text_to_number.
        comparison = expected == text_indexes
        self.assertTrue(all(comparison[0]))

    def test_numbers_to_text(self):
        """tests tokenizer converts numbers back into text"""
        input_text = ['pytorch is awesome']
        self.tokenizer.fit_on_text(input_text)
        text = self.tokenizer.convert_number_to_text([4, 5, 6, 2])
        expected = input_text[0]
        self.assertEqual(expected, text)

    def test_filter_is_filtering_long_sentences(self):
        """testes the filter function removes the long token
         jointly together from both sources and targets"""
        source_numbers = [[1, 4], [4, 5, 6], [9]]
        target_numbers = [[11, 22, 33, 44], [44, 55], [88, 99, 100, 110]]

        # Only the pair where BOTH sides fit within max_token_size survives.
        filtered_sources, filtered_targets = self.tokenizer.filter(
            source_numbers,
            target_numbers,
            max_token_size=3,
            remove_unknown=False)
        expected_source = [[4, 5, 6]]
        expected_targets = [[44, 55]]
        self.assertListEqual(expected_source[0], filtered_sources[0])
        self.assertEqual(expected_targets[0], filtered_targets[0])

    def test_filter_removes_token_containing_unknown_token_index(self):
        """testes the filter function removes pairs with unknown tokens
        """
        unknown_index = self.tokenizer.unknown_index
        source_numbers = [[1, unknown_index], [4, 5], [9]]
        target_numbers = [[11, 22, 33], [44, unknown_index], [88, 99, 100]]

        # A pair is dropped if EITHER side contains the unknown index.
        filtered_sources, filtered_targets = self.tokenizer.filter(
            source_numbers,
            target_numbers,
            max_token_size=3,
            remove_unknown=True)
        expected_source = [[9]]
        expected_targets = [[88, 99, 100]]
        self.assertListEqual(expected_source[0], filtered_sources[0])
        self.assertEqual(expected_targets[0], filtered_targets[0])
 def __init__(self):
     """Create an empty inverted index with its own tokenizer."""
     super().__init__()
     self.tokenizer = Tokenizer()
     # Maps a term to its list of postings; missing terms start empty.
     self.index = defaultdict(list)
Beispiel #27
0
        lambda x: (x[0], float(x[1])),
        map(lambda x: x.strip().split('\t'),
            filter(len,
                   open('data/t2.1.vocab').readlines()))))

# Unzip (token, value) pairs loaded from the vocab file above.
w, f = zip(*d)
f = np.array(f)
# Values appear to be log-probabilities -- exponentiate and sum the tail
# past the first 39 entries (presumably special tokens; verify).
print(np.exp(f[39:]).sum())

# Frequency table of token lengths, rendered as Markdown table rows.
l = list(map(len, w[39:]))
freq = sorted(dict(Counter(l)).items(), key=lambda x: x[0])
print('\n'.join(f'|{i}|{j}|' for i, j in freq))
# In[]
ds = NewsDataset('data/news_dataset_tag10_v2.1.db')
# ds = NewsDataset('data/wiki.db')
tk = Tokenizer('data/t2.1_c1')

# In[]
from src.utils import peek
# Sample one record and round-trip it: detokenize(tokenize(x)) should
# reproduce the original text.
d = peek(ds.data, 1)
print(d[0][2])
print(tk.detokenize(tk.tokenize(d[0][2])))
# In[]
# Min / quartiles / max of article text lengths.
ll = list(map(lambda x: len(x[2]), ds.data))
sl = sorted(ll)
print(sl[0])
print(sl[int(len(sl) * 0.25)])
print(sl[int(len(sl) * 0.5)])
print(sl[int(len(sl) * 0.75)])
print(sl[-1])
Beispiel #28
0
def parse(code):
    """Tokenize *code* and run the parser over the resulting token stream."""
    tokens = Tokenizer(code).run()
    return Parser(tokens).run()
    arg_parser.add_argument('--prefix', type=str, default='char_level_gru')
    arg_parser.add_argument('--metric_name', type=str, default='val_ppl')
    arg_parser.add_argument('--device', choices=['cuda', 'cpu'], default='cpu')
    arg_parser.add_argument('--seed', type=int, default=42)
    args = arg_parser.parse_args()

    seed_all(args.seed)
    print('########################################')
    print('Load data')
    train_data = WordsDataset._read_data(args.train_path)
    valid_data = WordsDataset._read_data(args.valid_path)

    print(f'Train size: {len(train_data)}\nValid size: {len(valid_data)}')
    print('########################################')

    tokenizer = Tokenizer(train_data)

    print('########################################')
    print('Build datasets')
    train_dataset = WordsDataset(train_data,
                                 tokenizer,
                                 max_length=args.max_length)
    valid_dataset = WordsDataset(valid_data,
                                 tokenizer,
                                 max_length=args.max_length)

    print('########################################')
    print('Build dataloaders')
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)
Beispiel #30
0
 def setUp(self):
     """Create a fresh Tokenizer and a small multi-sentence fixture."""
     self.text_input = [
         'Hi! , How are you?', 'How everything is going?', 'how old are you'
     ]
     self.tokenizer = Tokenizer()
Beispiel #31
0
def tokenizeUserInput(filmReviewList):
    """Tokenize a list of film reviews and return the resulting tokens.

    :param filmReviewList: raw film-review strings to tokenize
    :return: tokens produced by Tokenizer.doTokenization
    """
    print("[2] Tokenization of the Token Review Started")
    tokenList = Tokenizer.doTokenization(filmReviewList)
    # f-string replaces the '+ str(...)' concatenation; output is identical.
    print(f"[2] Tokenization of the Review comments completed. Tokens: {tokenList}")
    return tokenList