class PhraseTokenizer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def tokenize(self, phrase):
        return self.tokenizer.tokenize(phrase)
def test_with_invalid_token(self):
    tokenizer = Tokenizer()
    # An invalid token must not yield any origin data.
    with self.assertRaises(InvalidTokenError):
        tokenizer.get_data_by_token('invalid_token')
class PhraseCleaner:
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def process(self, phrase):
        return u' '.join(self.tokenizer.tokenize(phrase))
def create_indexer(self):
    # Get the list of URLs to be removed.
    with open('database/removed_urls.pkl', 'rb') as f:
        removed_url = pickle.load(f)
    # The hash_doc file stores the mapping of doc_id to URL.
    hash_doc = open(self.doc_file, "w+")
    for dir in self.path_to_db.iterdir():
        if dir.is_dir():
            for file in dir.iterdir():
                if not file.is_file():
                    continue
                with open(file, 'r', encoding="ascii", errors="ignore") as fp:
                    parsed_json = json.load(fp)
                url = parsed_json['url']
                if url in removed_url:
                    continue
                url = self.removeFragment(url)
                content = parsed_json['content']
                tokenizer = Tokenizer(content, self.ngram)
                token_tf = tokenizer.extract_texts()
                hash_doc.write("%d, %s, %d\n" %
                               (self.count_files, url, tokenizer.length))
                self.add_tokens_to_dictionary(token_tf, self.count_files)
                self.count_files += 1
    hash_doc.close()
    self.save_to_file()
    self.recalculate_tf_idf()
def __init__(self, parent=None):
    super().__init__(parent=parent)
    # self.setFont(QFont("default", 9))
    self.setWindowOpacity(1)
    self.setWindowIcon(QIcon("assets/calculator.png"))
    self.setWindowTitle("Calculator")

    sci = CalculatorView(ctype=GeneralCalcView._scientific)
    vec1 = CalculatorView(ctype=GeneralCalcView._vector1d)
    vec2 = CalculatorView(ctype=GeneralCalcView._vector2d)

    sci_controller = CalculatorController(
        sci, InputController(GeneralCalcView._scientific), Tokenizer("real"))
    vec1_controller = CalculatorController(
        vec1, InputController(GeneralCalcView._vector1d), Tokenizer("vec1"))
    vec2_controller = CalculatorController(
        vec2, InputController(GeneralCalcView._vector2d), Tokenizer("vec2"))

    self.addTab(sci, QIcon("assets/calculator.png"), GeneralCalcView._scientific)
    self.addTab(vec1, QIcon("assets/vector1d.png"), GeneralCalcView._vector1d)
    self.addTab(vec2, QIcon("assets/vector2d.png"), GeneralCalcView._vector2d)
def __init__(self,
             img_width,
             img_height,
             n_chars=7,
             chars=None,
             labels_path='/path/to/the/annotated/file',
             root_img_dir='/path/to/img/dir'):
    self.n_chars = n_chars
    if chars is None:
        self.chars = list(
            '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
    else:
        self.chars = list(chars)
    self.tokenizer = Tokenizer(self.chars)
    df = pd.read_csv(labels_path, dtype={'img_id': str})
    self.annotaded_data = df.loc[df['text'] != 'no_one']
    self.root_img_dir = root_img_dir
    self.img_trans = transforms.Compose([
        transforms.Resize((img_height, img_width)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
def tokenize_raw_text(save_dir):
    text_save_dir = os.path.join(save_dir, 'text_files')
    numpy_vectors_save_dir = os.path.join(save_dir, 'numpy_vectors')
    remove_folder(numpy_vectors_save_dir)
    make_folder(numpy_vectors_save_dir)

    hadms = []
    for filename in os.listdir(text_save_dir):
        if ".txt" in filename:
            hadm = filename.replace(".txt", "")
            hadms.append(hadm)
    log(f"Total number of text files in set: {len(hadms)}")

    log(f'Loading vocab dict from {VOCAB_DICT_PATH}')
    with open(VOCAB_DICT_PATH, 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab)

    for hadm in tqdm.tqdm(hadms, desc='Tokenizing raw patient notes'):
        text = open(os.path.join(text_save_dir, str(hadm) + ".txt"), "r").read()
        words = tokenizer.process(text)
        vector = []
        for word in words:
            if word in vocab:
                vector.append(vocab[word])
            elif tokenizer.only_numerals(word) and (
                    len(vector) == 0 or vector[-1] != vocab["<NUM>"]):
                # Collapse runs of numerals into a single <NUM> token.
                vector.append(vocab["<NUM>"])
        mat = np.array(vector)
        # Saving word indices to file.
        write_file = os.path.join(numpy_vectors_save_dir, f"{hadm}.npy")
        np.save(write_file, mat)
def generateYelpSentenceExample(filename):
    tok = Tokenizer(preserve_case=False)
    # Extract tokens and the star rating from each review.
    for line in data.generateLine(filename):
        review = json.loads(line)
        tokens = tok.sentence_tokenize(review['text'])
        stars = int(review['stars'])
        yield tokens, stars
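# A minimal usage sketch for the generator above (hypothetical: assumes a
# Yelp-reviews JSON-lines file at 'yelp_reviews.json'; the filename and the
# rating histogram are illustrative, not from the original project).
from collections import Counter

star_counts = Counter()
for tokens, stars in generateYelpSentenceExample('yelp_reviews.json'):
    star_counts[stars] += 1
print(star_counts)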
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lemmatizer = Mystem()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
def __init__(self, reduce_mode="gmean", device="cuda"):
    if device == "cpu":
        logger.warning("Running LMScorer on CPU. Scoring may be slow.")
    self.model = LMScorer.from_pretrained("gpt2", device=device, batch_size=1)
    self.reduce_mode = reduce_mode
    self.tokenizer = Tokenizer()
def test_tokenizer(self):
    tokenizer = Tokenizer()
    origin_data = {
        'some_key': 'some_value',
        'additional_key': 'additional_value',
    }
    token = tokenizer.get_token_by_data(origin_data)
    self.assertIsNotNone(token)
    data_from_token = tokenizer.get_data_by_token(token)
    self.assertEqual(origin_data, data_from_token)
class PhraseLemmatizer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()
        self.lemmatizer = Mystem()

    def tokenize(self, phrase):
        words = self.tokenizer.tokenize(phrase)
        wx = u' '.join(words)
        return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]
def test_expired_token(self):
    tokenizer = Tokenizer()
    origin_data = {
        'some_key': 'some_value',
        'additional_key': 'additional_value',
    }
    token = tokenizer.get_token_by_data(origin_data, datetime(2018, 1, 1))
    # A token issued on 2018-01-01 must be expired a week later.
    with self.assertRaises(TokenExpiredError):
        tokenizer.get_data_by_token(token, datetime(2018, 1, 8))
class PhraseStemmer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.stemmer = RussianStemmer()

    def tokenize(self, phrase):
        words = self.tokenizer.tokenize(phrase)
        return [
            self.stemmer.stem(w) for w in words if len(w.strip()) > 0
        ]
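# A minimal sketch comparing the three PhraseSplitter implementations above on
# one phrase (assumes their dependencies, e.g. the rutokenizer-style Tokenizer,
# pymystem3's Mystem and NLTK's RussianStemmer, are installed; the sample
# phrase is illustrative).
for splitter in (PhraseTokenizer(), PhraseLemmatizer(), PhraseStemmer()):
    print(splitter.__class__.__name__, splitter.tokenize(u'кошки ловят мышей'))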
def set_custom_word(self, path):
    self.check_detector_initialized()
    word_freqs = self.load_word_freq_dict(path)
    # Merge the dictionaries.
    self.custom_word_freq.update(word_freqs)
    # Merge the segmentation dictionary with the custom dictionary.
    self.word_freq.update(self.custom_word_freq)
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    for k, v in word_freqs.items():
        self.set_word_frequency(k, v)
    logger.debug('Loaded custom word path: %s, size: %d' %
                 (path, len(word_freqs)))
def __init__(self, img_width, img_height, ds_size, n_chars=4, chars=None):
    self.gen = ImageCaptcha(img_width, img_height)
    self.size = ds_size
    self.n_chars = n_chars
    if chars is None:
        self.chars = list(
            '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
    else:
        self.chars = list(chars)
    self.tokenizer = Tokenizer(self.chars)
    self.first_run = True
def setUp(self):
    self.tokenizer = Tokenizer({
        'instruction_names': ['MOV', 'JMP'],
        'macro_names': ['DAT'],
        'word_registers': WORD_REGISTERS,
        'byte_registers': BYTE_REGISTERS,
    })
class SentenceScorer:
    def __init__(self, reduce_mode="gmean", device="cuda"):
        if device == "cpu":
            logger.warning("Running LMScorer on CPU. Scoring may be slow.")
        self.model = LMScorer.from_pretrained("gpt2", device=device, batch_size=1)
        self.reduce_mode = reduce_mode
        self.tokenizer = Tokenizer()

    def score(self, sentence):
        sentence = self.tokenizer.detokenize(sentence)
        return self.model.sentence_score(sentence, reduce=self.reduce_mode, log=True)

    def select_best(self, sentences):
        scores = []
        for sent in sentences:
            sent_score = self.score(sent)
            scores.append((sent, sent_score))
        scores.sort(key=lambda x: x[1], reverse=True)
        # pp(scores)
        return scores[0][0]
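# A minimal usage sketch for SentenceScorer (assumes the lm-scorer package and
# the Tokenizer dependency are available; the candidate token lists are
# illustrative). select_best returns the candidate GPT-2 scores highest.
scorer = SentenceScorer(reduce_mode="gmean", device="cpu")
candidates = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["cat", "the", "mat", "sat", "on", "the"],
]
print(scorer.select_best(candidates))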
def predict(sentences,
            config_file,
            ner_model_list,
            pretrain_model_file,
            dense_layer_model_file,
            re_model_file,
            vocab_file=None,
            device='cpu'):
    cfg = Config()
    cfg.load_config(config_file)
    vocab_file = cfg.config['vocab'] if vocab_file is None else vocab_file
    vocab = load_vocab(vocab_file)
    tokenizer = Tokenizer(vocab)
    pretrain_checkpoint = torch.load(pretrain_model_file, map_location=device)
    ner_label2id = {'B': 0, 'I': 1, 'O': 2, 'X': 3, '[start]': 4, '[end]': 5}
    re_label2id = {
        "NA": 0,
        "gene_associated_with_disease": 1,
        "disease_associated_with_tissue": 2,
        "disease_associated_with_disease": 3,
        "tissue_associated_with_tissue": 4,
    }
    res = ner_predict(sentences, ner_label2id, cfg, ner_model_list,
                      pretrain_checkpoint, vocab, tokenizer, device)
    entities = [item['entity'] for item in res]
    for idx, sent in enumerate(sentences):
        relations = re_predict(sent, [entities[idx]], re_label2id, cfg,
                               pretrain_checkpoint, dense_layer_model_file,
                               re_model_file, vocab, tokenizer, device)
        res[idx]['relation'] = relations
    return res
def initialize_detector_dict(self):
    t1 = time.time()
    self.confusions = dict()
    self.spec_nouns = self.load_dict(self.spec_nouns_path)
    self.gangtai = self.load_dict(self.gangtai_path)
    self.common_confusion = self.load_dict(self.common_confusion_path)
    self.confusions.update(self.spec_nouns)
    self.confusions.update(self.gangtai)
    self.confusions.update(self.common_confusion)
    self.confusions_words = list(self.confusions.keys())
    confusions_values = list(self.confusions.values())
    self.confusions_words.extend(confusions_values)
    # Word-to-frequency dict.
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    # Custom segmentation dictionary.
    self.custom_word_freq = self.load_word_freq_dict(self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # Merge the segmentation dictionary with the custom dictionaries.
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_confusion_dict=self.confusions,
                               custom_word_freq_dict=self.custom_word_freq)
    logger.debug('Loaded file: %s, size: %d, spend: %s s' %
                 (self.spec_nouns_path, len(self.confusions),
                  str(time.time() - t1)))
    self.initialized_detector_dict = True
def _initialize_detector(self):
    t1 = time.time()
    try:
        import kenlm
    except ImportError:
        raise ImportError(
            'pycorrector dependencies are not fully installed; '
            'they are required for the statistical language model. '
            'Please use "pip install kenlm" to install it. '
            'If you are on Windows, please install kenlm under Cygwin.')
    if not os.path.exists(self.language_model_path):
        filename = self.pre_trained_language_models.get(
            self.language_model_path, 'zh_giga.no_cna_cmn.prune01244.klm')
        url = self.pre_trained_language_models.get(filename)
        get_file(filename,
                 url,
                 extract=True,
                 cache_dir=config.USER_DIR,
                 cache_subdir=config.USER_DATA_DIR,
                 verbose=1)
    self.lm = kenlm.Model(self.language_model_path)
    t2 = time.time()
    logger.debug('Loaded language model: %s, spend: %.3f s.' %
                 (self.language_model_path, t2 - t1))

    # Word-to-frequency dict.
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    # Custom confusion set.
    self.custom_confusion = self._get_custom_confusion_dict(
        self.custom_confusion_path)
    # Custom segmentation dictionary.
    self.custom_word_freq = self.load_word_freq_dict(self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # Merge the segmentation dictionary with the custom dictionaries.
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    t3 = time.time()
    logger.debug('Loaded dict file, spend: %.3f s.' % (t3 - t2))
    self.initialized_detector = True
def __init__(self, data_root, split='train', vocab_json=None):
    assert split in ['train', 'test', 'valid'], 'Invalid split'
    self.data_root = data_root
    self.df = pd.read_csv(
        os.path.join(self.data_root, 'data/', '{}_data.csv'.format(split)))
    self.lang = Tokenizer()
    if vocab_json is None:
        self.lang.add_words(self.df['action'])
        self.lang.add_words(self.df['object'])
        self.lang.add_words(self.df['location'])
        self.lang.make_dicts()
        self.vocab_json = 'word2idx.json'
        self.lang.export_json(self.vocab_json)
    else:
        self.vocab_json = vocab_json
        self.lang.import_json(vocab_json)
class TextUtils(object):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.lemmatizer = Mystem()
        self.lexicon = Word2Lemmas()
        self.language_resources = LanguageResources()

    def load_dictionaries(self, data_folder):
        word2lemmas_path = os.path.join(data_folder, 'ru_word2lemma.tsv.gz')
        self.lexicon.load(word2lemmas_path)

    def canonize_text(self, s):
        # Collapse runs of two or more whitespace characters into one space.
        s = re.sub("(\\s{2,})", ' ', s.strip())
        return s

    def ngrams(self, s, n):
        return [u''.join(z) for z in zip(*[s[i:] for i in range(n)])]

    def words2str(self, words):
        return u' '.join(
            itertools.chain([BEG_WORD], filter(lambda z: len(z) > 0, words),
                            [END_WORD]))

    def tokenize(self, s):
        return self.tokenizer.tokenize(s)

    def lemmatize(self, s):
        words = self.tokenizer.tokenize(s)
        wx = u' '.join(words)
        return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]

    # Pad the word sequence with empty words on the left.
    def lpad_wordseq(self, words, n):
        return list(
            itertools.chain(itertools.repeat(PAD_WORD, n - len(words)), words))

    # Pad the word sequence with empty words on the right.
    def rpad_wordseq(self, words, n):
        return list(
            itertools.chain(words, itertools.repeat(PAD_WORD, n - len(words))))

    def get_lexicon(self):
        return self.lexicon
class SSIGALPRDataset(Dataset):
    def __init__(self,
                 img_width,
                 img_height,
                 n_chars=7,
                 chars=None,
                 labels_path='/path/to/the/annotated/file',
                 root_img_dir='/path/to/img/dir'):
        self.n_chars = n_chars
        if chars is None:
            self.chars = list(
                '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
        else:
            self.chars = list(chars)
        self.tokenizer = Tokenizer(self.chars)
        df = pd.read_csv(labels_path, dtype={'img_id': str})
        self.annotaded_data = df.loc[df['text'] != 'no_one']
        self.root_img_dir = root_img_dir
        self.img_trans = transforms.Compose([
            transforms.Resize((img_height, img_width)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return self.annotaded_data.shape[0]

    def __getitem__(self, item):
        annotaded_item = self.annotaded_data.iloc[item]
        img_id = annotaded_item[0]
        img_path = self.root_img_dir + '/' + img_id + '.png'
        img = Image.open(img_path)
        width, height = img.size
        # Bounding-box coordinates are stored as fractions of the image size.
        x0 = annotaded_item[1] * width
        y0 = annotaded_item[2] * height
        x1 = annotaded_item[3] * width
        y1 = annotaded_item[4] * height
        roi = img.crop((x0, y0, x1, y1))
        groundtruth = annotaded_item[5]
        # Fixed-length label vector, padded with EOS tokens.
        groundtruth_label = torch.full((self.n_chars + 2, ),
                                       self.tokenizer.EOS_token,
                                       dtype=torch.long)
        ts = self.tokenizer.tokenize(groundtruth)
        groundtruth_label[:ts.shape[0]] = torch.tensor(ts)
        return self.img_trans(roi), groundtruth_label
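# A hypothetical usage sketch: wrapping SSIGALPRDataset in a PyTorch DataLoader
# to iterate over batches of cropped plate images and padded labels (the paths
# and sizes below are placeholders, not values from the original project).
from torch.utils.data import DataLoader

dataset = SSIGALPRDataset(img_width=256,
                          img_height=96,
                          labels_path='labels.csv',
                          root_img_dir='images')
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for images, labels in loader:
    pass  # images: (32, 3, 96, 256); labels: (32, n_chars + 2)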
def parse_document(document, type='single'):
    document = document.replace('\r', '').replace('\n', '')
    doc_num = re.search('<DOCNO>(.*?)</DOCNO>', document).group(1).strip()
    doc_body = re.search('<TEXT>(.*?)</TEXT>', document).group(1)
    processed_doc = pre_process(doc_body)
    tokens, positions = Tokenizer(processed_doc, type).tokenize_text()
    doc_len = len(tokens)
    return doc_num, tokens, positions, doc_len
def load_data(seq_length, label):
    print("Start to load data.")
    start_time = time.time()

    # emb: collections.OrderedDict mapping word (str) to its embedding
    #      vector (list of float).
    # dict_length: vocabulary size.
    # emb_size: embedding dimensionality.
    emb, dict_length, emb_size = get_emb()

    # Instantiate a tokenizer over all words (str) in the vocabulary.
    tokenizer = Tokenizer(emb.keys())

    # emb_matrix: matrix mapping IDs to embedding vectors.
    # Each word type gets an ID (e.g. "的" is 1, "是" is 2, and so on);
    # row i of the matrix is the embedding vector of the word with ID i.
    emb_matrix = get_emb_matrix(emb, tokenizer, dict_length, emb_size)

    # Instantiate the ChnSentiCorp_Clf-style data loader; its constructor
    # already splits the data into training and test sets.
    data_loader = Tnews_ChnCorp_Clf(label)

    # Training data: a list of data_example instances, each with two
    # attributes: text (str) and label (str).
    train_examples = data_loader.get_train_examples()
    # Validation data, same structure as train_examples.
    dev_examples = data_loader.get_dev_examples()

    def generate_dataloader(examples, tokenizer, seq_length):
        """
        Build a data loader.
        :param examples: list of data_example instances, each with two
            attributes: text (str) and label_id (int)
        :param tokenizer:
        :param seq_length: length of one sample/sequence
        :return: dataloader, an iterable
        """
        features = multi_convert_example_to_feature(examples, tokenizer,
                                                    seq_length)
        # ids: tensor (from a list); each element is the ID sequence of a
        # sample's text, one ID per character.
        ids = torch.tensor([f.ids for f in features], dtype=torch.long)
        # label_ids: tensor (from a list); each element is a sample's label ID.
        label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
        dataset = TensorDataset(ids, label_ids)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        return dataloader

    train_dataloader = generate_dataloader(train_examples, tokenizer, seq_length)
    dev_dataloader = generate_dataloader(dev_examples, tokenizer, seq_length)

    end_time = time.time()
    print("Data loading finishes. Time span: {:.2f}s".format(end_time - start_time))
    return emb_matrix, train_dataloader, dev_dataloader, tokenizer
def train(config, device, RS='Supervised'):
    # Init tokenizer.
    tokenizer = Tokenizer(config.temp_dir, config.jieba_dict_file,
                          config.remove_stopwords, config.stopwords_file,
                          config.ivr)
    # Init feature index.
    feature_index = FeatureIndex(config, tokenizer=tokenizer)
    file_list = [config.labeled_file]
    if config.extra_train_file is not None:
        file_list.append(config.extra_train_file)
    if config.valid_file is not None:
        file_list.append(config.valid_file)
    feature_index.build_index(file_list)
    # Preprocess data.
    pre_process = PreProcess(config)
    train_data_dir, valid_data_dir, final_train_file, final_valid_file = \
        pre_process.train_preprocess()
    # Get PyTorch datasets.
    train_dataset = MixnetDataset(config, train_data_dir, feature_index, tokenizer)
    valid_dataset = MixnetDataset(config, valid_data_dir, feature_index,
                                  tokenizer, True)
    # Get NER model if necessary and compatible.
    need_ner = False
    for (feature, feature_config) in config.feature_config_dict.items():
        need_ner = need_ner or (
            "text" in feature_config.get("type", "")
            and feature_config.get("seg_type", "word") == "char"
            and feature_config.get("ner", False))
    if need_ner:
        logger.info("Enable NER, loading NER model...")
        # Use predict mode since we cannot train it without tag information.
        ner_model = NERModel(device, "predict")
    else:
        logger.info("Disable NER.")
        ner_model = None
    # Get PyTorch data loaders.
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=1,
                                   shuffle=False,
                                   num_workers=config.read_workers)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=1,
                                   shuffle=False,
                                   num_workers=config.read_workers)
    # Init model.
    model = MixNet(config.model_config_dict, config.output_config_dict,
                   feature_index.feature_info_dict,
                   feature_index.label_info_dict,
                   ner_model=ner_model)
    # Train model.
    solver = Solver(config, train_data_loader, valid_data_loader,
                    feature_index, model, device, RS)
    solver.build()
    solver.train()
def __init__(self):
    self.tokenizer = Tokenizer()
    self.data_path = global_config.project_path + '/data/transcriptions'
    self.vocabularies = {"voc_l_1": [], "voc_l_2": []}
    self.correct_tokenizing = {"pairs_incorrect": 0, "pairs_correct": 0}
    data1, data2 = itertools.tee(self.data_loading(), 2)
    self.create_voc(data1)
    self.find_s_freq_th(data2)
    for el in self.correct_tokenizing:
        print(el, self.correct_tokenizing[el])
    for v in self.vocabularies:
        self.find_w_freq_th(self.vocabularies[v], v)
def chunk_text(text, tokenize=False):
    splitter = SentenceSplitter()
    chunker = Chunker()
    if tokenize:
        # NOT YET FINISHED
        tokenized_text = Tokenizer(text).tokenize_text()
    else:
        text = text.lower()
    sentences = splitter.split(text)
    chunker.chunk(sentences, len(text))
    # chunker.pp_chunks()
    return chunker.get_chunks()
def read_queries(self, static):
    queries = []
    global number, title
    avg_query = 0
    with open(self.opt['query_dir'], 'r') as f:
        for line in f:
            if 'num' in line:
                number = line.split("Number:", 1)[1].strip()
            elif 'title' in line and not self.opt['threshold']:
                title = line.split("Topic:", 1)[1].strip()
                if static:
                    qterms = Tokenizer(title.lower(),
                                       self.opt['index_type']).tokenize_text()
                else:
                    qterms = Tokenizer(title.lower(), 'single').tokenize_text()
                q = Query(title.lower(), number,
                          [q.lower() for q in qterms[0]])
                if '/' in title:
                    q.split_slash()
                queries.append(q)
            elif 'narr' in line and self.opt['threshold']:
                line = f.readline()
                narrative = ''
                while '</top>' not in line:
                    narrative += line.replace('\n', '')
                    line = f.readline()
                qterms = Tokenizer(narrative.lower().strip(),
                                   'single').tokenize_text()
                avg_query += len(qterms[0])
                q = Query(narrative.lower(), number,
                          [q.lower() for q in qterms[0]])
                queries.append(q)
    if self.opt['threshold']:
        print('Average Query Length before reduction: {0:.2f}'.format(
            avg_query / float(len(queries))))
    return queries
def load_dataset(params):
    tokenizer = Tokenizer()
    tokenizer.load()

    # The dataset must be built beforehand by the
    # ./preparation/prepare_req_interpretation_classif.py script.
    df = pd.read_csv(os.path.join(data_folder, 'req_interpretation_dataset.csv'),
                     sep='\t',
                     encoding='utf-8')
    samples = [
        Sample(row['text'], int(row['label'])) for i, row in df.iterrows()
    ]

    # Tokenize the samples.
    for sample in samples:
        sample.words = tokenizer.tokenize(sample.phrase)

    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    max_wordseq_len = max(len(sample.words) for sample in samples)
    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    if params['padding'] == 'left':
        for sample in samples:
            sample.words = lpad_wordseq(sample.words, max_wordseq_len)
    else:
        for sample in samples:
            sample.words = rpad_wordseq(sample.words, max_wordseq_len)

    computed_params = {
        'max_wordseq_len': max_wordseq_len,
        'nb_0': nb_0,
        'nb_1': nb_1,
    }
    return samples, computed_params
def __init__(self,
             config,
             restart,
             frontier_factory=Frontier,
             worker_factory=Worker,
             subdomain_printer_factory=SubDomainPrinter,
             tokenizer_factory=Tokenizer):
    self.config = config
    self.logger = get_logger("CRAWLER")
    self.frontier = frontier_factory(config, restart)
    self.workers = list()
    self.worker_factory = worker_factory
    # Use the injected factories so they can be swapped out in tests.
    self.subdomain_printer = subdomain_printer_factory(config, restart)
    self.tokenizer = tokenizer_factory(config, restart)
def __init__(self, instruction_set, registers):
    self.instruction_names = [
        inst.__name__ for opcode, inst in instruction_set
    ]
    self.instruction_mapping = {
        inst.__name__: (opcode, inst) for opcode, inst in instruction_set
    }
    self.macro_names = ['DAT', 'DATN']
    self.word_registers = registers['word']
    self.byte_registers = registers['byte']
    self.keywords = set(self.instruction_names + self.macro_names +
                        self.word_registers + self.byte_registers)
    self.tokenizer = Tokenizer({
        'instruction_names': self.instruction_names,
        'macro_names': self.macro_names,
        'word_registers': self.word_registers,
        'byte_registers': self.byte_registers,
    })
    self._reset_state()
class TestTokenizer(unittest.TestCase):
    def setUp(self):
        self.tokenizer = Tokenizer({
            'instruction_names': ['MOV', 'JMP'],
            'macro_names': ['DAT'],
            'word_registers': WORD_REGISTERS,
            'byte_registers': BYTE_REGISTERS,
        })

    def test_label_alone(self):
        tokens = self.tokenizer.tokenize('labelname:')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'labelname', 0),
        ])

    def test_code_alone(self):
        tokens = self.tokenizer.tokenize('mov ax 0x0100')
        self.assertListEqual(tokens, [
            Token(TokenType.INSTRUCTION, 'MOV', 0),
            Token(TokenType.WORD_REGISTER, 'AX', 4),
            Token(TokenType.WORD_LITERAL, 256, 7),
        ])

    def test_label_code_comment(self):
        tokens = self.tokenizer.tokenize('labelname: mov ax 0x0100 # comment text')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'labelname', 0),
            Token(TokenType.INSTRUCTION, 'MOV', 11),
            Token(TokenType.WORD_REGISTER, 'AX', 15),
            Token(TokenType.WORD_LITERAL, 256, 18),
            Token(TokenType.COMMENT, ' comment text', 26),
        ])

    def test_random_case(self):
        tokens = self.tokenizer.tokenize('LabeL: mOv aX 0x00fF')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'LabeL', 0),
            Token(TokenType.INSTRUCTION, 'MOV', 7),
            Token(TokenType.WORD_REGISTER, 'AX', 11),
            Token(TokenType.WORD_LITERAL, 255, 14),
        ])

    def test_random_whitespace(self):
        tokens = self.tokenizer.tokenize('   mov     ax               0x0100      ')
        self.assertListEqual(tokens, [
            Token(TokenType.INSTRUCTION, 'MOV', 3),
            Token(TokenType.WORD_REGISTER, 'AX', 11),
            Token(TokenType.WORD_LITERAL, 256, 28),
        ])

    def test_almost_keyword_identifiers(self):
        tokens = self.tokenizer.tokenize('MOVE AXE ALL BEEF FF')
        self.assertListEqual(tokens, [
            Token(TokenType.IDENTIFIER, 'MOVE', 0),
            Token(TokenType.IDENTIFIER, 'AXE', 5),
            Token(TokenType.IDENTIFIER, 'ALL', 9),
            Token(TokenType.IDENTIFIER, 'BEEF', 13),
            Token(TokenType.IDENTIFIER, 'FF', 18),
        ])

    def test_every_token_type(self):
        tokens = self.tokenizer.tokenize('label: other_label: MOV AX AL 0x1234 0x12 ^0x1234 ^label [AX] [AX+0x12]B [AX-0x12]B [0x1234]B [label]B [0x1234+0x56] [label+0x56] [0x1234+AX] [label+AX] JMP third_label .DAT "hello world with kinda # comment" # actual comment')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'label', 0),
            Token(TokenType.LABEL, 'other_label', 7),
            Token(TokenType.INSTRUCTION, 'MOV', 20),
            Token(TokenType.WORD_REGISTER, 'AX', 24),
            Token(TokenType.BYTE_REGISTER, 'AL', 27),
            Token(TokenType.WORD_LITERAL, 4660, 30),
            Token(TokenType.BYTE_LITERAL, 18, 37),
            Token(TokenType.ADDRESS_WORD_LITERAL, 4660, 42),
            Token(TokenType.ADDRESS_LABEL, 'label', 50),
            Token(TokenType.ABS_REF_REG, Reference('AX', 0, 'W'), 57),
            Token(TokenType.ABS_REF_REG, Reference('AX', 18, 'B'), 62),
            Token(TokenType.ABS_REF_REG, Reference('AX', -18, 'B'), 73),
            Token(TokenType.REL_REF_WORD, Reference(4660, None, 'B'), 84),
            Token(TokenType.REL_REF_LABEL, Reference('label', None, 'B'), 94),
            Token(TokenType.REL_REF_WORD_BYTE, Reference(4660, 86, 'W'), 103),
            Token(TokenType.REL_REF_LABEL_BYTE, Reference('label', 86, 'W'), 117),
            Token(TokenType.REL_REF_WORD_REG, Reference(4660, 'AX', 'W'), 130),
            Token(TokenType.REL_REF_LABEL_REG, Reference('label', 'AX', 'W'), 142),
            Token(TokenType.INSTRUCTION, 'JMP', 153),
            Token(TokenType.IDENTIFIER, 'third_label', 157),
            Token(TokenType.MACRO, 'DAT', 169),
            Token(TokenType.STRING_LITERAL, 'hello world with kinda # comment', 174),
            Token(TokenType.COMMENT, ' actual comment', 210),
        ])

    def test_error_unexpected_char(self):
        with self.assertRaises(UnexpectedCharacterError):
            self.tokenizer.tokenize('label: mov ?')

    def test_error_invalid_string_literal(self):
        with self.assertRaises(InvalidStringLiteralError):
            self.tokenizer.tokenize('label: mov \'single quote \\\' between single quotes\'')

    def test_error_unknown_macro(self):
        with self.assertRaises(UnknownMacroError):
            self.tokenizer.tokenize('label: .mac x')
# These words are needed to skip sentences that contain
# distorted vocabulary and the like.
rx1 = re.compile(u'[абвгдеёжзийклмнопрстуфхцчшщъыьэюя]+')
dict_words = set()
with zipfile.ZipFile(os.path.join(data_folder, 'ruwords.txt.zip')) as z:
    with z.open('ruwords.txt') as rdr:
        for line in rdr:
            word = line.decode('utf-8').strip()
            if rx1.match(word) is not None:
                dict_words.add(word)

uniq_phrases = set()
phrases = []
all_words = set()
tokenizer = Tokenizer()
for corpus_filepath in glob.glob(
        os.path.join(data_folder,
                     r'e:\MVoice\lem\dictionary.src\corpus\syntax-ru.*.xml')):
    print(u'Parsing {}'.format(corpus_filepath))
    with codecs.open(corpus_filepath, 'r', 'utf-8') as rdr:
        for line in rdr:
            if line.startswith(u'<text>'):
                line = line.replace(u'<text>', u'').replace(u'</text>', u'').strip()
                if line not in uniq_phrases:
                    uniq_phrases.add(line)
                    words = tokenizer.tokenize(line)
                    if len(words) <= MAX_SENT_LEN:
                        all_words_known = True
                        for word in words:
                            if word not in dict_words:
                                all_words_known = False
                                break
class Assembler:
    '''
    Assembler
    '''

    def __init__(self, instruction_set, registers):
        self.instruction_names = [
            inst.__name__ for opcode, inst in instruction_set
        ]
        self.instruction_mapping = {
            inst.__name__: (opcode, inst) for opcode, inst in instruction_set
        }
        self.macro_names = ['DAT', 'DATN']
        self.word_registers = registers['word']
        self.byte_registers = registers['byte']
        self.keywords = set(self.instruction_names + self.macro_names +
                            self.word_registers + self.byte_registers)
        self.tokenizer = Tokenizer({
            'instruction_names': self.instruction_names,
            'macro_names': self.macro_names,
            'word_registers': self.word_registers,
            'byte_registers': self.byte_registers,
        })
        self._reset_state()

    def assemble_file(self, filename):
        '''
        Assemble source code file and write to executable file
        '''
        logger.info('Assembling %s...', filename)
        source_code = ''
        with open(filename, 'rt') as input_file:
            source_code = input_file.read()
        opcode = self.assemble_code(source_code)
        binary_filename = os.path.splitext(filename)[0]
        exe = Executable(1, opcode)
        exe.save_to_file(binary_filename)
        logger.info('Assembled %s (%d bytes).', binary_filename, exe.length)

    def assemble_code(self, source_code):
        '''
        Assemble source code and return opcode
        '''
        self._reset_state()
        self.source_code = source_code
        logger.debug('Tokenizing...')
        tokenized_code = self._tokenize()
        logger.debug('Tokenized.')
        logger.debug('Collecting labels...')
        self._collect_labels(tokenized_code)
        logger.debug('Collected.')
        logger.debug('Generating opcode...')
        self._generate_opcode(tokenized_code)  # first pass: label addresses not yet good
        self._generate_opcode(tokenized_code)  # second pass: label addresses good
        logger.debug('Generated.')
        self._log_code()
        return self.opcode

    def _log_code(self):
        logger.debug('===CODE===')
        max_line_opcode_length = max(
            len(line_opcode)
            for line_number, opcode_pos, line_opcode, source_line, tokens
            in self.augmented_opcode
        )
        if max_line_opcode_length > MAX_LINE_OPCODE_LENGTH:
            max_line_opcode_length = MAX_LINE_OPCODE_LENGTH
        for line_number, opcode_pos, line_opcode, source_line, tokens in self.augmented_opcode:
            line_label_names = [
                token.value for token in tokens
                if token.type == TokenType.LABEL
            ]
            if not line_label_names and not line_opcode:
                continue
            logger.debug(
                ' '.join([
                    '{:4}'.format(line_number),
                    utils.word_to_str(opcode_pos),
                    ' '.join([utils.byte_to_str(op) for op in line_opcode]),
                    ' ' if line_opcode else '',
                    ' ' * (max_line_opcode_length - len(line_opcode)),
                    source_line,
                ])
            )

    def _reset_state(self):
        self.source_code = ''
        self.labels = {}
        self.opcode = []
        self.augmented_opcode = []

    def _tokenize(self):
        tokenized_code = []
        for idx, source_line in enumerate(self.source_code.split('\n')):
            line_number = idx + 1
            try:
                meaningful_tokens = [
                    token
                    for token in self.tokenizer.tokenize(source_line)
                    if token.type != TokenType.COMMENT
                ]
            except AldebaranError as ex:
                msg, pos = ex.args
                _raise_error(source_line, line_number, pos, str(msg), ex.__class__)
            tokenized_code.append((
                line_number,
                source_line,
                meaningful_tokens,
            ))
        return tokenized_code

    def _collect_labels(self, tokenized_code):
        for line_number, source_line, tokens in tokenized_code:
            for token in tokens:
                if token.type == TokenType.LABEL:
                    label_name = token.value
                    if label_name in self.labels:
                        _raise_error(source_line, line_number, token.pos,
                                     'Label already defined', LabelError)
                    if label_name in self.keywords:
                        _raise_error(source_line, line_number, token.pos,
                                     'Label name cannot be keyword', LabelError)
                    self.labels[label_name] = 0

    def _generate_opcode(self, tokenized_code):
        self.opcode = []
        self.augmented_opcode = []
        opcode_pos = 0
        for line_number, source_line, tokens in tokenized_code:
            line_opcode = self._parse_line(line_number, source_line, tokens, opcode_pos)
            self.opcode += line_opcode
            self.augmented_opcode.append((
                line_number,
                opcode_pos,
                line_opcode,
                source_line,
                tokens,
            ))
            opcode_pos += len(line_opcode)

    def _parse_line(self, line_number, source_line, tokens, opcode_pos):
        state = ParserState.LABEL
        inst_name = None
        macro_name = None
        args = []
        for token in tokens:
            if state == ParserState.LABEL:
                if token.type == TokenType.LABEL:
                    self.labels[token.value] = opcode_pos
                elif token.type == TokenType.INSTRUCTION:
                    state = ParserState.INSTRUCTION
                    inst_name = token.value
                elif token.type == TokenType.MACRO:
                    state = ParserState.MACRO
                    macro_name = token.value
                else:
                    _raise_error(source_line, line_number, token.pos,
                                 'Unexpected token: {}'.format(token.value), ParserError)
            elif state == ParserState.INSTRUCTION:
                if token.type in ARGUMENT_TYPES:
                    args.append(token)
                else:
                    _raise_error(source_line, line_number, token.pos,
                                 'Unexpected token: {}'.format(token.value), ParserError)
            elif state == ParserState.MACRO:
                if token.type in ARGUMENT_TYPES:
                    args.append(token)
                else:
                    _raise_error(source_line, line_number, token.pos,
                                 'Unexpected token: {}'.format(token.value), ParserError)
            elif state == ParserState.ARGUMENTS:
                if token.type in ARGUMENT_TYPES:
                    args.append(token)
                else:
                    _raise_error(source_line, line_number, token.pos,
                                 'Unexpected token: {}'.format(token.value), ParserError)
            else:
                _raise_error(source_line, line_number, token.pos,
                             'Unknown parser state: {}'.format(state), ParserError)
        if inst_name is not None:
            line_opcode = self._parse_instruction(inst_name, args, source_line,
                                                  line_number, opcode_pos)
        elif macro_name is not None:
            line_opcode = self._parse_macro(macro_name, args, source_line,
                                            line_number, opcode_pos)
        else:
            line_opcode = []
        return line_opcode

    def _parse_instruction(self, inst_name, args, source_line, line_number, opcode_pos):
        inst_opcode, inst = self.instruction_mapping[inst_name]
        operands = self._parse_operands(args, source_line, line_number, opcode_pos)
        if len(operands) < inst.operand_count:
            _raise_error(source_line, line_number, None,
                         'Not enough operands: {} instead of {}'.format(
                             len(operands), inst.operand_count), OperandError)
        if len(operands) > inst.operand_count:
            _raise_error(source_line, line_number, None,
                         'Too many operands: {} instead of {}'.format(
                             len(operands), inst.operand_count), OperandError)
        # TODO: check inst.oplens
        # if None: no check
        # otherwise list of strings of B|W|*
        opcode = [inst_opcode]
        for operand_opcode in operands:
            opcode += operand_opcode
        return opcode

    def _parse_macro(self, macro_name, args, source_line, line_number, opcode_pos):
        if macro_name == 'DAT':
            opcode = []
            for arg in args:
                if arg.type == TokenType.STRING_LITERAL:
                    opcode += list(arg.value.encode('utf-8'))
                elif arg.type == TokenType.BYTE_LITERAL:
                    opcode.append(arg.value)
                elif arg.type == TokenType.WORD_LITERAL:
                    opcode += utils.word_to_binary(arg.value)
                else:
                    _raise_error(source_line, line_number, arg.pos,
                                 'Parameter of macro DAT must be a byte, word or string literal, not {}'.format(arg.type),
                                 MacroError)
            return opcode
        if macro_name == 'DATN':
            if len(args) != 2:
                _raise_error(source_line, line_number, None,
                             'Macro DATN requires exactly 2 parameters, not {}'.format(len(args)),
                             MacroError)
            repeat_arg, value_arg = args
            if repeat_arg.type not in {TokenType.BYTE_LITERAL, TokenType.WORD_LITERAL}:
                _raise_error(source_line, line_number, repeat_arg.pos,
                             'The first parameter of macro DATN must be a byte or word literal, not {}'.format(repeat_arg.type),
                             MacroError)
            repeat_number = repeat_arg.value
            if value_arg.type not in {TokenType.BYTE_LITERAL, TokenType.WORD_LITERAL, TokenType.STRING_LITERAL}:
                _raise_error(source_line, line_number, value_arg.pos,
                             'The second parameter of macro DATN must be a byte, word or string literal, not {}'.format(value_arg.type),
                             MacroError)
            opcode = []
            for _ in range(repeat_number):
                if value_arg.type == TokenType.STRING_LITERAL:
                    opcode += list(value_arg.value.encode('utf-8'))
                elif value_arg.type == TokenType.BYTE_LITERAL:
                    opcode.append(value_arg.value)
                else:
                    opcode += utils.word_to_binary(value_arg.value)
            return opcode
        # TODO: add more macros
        _raise_error(source_line, line_number, None,
                     'Unknown macro: {}'.format(macro_name), MacroError)

    def _parse_operands(self, args, source_line, line_number, opcode_pos):
        operands = []
        for arg in args:
            if arg.type == TokenType.STRING_LITERAL:
                _raise_error(source_line, line_number, arg.pos,
                             'String literal cannot be instruction operand: {}'.format(arg.value),
                             OperandError)
            if arg.type in LABEL_REFERENCE_TYPES:
                arg = self._substitute_label(arg, source_line, line_number, opcode_pos)
            try:
                operands.append(get_operand_opcode(arg))
            except AldebaranError as ex:
                orig_msg = '{}({})'.format(
                    ex.__class__.__name__,
                    str(ex),
                )
                arg_name = '{}({})'.format(
                    arg.type.name,
                    arg.value,
                )
                _raise_error(
                    source_line, line_number, arg.pos,
                    'Could not parse operand {} due to {}'.format(arg_name, orig_msg),
                    OperandError,
                )
        return operands

    def _substitute_label(self, arg, source_line, line_number, opcode_pos):
        assert arg.type in LABEL_REFERENCE_TYPES
        if arg.type == TokenType.ADDRESS_LABEL or arg.type == TokenType.IDENTIFIER:
            label_name = arg.value
        else:
            label_name = arg.value.base
        try:
            label_address = self.labels[label_name]
        except KeyError:
            _raise_error(source_line, line_number, arg.pos,
                         'Unknown label reference: {}'.format(arg.value), LabelError)
        relative_address = label_address - opcode_pos
        new_type = {
            TokenType.ADDRESS_LABEL: TokenType.ADDRESS_WORD_LITERAL,
            TokenType.IDENTIFIER: TokenType.ADDRESS_WORD_LITERAL,
            TokenType.REL_REF_LABEL_REG: TokenType.REL_REF_WORD_REG,
            TokenType.REL_REF_LABEL_BYTE: TokenType.REL_REF_WORD_BYTE,
            TokenType.REL_REF_LABEL: TokenType.REL_REF_WORD,
        }[arg.type]
        if arg.type == TokenType.ADDRESS_LABEL or arg.type == TokenType.IDENTIFIER:
            new_value = relative_address
        else:
            new_value = Reference(relative_address, arg.value.offset, arg.value.length)
        return Token(
            new_type,
            new_value,
            arg.pos,
        )
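# A hypothetical usage sketch for the Assembler above. instruction_set is a
# list of (opcode, instruction_class) pairs and registers maps 'word'/'byte'
# to register-name lists; Mov, Jmp and the register names below are
# placeholders, not the project's real instruction table.
asm = Assembler(instruction_set=[(0x00, Mov), (0x01, Jmp)],
                registers={'word': ['AX', 'BX'], 'byte': ['AL', 'AH']})
opcode = asm.assemble_code('start: MOV AX 0x0100\nJMP start')
print(len(opcode), 'bytes')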