class Round():
    """State of a single round: a random category/alphabet prompt and its responses.

    The prompt is stored as two indices (``cat_index`` and ``alpha_index``)
    into the loaded ``categories.data`` and ``alphabet.data`` sequences.
    ``responses`` is a list of dicts that are read through their ``'tn'`` and
    ``'response'`` keys.
    """

    def __init__(self):
        """Load the category and alphabet tables, then draw the first prompt."""
        self.categories = Categories()
        self.categories.load()
        self.alphabet = Alphabet()
        self.alphabet.load()
        self.responses = []
        self.nextRound()

    def allResponses(self):
        """Return the ``'response'`` value of every stored record, in order."""
        collected = []
        for entry in self.responses:
            collected.append(entry['response'])
        return collected

    def getResponse(self, ptn):
        """Return the first stored record whose ``'tn'`` equals ``ptn``.

        When no record matches, or anything goes wrong while scanning the
        records, a placeholder record marked invalid with the response
        ``'UNK'`` is returned instead of raising.
        """
        log('getResponse for ' + ptn)
        try:
            matches = [entry for entry in self.responses if entry['tn'] == ptn]
            found = matches[0]
        except Exception:
            # Deliberate best-effort: any lookup failure yields the placeholder.
            found = {'tn': ptn, 'valid': False, 'response': 'UNK'}
        return found

    def nextRound(self):
        """Pick a new random category and alphabet entry and clear the responses."""
        last_category = len(self.categories.data) - 1
        self.cat_index = randint(0, last_category)
        log(self.cat_index)

        last_alpha = len(self.alphabet.data) - 1
        self.alpha_index = randint(0, last_alpha)
        log(self.alpha_index)

        self.responses = []

    def describe(self):
        """Return the prompt text: ``"<category> that <position, lowercased> <letter>"``."""
        entry = self.alphabet.data[self.alpha_index]
        phrase = self.categories.data[self.cat_index]['category'] + " that "
        phrase += entry['position'].lower() + " "
        phrase += entry['letter']
        return phrase
def load_config_pos(config_path, char_embedd_dim):
    """Load the saved configuration and alphabets, and create a character embedding table.

    Args:
        config_path: location passed to ``load_config`` and to both
            ``Alphabet.load`` calls.
        char_embedd_dim: width of each character embedding vector.

    Returns:
        A 7-tuple ``(max_sent_length, max_char_length, num_labels,
        embedd_dim_concat, alphabet_char, alphabet_label, char_embedd_table)``.
        The first four values come from ``load_config``. The table has one row
        per entry of ``alphabet_char``, is drawn uniformly from
        ``[-sqrt(3 / char_embedd_dim), sqrt(3 / char_embedd_dim))`` and is cast
        to ``theano.config.floatX``.
    """
    max_sent_length, max_char_length, num_labels, embedd_dim_concat = load_config(config_path)

    alphabet_char = Alphabet('char', keep_growing=False)
    alphabet_char.load(config_path, 'alphabet_char')

    alphabet_label = Alphabet('label', keep_growing=False)
    alphabet_label.load(config_path, 'alphabet_label')

    # The table is randomly initialised on every call; it is not read from disk.
    bound = np.sqrt(3.0 / char_embedd_dim)
    table_shape = [alphabet_char.size(), char_embedd_dim]
    random_table = np.random.uniform(-bound, bound, table_shape)
    char_embedd_table = random_table.astype(theano.config.floatX)

    return (
        max_sent_length,
        max_char_length,
        num_labels,
        embedd_dim_concat,
        alphabet_char,
        alphabet_label,
        char_embedd_table,
    )
def create_alphabets(alphabet_directory, data_paths, max_vocabulary_size, normalize_digits=True):
    """Load the word/POS/type alphabets from disk, or build and save them.

    If ``alphabet_directory`` already exists, all three alphabets are loaded
    from it. Otherwise every file in ``data_paths`` is scanned: each non-empty
    line is split on whitespace, column 1 is counted as a word, and columns 4
    and 7 are added to the POS and type alphabets. The word alphabet is then
    filled with ``_START_VOCAB`` followed by the words in descending order of
    frequency, truncated to ``max_vocabulary_size`` entries, and all three
    alphabets are saved to ``alphabet_directory``.

    Args:
        alphabet_directory: where the alphabets are loaded from or saved to.
        data_paths: iterable of data file paths to scan when building.
        max_vocabulary_size: upper bound on the number of word entries,
            including those contributed by ``_START_VOCAB``.
        normalize_digits: when true, words are passed through
            ``DIGIT_RE.sub(b"0", ...)`` before being counted.

    Returns:
        ``(word_alphabet, pos_alphabet, type_alphabet)``, each already closed
        via ``close()``.
    """
    logger = utils.get_logger("Create Alphabets")
    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    if gfile.Exists(alphabet_directory):
        word_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)
    else:
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        # Root and padding symbols are registered before any symbol from the data.
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        word_counts = {}
        for data_path in data_paths:
            logger.info("Processing data: %s" % data_path)
            with gfile.GFile(data_path, mode="r") as data_file:
                for raw_line in data_file:
                    # NOTE(review): .decode implies the file yields byte strings
                    # (Python 2 style) - confirm before running on Python 3.
                    text = raw_line.decode('utf-8').strip()
                    if not text:
                        continue

                    columns = text.split()
                    if normalize_digits:
                        token = DIGIT_RE.sub(b"0", columns[1])
                    else:
                        token = columns[1]
                    pos_tag = columns[4]
                    dep_type = columns[7]

                    pos_alphabet.add(pos_tag)
                    type_alphabet.add(dep_type)
                    word_counts[token] = word_counts.get(token, 0) + 1

        ranked_words = sorted(word_counts, key=word_counts.get, reverse=True)
        vocab_list = _START_VOCAB + ranked_words
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
        logger.info("Type Alphabet Size: %d" % type_alphabet.size())

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
        for entry in vocab_list:
            word_alphabet.add(entry)

        word_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)

    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    return word_alphabet, pos_alphabet, type_alphabet
import sys
from alphabet import Alphabet
from regex import Regex

# Command-line filter: print each stdin line (stripped of surrounding
# whitespace) that the given regex matches over the given alphabet.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: python app-regex-simple.py <alphabet> <regex>')
        sys.exit(1)

    alphabet_arg, pattern_arg = sys.argv[1], sys.argv[2]
    alpha = Alphabet.load(alphabet_arg)
    rx = Regex(pattern_arg, alpha)
    rx.build()

    for raw_line in sys.stdin:
        candidate = raw_line.strip()
        if rx.match(candidate):
            print(candidate)