def __init__(self, opt):
    self.train_data = None
    self.dev_data = None
    self.test_data = None
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    self.label_alphabet = Alphabet('label', True)
    self.train_texts = None
    self.train_Ids = None
    self.dev_texts = None
    self.dev_Ids = None
    self.test_texts = None
    self.test_Ids = None
    self.pretrain_word_embedding = None
    self.word_emb_dim = opt.word_emb_dim
    self.config = self.read_config(opt.config)
    self.feat_config = None
    the_item = 'ner_feature'
    if the_item in self.config:
        self.feat_config = self.config[the_item]  ## [POS]:{emb_size:20}
        self.feature_alphabets = []
        self.feature_emb_dims = []
        for k, v in self.feat_config.items():
            self.feature_alphabets.append(Alphabet(k))
            self.feature_emb_dims.append(int(v['emb_size']))
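
# Every snippet in this section leans on an `Alphabet` string-to-id mapping.
# Below is a minimal sketch of that interface, assuming (not confirmed by any
# one source here) support for a name, a label flag, growth locking via
# close(), and UNK fallback once closed:
class Alphabet:
    UNKNOWN = '</unk>'

    def __init__(self, name='alphabet', label=False, keep_growing=True):
        self.name = name
        self.label = label
        self.keep_growing = keep_growing
        self.instance2index = {}
        self.instances = []
        if not label:
            # Reserve an UNK slot for non-label alphabets.
            self.add(Alphabet.UNKNOWN)

    def add(self, instance):
        if instance not in self.instance2index:
            self.instances.append(instance)
            self.instance2index[instance] = len(self.instances) - 1

    def get_index(self, instance):
        if instance in self.instance2index:
            return self.instance2index[instance]
        if self.keep_growing:
            self.add(instance)
            return self.instance2index[instance]
        # Closed alphabet: unseen strings fall back to UNK.
        return self.instance2index[Alphabet.UNKNOWN]

    def get_instance(self, index):
        return self.instances[index]

    def size(self):
        return len(self.instances)

    def __len__(self):
        return self.size()

    def close(self):
        self.keep_growing = False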
def __init__(self):
    super(VsmNormer, self).__init__()
    self.word_alphabet = Alphabet('word')
    self.embedding_dim = None
    self.word_embedding = None
    self.dict_alphabet = Alphabet('dict')
    self.dict_embedding = None
    self.gpu = opt.gpu  # `opt` is a module-level options object, not a parameter
def load_config_pos(config_path, char_embedd_dim):
    max_sent_length, max_char_length, num_labels, embedd_dim_concat = load_config(config_path)
    alphabet_char = Alphabet('char', keep_growing=False)
    alphabet_char.load(config_path, 'alphabet_char')
    alphabet_label = Alphabet('label', keep_growing=False)
    alphabet_label.load(config_path, 'alphabet_label')
    # Uniform init in [-sqrt(3/d), sqrt(3/d)] gives each embedding
    # dimension variance 1/d.
    scale = np.sqrt(3.0 / char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [alphabet_char.size(), char_embedd_dim]).astype(theano.config.floatX)
    return (max_sent_length, max_char_length, num_labels, embedd_dim_concat,
            alphabet_char, alphabet_label, char_embedd_table)
def __init__(self):
    self.MAX_SENTENCE_LENGTH = 250
    self.MAX_WORD_LENGTH = -1
    self.number_normalized = True
    self.norm_word_emb = False
    self.norm_char_emb = False
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    # self.word_alphabet.add(START)
    # self.word_alphabet.add(UNKNOWN)
    # self.char_alphabet.add(START)
    # self.char_alphabet.add(UNKNOWN)
    # self.char_alphabet.add(PADDING)
    self.label_alphabet = Alphabet('label', True)
    self.tagScheme = "NoSeg"
    self.char_features = "LSTM"  ## "LSTM"/"CNN"
    self.train_texts = []
    self.dev_texts = []
    self.test_texts = []
    self.raw_texts = []
    self.train_Ids = []
    self.dev_Ids = []
    self.test_Ids = []
    self.raw_Ids = []
    self.word_emb_dim = 50
    self.char_emb_dim = 30
    self.pretrain_word_embedding = None
    self.pretrain_char_embedding = None
    self.label_size = 0
    self.word_alphabet_size = 0
    self.char_alphabet_size = 0
    self.label_alphabet_size = 0
    ### hyperparameters
    self.HP_iteration = 100
    self.HP_batch_size = 10
    self.HP_average_batch_loss = False
    self.HP_char_hidden_dim = 50
    self.HP_hidden_dim = 50
    self.HP_dropout = 0.5
    self.HP_lstm_layer = 1
    self.HP_bilstm = True
    self.HP_use_char = False
    self.HP_gpu = False
    self.HP_lr = 0.015
    self.HP_lr_decay = 0.05
    self.HP_clip = None
    self.HP_momentum = 0
def __init__(self):
    self.categories = Categories()
    self.categories.load()
    self.alphabet = Alphabet()
    self.alphabet.load()
    self.responses = []
    self.nextRound()
def __init__(self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True,
             alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@"):
    Source.__init__(self, data, encoding=encoding)
    self._Instance = BinaryClassificationInstance
    if feature_alphabet is not None:
        self._feature_alphabet = feature_alphabet
    else:
        self._feature_alphabet = Alphabet(locked=False)
    self._sep = sep
    self._bias = bias
    self._bias_prefix = bias_prefix
    if alphabet_pop:
        self._populate_alphabet()
    if alphabet_lock:
        self.lock_alphabet()
    else:
        self.unlock_alphabet()
def initial_feature_alphabets(self):
    # Find the first data line (skip comments and -BOS- markers) to count columns.
    for l in open(self.train_dir, 'r').readlines():
        if not l.startswith("#") and not l.startswith("-BOS-"):
            items = l.strip("\n").split()
            break
    total_column = len(items)
    if total_column > 2:
        for idx in range(1, total_column - 1):
            feature_prefix = items[idx].split(']', 1)[0] + "]"
            self.feature_alphabets.append(Alphabet(feature_prefix))
            self.feature_name.append(feature_prefix)
            print("Find feature:", feature_prefix)
    self.feature_num = len(self.feature_alphabets)
    self.pretrain_feature_embeddings = [None] * self.feature_num
    self.feature_emb_dims = [self.HP_feature_default_size] * self.feature_num
    # self.feature_emb_dims = [20] * self.feature_num
    self.feature_emb_dirs = [None] * self.feature_num
    self.norm_feature_embs = [False] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            name = self.feature_name[idx]
            if name in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[name]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[name]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[name]['emb_norm']
def __init__(self):
    self.states = State()
    self.sigma = Alphabet()
    self.delta = list()
    self.delta_nfa = list()
    self.initial_state = None
    self.final_state = list()
def forward(self, Y, h, c, outEncoder, teacher_force):
    # Y has shape (num_symbols, 256).
    if np.random.rand() > teacher_force:
        # Feed the ground-truth symbols through the decoder.
        seq_len = Y.shape[0] - 1
        output_decoder = load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
        Y = self.embedding(Y)
        for i in range(len(Y) - 1):  # -1 because <sos> is not counted in the criterion
            h[0], c[0] = self.lstm1(Y[i], (h[0].clone(), c[0].clone()))
            h[1], c[1] = self.lstm2(h[0].clone(), (h[1].clone(), c[1].clone()))
            h[2], c[2] = self.lstm3(h[1].clone(), (h[2].clone(), c[2].clone()))
            h2 = h[2].clone()
            context = self.attention(h2, outEncoder, BATCH_SIZE)
            context = torch.bmm(context, outEncoder.view(outEncoder.shape[1], outEncoder.shape[0], -1))
            output_decoder[i] = self.MLP(torch.cat((h2, torch.squeeze(context, 1)), 1))
    else:
        # Free-running mode: feed back the decoder's own argmax predictions.
        seq_len = Y.shape[0] - 1
        output_decoder = load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
        alphabet = Alphabet()
        Y_cur = self.embedding(
            load_to_cuda(Variable(torch.LongTensor([alphabet.ch2index('<sos>')])))
        ).view(1, self.hidden_size)
        for i in range(seq_len - 1):
            Y_cur = Y_cur.expand(BATCH_SIZE, self.hidden_size)
            h[0], c[0] = self.lstm1(Y_cur, (h[0].clone(), c[0].clone()))
            h[1], c[1] = self.lstm2(h[0].clone(), (h[1].clone(), c[1].clone()))
            h[2], c[2] = self.lstm3(h[1].clone(), (h[2].clone(), c[2].clone()))
            h2 = h[2].clone()
            context = self.attention(h2, outEncoder, BATCH_SIZE)
            context = torch.bmm(context, outEncoder.view(outEncoder.shape[1], outEncoder.shape[0], -1))
            output_decoder[i] = self.MLP(torch.cat((h2, torch.squeeze(context, 1)), 1))
            argmax = torch.max(output_decoder[i][0], dim=0)
            Y_cur = self.embedding(
                Variable(load_to_cuda(torch.LongTensor([argmax[1][0].data[0]])))
            ).view(1, self.hidden_size)
    return output_decoder
def initial_feature_alphabets(self):
    items = open(self.train_dir, 'r').readline().strip('\n').split()
    print(items)
    total_column = len(items)
    if total_column > 2:
        for idx in range(1, total_column - 1):
            feature_prefix = items[idx].split(']', 1)[0] + "]"
            print("feature_prefix: {}".format(feature_prefix))
            self.feature_alphabets.append(Alphabet(feature_prefix))
            self.feature_name.append(feature_prefix)
            print("Find feature:", feature_prefix)
    self.feature_num = len(self.feature_alphabets)
    self.pretrain_feature_embeddings = [None] * self.feature_num
    self.feature_emb_dims = [20] * self.feature_num
    self.feature_emb_dirs = [None] * self.feature_num
    self.norm_feature_embs = [False] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            name = self.feature_name[idx]
            if name in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[name]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[name]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[name]['emb_norm']
def __init__(self):
    self.name2id = {}    # preferred name -> id
    self.id2name = {}    # id -> CTD_Term
    self.altid2id = {}   # alternative id -> id
    if opt.method == 'cla':
        self.id_alphabet = Alphabet('id')
def test_given_alphabet_as_int_returns_error(self):
    # A bare try/except here would also swallow the test's own failure
    # assertion; assertRaises checks that the error is actually raised.
    test_data = 123456
    with self.assertRaises(Exception):
        Alphabet('Test', test_data)
def main():
    UPPER_STRING = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    testAlphabet = Alphabet(UPPER_STRING)
    permutation1 = Permutation("(AELTPHQXRU) (BKNW) (CMOY) (DFG) (IV) (JZ) (S)", testAlphabet)
    permutation2 = Permutation("(FIXVYOMW) (CDKLHUP) (ESZ) (BJ) (GR) (NT) (A) (Q)", testAlphabet)
    permutation3 = Permutation("(ABDHPEJT) (CFLVMZOYQIRWUKXSG) (N)", testAlphabet)
    permutation4 = Permutation("(AEPLIYWCOXMRFZBSTGJQNH) (DV) (KU)", testAlphabet)
    permutation5 = Permutation("(AE) (BN) (CK) (DQ) (FU) (GY) (HW) (IJ) (LO) (MP) (RX) (SZ) (TV)", testAlphabet)
    rotor1 = Rotor("I", permutation1, "TG")
    rotor2 = Rotor("II", permutation2, "A")
    rotor3 = Rotor("III", permutation3, "B")
    rotor4 = Rotor("IV", permutation4, "XO")
    reflector = Reflector("A", permutation5)
    rotors = [reflector, rotor4, rotor3, rotor2, rotor1]
    machine = Machine(testAlphabet, 5, 6, rotors)
    machine.insertRotors(["A", "IV", "III", "II", "I"])
    machine.setRotors("AAAA")
    message = input("What to convert:")
    print(machine.convertMsg(message))
def __init__(self, args):
    # Alphabet
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    self.label_alphabet = Alphabet('label', True)
    # data
    self.train_texts = []
    self.dev_texts = []
    self.test_texts = []
    self.train_Ids = []
    self.dev_Ids = []
    self.test_Ids = []
    self.input_size = 0
    self.pretrain_word_embedding = None
    self.pretrain_char_embedding = None
    self.word_alphabet_size = 0
    self.char_alphabet_size = 0
    self.label_alphabet_size = 0
    # hyper parameters
    self.HP_word_emb_dim = args.embedding_size
    self.HP_char_emb_dim = args.char_embedding_size
    self.HP_iteration = args.max_epoch
    self.HP_batch_size = args.batch_size
    self.HP_char_hidden_dim = args.char_hidden_dim
    self.HP_hidden_dim = args.hidden_size
    self.HP_dropout = args.dropout
    self.HP_char_dropout = args.char_dropout
    self.HP_use_char = bool(args.char_encoder)
    self.HP_char_features = args.char_encoder
    self.HP_gpu = torch.cuda.is_available() and args.gpu
    self.HP_lr = args.lr
    self.HP_model_name = args.model_name
    self.HP_encoder_type = args.encoder
    self.HP_optim = args.optim
    self.HP_number_normalized = args.number_normalized
    self.HP_seed = args.seed
    self.HP_l2 = args.l2
    self.HP_kernel_size = args.kernel_size
    self.HP_kernel_num = args.kernel_num
def test_cross_off_adds_guessed_letter_to_list_of_guessed_letters(self):
    # arrange
    alphabet = Alphabet()
    letter = "a"
    # act
    alphabet.cross_off(letter)
    # assert
    assert letter in alphabet.guessed_letters
def test_already_guessed_returns_true_if_letter_guessed(self):
    # arrange
    alphabet = Alphabet()
    letter = "h"
    alphabet.cross_off(letter)
    # act
    result = alphabet.already_guessed("h")
    # assert
    assert result is True
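
# A minimal hangman-style Alphabet the two tests above would exercise; the
# names guessed_letters, cross_off and already_guessed come from the tests,
# the rest is assumption:
class Alphabet:
    def __init__(self):
        self.letters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
        self.guessed_letters = []

    def cross_off(self, letter):
        # Record the guess once; repeated guesses are ignored.
        if letter not in self.guessed_letters:
            self.guessed_letters.append(letter)

    def already_guessed(self, letter):
        return letter in self.guessed_letters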
def decode_all(manifests):
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept strings encoded in utf-8
    alphabet = Alphabet(args.vocab_path)
    ds2_model.logger.info("start decoding with extended output...")
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              args.trie_path, alphabet)
    for audioname, manifest_path, duration, offset in manifests:
        # Skip clips shorter than one second.
        try:
            duration_f = float(duration)
            if duration_f < 1.:
                yield (audioname, manifest_path, None, duration, offset)
                continue
        except (TypeError, ValueError):
            pass
        batch_reader = data_generator.batch_reader_creator(
            manifest_path=manifest_path,
            batch_size=args.num_samples,
            min_batch_size=1,
            sortagrad=False,
            shuffle_method=None)
        for decode_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=decode_data,
                feeding_dict=data_generator.feeding)
            # note: we only perform single file decoding
            result_transcript = ds2_model.decode_beam_search(
                probs_split=probs_split,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                alphabet=alphabet)
            yield (audioname, manifest_path, result_transcript, duration, offset)
def __init__(self, input_file):
    self.original_data = open(input_file, 'r').readlines()
    self.index_data = []
    self.word_alphabet = Alphabet('word')
    self.gloss_alphabet = Alphabet('gloss')
    self.entity_alphabet = Alphabet('entity')
    self.gaz_alphabet = Alphabet('gaz')
    self.label_alphabet = Alphabet('label')
    self.word_alphabet_size = 0
    self.gloss_alphabet_size = 0
    self.entity_alphabet_size = 0
    self.gaz_alphabet_size = 0
    self.label_alphabet_size = 0
    ### hyperparameters
    self.HP_iteration = 100
    self.HP_batch_size = 1
    self.HP_gaz_hidden_dim = 50
    self.HP_lstm_hidden_dim = 200
    self.HP_dropout = 0.5
    self.gaz_dropout = 0.5
    self.HP_lstm_layer = 1
    self.HP_bilstm = False
    self.HP_use_entity = False
    self.HP_use_gloss = True
    self.HP_use_gaz = False
    self.HP_gpu = True
    self.HP_lr = 0.015
    self.HP_lr_decay = 0.05
    self.HP_clip = 5.0
    self.HP_momentum = 0
    # embedding hyperparameters
    self.word_emb_dim = 200
    self.entity_emb_dim = 50
    self.gloss_features = "CNN"  # ["CNN","LSTM"]
    self.gloss_emb_dim = 200
    self.gloss_hidden_dim = 300
    self.pretrain_word_embedding = np.array([])
    self.pretrain_gaz_embedding = None
    self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  # "NYM_200.txt"
    self.gaz_embed_path = None
    self.gaz_emb_dim = 200
    self.HP_fix_gaz_emb = True
def get_word(seq):
    # seq is a sequence of character indices; map each back to its character.
    alphabet = Alphabet()
    s = ""
    if len(seq) == 0:
        return s
    for el in seq:
        s += alphabet.index2ch(el)
    return s
def test_str_representation_does_not_show_hidden_letters(self):
    # arrange
    alphabet = Alphabet()
    word = Word(alphabet)
    word.word_to_guess = "aardvark"
    word.guess_letter("a")
    # act
    hidden_word = str(word)
    # assert
    assert hidden_word == "aa___a__"
def make_alphabet():
    alphabet = Alphabet(0)
    load_dataset("%s/%s.train.txt" % (data, dataset), alphabet)
    load_dataset("%s/%s.valid.txt" % (data, dataset), alphabet)
    if dataset == 'ptb':
        # add all the words in all three datasets
        load_dataset("%s/%s.test.txt" % (data, dataset), alphabet)
    print("%s: total %d words" % (dataset, len(alphabet)))
    pickle.dump(alphabet, open("%s/alphabet.pkl" % data, "wb"))
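
# Hypothetical round-trip for the alphabet pickled by make_alphabet(); the
# get_index call follows the sketch interface above and is an assumption
# about this repo's Alphabet API:
import pickle

with open("%s/alphabet.pkl" % data, "rb") as f:
    alphabet = pickle.load(f)
print("vocab size: %d" % len(alphabet))
print("id of 'the': %d" % alphabet.get_index('the'))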
def __init__(self, args):
    self.config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    if args.dataset not in self.config['data_list']:
        raise KeyError("No such dataset named {}.".format(args.dataset))
    self.config['dataset'] = args.dataset
    self.datatype = 'binary'
    if self.config['dataset'] in self.config['datatype']['train_test']:
        self.datatype = 'train_test'
    self.alphabet = Alphabet('word')
    self.set_seed()
def __init__(self):
    self.MAX_SENTENCE_LENGTH = 512
    self.MAX_WORD_LENGTH = -1
    self.number_normalized = False
    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    self.word_alphabet.add(START)
    self.word_alphabet.add(UNKNOWN)
    self.char_alphabet.add(START)
    self.char_alphabet.add(UNKNOWN)
    self.char_alphabet.add(PADDING)
    self.label_alphabet = Alphabet('label')
    self.tagScheme = "NoSeg"
    self.train_texts = []
    self.dev_texts = []
    self.test_texts = []
    self.raw_texts = []
    self.train_Ids = []
    self.dev_Ids = []
    self.test_Ids = []
    self.raw_Ids = []
    self.word_emb_dim = 50
    self.pretrain_word_embedding = None
    self.label_size = 0
    self.word_alphabet_size = 0
    self.char_alphabet_size = 0
    self.label_alphabet_size = 0
    ### hyperparameters
    self.HP_batch_size = 10
    self.HP_hidden_dim = 200
    self.HP_dropout = 0.5
    self.HP_lstm_layer = 1
    self.HP_bilstm = True
    self.HP_use_char = True
    self.HP_gpu = False
    self.HP_lr = 0.015
    self.HP_lr_decay = 0
    self.HP_clip = 5.0
    self.HP_momentum = 0
def str_to_id():
    global alphabet_pos, alphabet_chunk, alphabet_tag
    # Each alphabet is closed right after indexing the training split, so
    # strings first seen in dev/test do not grow it.
    alphabet_pos = Alphabet('pos')
    train_pos_id = map_string_2_id(train_pos, alphabet_pos)
    alphabet_pos.close()
    dev_pos_id = map_string_2_id(dev_pos, alphabet_pos)
    test_pos_id = map_string_2_id(test_pos, alphabet_pos)

    alphabet_chunk = Alphabet('chunk')
    train_chunk_id = map_string_2_id(train_chunk, alphabet_chunk)
    alphabet_chunk.close()
    dev_chunk_id = map_string_2_id(dev_chunk, alphabet_chunk)
    test_chunk_id = map_string_2_id(test_chunk, alphabet_chunk)

    alphabet_tag = Alphabet('tag')
    train_tag_id = map_string_2_id(train_tag, alphabet_tag)
    alphabet_tag.close()
    dev_tag_id = map_string_2_id(dev_tag, alphabet_tag)
    test_tag_id = map_string_2_id(test_tag, alphabet_tag)
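
# Why close() is called right after indexing the training split: a closed
# alphabet maps unseen dev/test strings to a fixed UNK id instead of minting
# new ids. A small demonstration against the sketch Alphabet above (assumed
# behavior, not confirmed by this repo):
pos = Alphabet('pos')
train_id = pos.get_index('NN')   # grows the alphabet
pos.close()
dev_id = pos.get_index('XYZ')    # unseen string: falls back to the UNK id
assert dev_id != train_id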
def init_alphabet(self):
    # Read the entire train/val/test data to determine the set of unique
    # characters the alphabet should contain.
    unique_chars = set()
    for split in ['train', 'validation', 'test']:
        for entry in self.data_desc[split]:
            for char in entry['trans'].split():
                unique_chars.add(char)
    # Add the CTC blank as the first letter in the alphabet, and sort the
    # rest lexicographically for convenience.
    self.alphabet = Alphabet(['<ctc-blank>', *sorted(unique_chars)])
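
# '<ctc-blank>' is placed at index 0 so a CTC decoder can treat id 0 as the
# blank. A minimal greedy CTC collapse (merge repeats, then drop blanks),
# assuming per-frame argmax ids as input:
def ctc_greedy_collapse(frame_ids, blank_id=0):
    out, prev = [], None
    for i in frame_ids:
        if i != prev and i != blank_id:
            out.append(i)
        prev = i
    return out

# e.g. ctc_greedy_collapse([0, 5, 5, 0, 3, 3, 3, 0]) -> [5, 3]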
def initial_feature_alphabets(self):
    feature_prefix = '[Cap]'
    self.feature_alphabets.append(Alphabet(feature_prefix))
    self.feature_name.append(feature_prefix)
    self.feature_name2id[feature_prefix] = 0

    feature_prefix = '[POS]'
    self.feature_alphabets.append(Alphabet(feature_prefix))
    self.feature_name.append(feature_prefix)
    self.feature_name2id[feature_prefix] = 1

    self.feature_num = len(self.feature_alphabets)
    self.feature_emb_dims = [20] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            if self.feature_name[idx] in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
def map_string_2_id_open(string_list, name):
    string_id_list = []
    alphabet_string = Alphabet(name)
    for strings in string_list:
        ids = []
        for string in strings:
            ids.append(alphabet_string.get_index(string))
        string_id_list.append(ids)
    alphabet_string.close()
    return string_id_list, alphabet_string
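
# Hypothetical call showing the shape contract of map_string_2_id_open:
# a list of token lists in, a parallel list of id lists out, plus the now
# closed alphabet.
sent_ids, pos_alphabet = map_string_2_id_open([['NN', 'VB'], ['NN']], 'pos')
# sent_ids might be [[1, 2], [1]]; after close(), pos_alphabet.get_index('JJ')
# maps to the UNK id rather than growing the alphabet.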
def main():
    data_dir = 'tweets/hashtag_top100_smileys_tweets_{}.gz'
    output_dir_tweets = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.tweets.npy'
    output_dir_hashtags = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.hashtags.npy'
    outdir = 'parsed_tweets'

    alphabet_words = Alphabet(start_feature_id=0)
    alphabet_words.add('UNKNOWN_WORD_IDX')
    alphabet_words.add('DUMMY_WORD_IDX')
    dummy_word_idx = DUMMY_WORD_IDX

    alphabet_hashtags = Alphabet(start_feature_id=0)
    alphabet_hashtags.add('UNKNOWN_HASHTAG_IDX')

    for inp in ('train', 'test'):
        store_file(data_dir.format(inp), output_dir_tweets.format(inp),
                   alphabet_words, alphabet_hashtags, dummy_word_idx,
                   output_dir_hashtags.format(inp))

    # Binary mode ('wb') is required for pickle on Python 3.
    pickle.dump(alphabet_words, open(os.path.join(outdir, 'vocab_words.pickle'), 'wb'))
    pickle.dump(alphabet_hashtags, open(os.path.join(outdir, 'vocab_hashtags.pickle'), 'wb'))
def time_stamp_calc(self):
    time = floor(self.creation_time)
    # Seed value to compare against; here, the date this code was written.
    seed = 10012019
    alpha = Alphabet()
    alpha.shuffle()
    index = time % seed
    # digit_one = alpha[temp_time % seed]
    # alpha = alpha.shuffle()
    # digit_two = alpha[temp_time % seed]
    return index
def __init__(self):
    config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    config = AttrDict(config)
    self.config = config
    self.tokenizer = BertTokenizer.from_pretrained(config.bert_path, do_lower_case=False)
    if not os.path.exists(self.config.data_dir):
        os.makedirs(self.config.data_dir)
    self.CLS, self.SEP = config.CLS, config.SEP
    self.label_alphabet = Alphabet('label', padflag=False, unkflag=False,
                                   init_list=['O', self.CLS, self.SEP])