import torch
import scipy.io as sio


def __init__(self, train_dist_path, test_dist_path, dis_phone_num, prob_trans_path):
    # Train distribution file: line 1 holds per-phone probabilities,
    # line 2 per-phone variances; fields [3:-1] are the values.
    with open(train_dist_path, 'r') as f:
        # train_prob
        ln = f.readline()
        temp_prob_str = ln.split(' ')[3:-1]
        self.train_prob = torch.FloatTensor(list(map(float, temp_prob_str)))
        # var
        ln = f.readline()
        temp_var_str = ln.split(' ')[3:-1]
        self.train_var = torch.FloatTensor(list(map(float, temp_var_str)))

    # test_prob
    with open(test_dist_path, 'r') as f:
        ln = f.readline()
        temp_prob_str = ln.split(' ')[3:-1]
        self.test_prob = torch.FloatTensor(list(map(float, temp_prob_str)))
        # var
        ln = f.readline()
        temp_var_str = ln.split(' ')[3:-1]
        self.test_var = torch.FloatTensor(list(map(float, temp_var_str)))

    # Per-phone test/train probability ratio, clipped from above at 1.
    length = len(self.test_prob)
    self.prob_ratio = torch.min(self.test_prob / self.train_prob, torch.ones(length))
    print('self.prob_ratio:')
    print(self.prob_ratio)
    assert max(self.prob_ratio) <= 1, 'The prob ratio is larger than 1!?'
    assert min(self.test_var) > 0, 'The min of test var is <= 0!?'
    self.std_var_ratio = torch.sqrt(self.test_var / self.train_var)
    print('self.std_var_ratio:')
    print(self.std_var_ratio)
    print('self.test_var.sum():')
    print(self.test_var.sum())

    # The first 4 entries are zero (unvoiced parts).
    self.phone_prob = torch.FloatTensor(
        [0, 0, 0, 0,
         0.0202, 0.0522, 0.0917, 0.0153, 0.0088, 0.0483, 0.0130, 0.0048,
         0.0290, 0.0212, 0.0249, 0.0177, 0.0240, 0.0146, 0.0093, 0.0194,
         0.0490, 0.0457, 0.0050, 0.0296, 0.0367, 0.0407, 0.0530, 0.0114,
         0.0416, 0.0011, 0.0124, 0.0302, 0.0457, 0.0073, 0.0571, 0.0064,
         0.0047, 0.0249, 0.0123, 0.0191, 0.0287, 0.0230, 0.0002])
    self.prob_sum = self.phone_prob.sum()
    phone_num = len(self.phone_prob)
    assert dis_phone_num >= 0, 'dis_phone_num must be non-negative!'
    self.dis_phone_num = dis_phone_num
    threshold_prob = self.prob_sum / ((phone_num - dis_phone_num) * 0.8)
    # Elementwise max: floor the upper ratio at 0.2.
    self.prob_ratio_upper = torch.max(self.phone_prob / threshold_prob,
                                      0.2 * torch.ones(phone_num))

    # Load the GMM params.
    temp_mat = sio.loadmat(prob_trans_path)
    self.mu_ratio = torch.from_numpy(temp_mat['mu_ratio']).float()
    # weight cumulative sum
    self.comp_wcum = torch.from_numpy(temp_mat['comp_wcum']).float()
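# A minimal sketch of the parsing above (hypothetical line layout,
# inferred from the ln.split(' ')[3:-1] slicing): three label tokens,
# then the values, then a trailing token that gets dropped.
line = 'phone prob : 0.1 0.2 0.7 \n'
values = torch.FloatTensor(list(map(float, line.split(' ')[3:-1])))
print(values)  # tensor([0.1000, 0.2000, 0.7000])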
def guess_dim(fname):
    """
    This function guesses the dimensionality of the embedding.
    There can be two types of files: with a header (vocab size and dim
    separated by a space) and without a header (the first line is the
    first embedding vector).
    """
    with open(fname) as F:
        first_line = F.readline().strip()
    p = first_line.split()
    if len(p) == 2:
        # this is the header line
        return int(p[1])
    else:
        # this is the first embedding vector
        return len(p) - 1  # first element is the word itself
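# Quick usage sketch (file names are made up) exercising both branches:
with open('toy_noheader.vec', 'w') as f:
    f.write('hello 0.1 0.2 0.3\n')          # no header: word + 3 values
with open('toy_header.vec', 'w') as f:
    f.write('1 3\nhello 0.1 0.2 0.3\n')     # header: "<vocab> <dim>"
print(guess_dim('toy_noheader.vec'))  # -> 3
print(guess_dim('toy_header.vec'))    # -> 3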
def load_embeddings(self, words, embedding_file):
    """Load pretrained embeddings for a given list of words, if they exist.

    Args:
        words: iterable of tokens. Only those that are indexed in the
            dictionary are kept.
        embedding_file: path to text file of embeddings, space separated.
    """
    emb_layer = self.network.embedder.word_embeddings
    words = {w for w in words if w in self.src_dict}
    logger.info('Loading pre-trained embeddings for %d words from %s' %
                (len(words), embedding_file))

    # When normalized, some words are duplicated; average their embeddings.
    vec_counts, embedding = {}, {}
    with open(embedding_file, encoding='utf8') as f:
        # Skip the first line if it is a "count dim" header.
        line = f.readline().rstrip().split(' ')
        if len(line) != 2:
            f.seek(0)

        duplicates = set()
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            parsed = line.rstrip().split(' ')
            assert len(parsed) == emb_layer.word_vec_size + 1
            w = self.src_dict.normalize(parsed[0])
            if w in words:
                vec = torch.Tensor([float(i) for i in parsed[1:]])
                if w not in vec_counts:
                    vec_counts[w] = 1
                    embedding[w] = vec
                else:
                    duplicates.add(w)
                    vec_counts[w] = vec_counts[w] + 1
                    embedding[w].add_(vec)

        if len(duplicates) > 0:
            logger.warning('WARN: Duplicate embedding found for %s' %
                           ', '.join(duplicates))

        for w, c in vec_counts.items():
            embedding[w].div_(c)

    emb_layer.init_word_vectors(self.src_dict, embedding, self.args.fix_embeddings)
    logger.info('Loaded %d embeddings (%.2f%%)' %
                (len(vec_counts), 100 * len(vec_counts) / len(words)))
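# `count_file_lines` is referenced above but not defined in this snippet.
# A minimal sketch of what it presumably does (count lines so tqdm can
# report progress); the real helper may differ.
def count_file_lines(file_path):
    """Return the number of lines in a text file."""
    with open(file_path, encoding='utf8') as f:
        return sum(1 for _ in f)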
import pandas as pd


def read_ann(file_p):
    """Read a tab-separated annotation file into a DataFrame."""
    anns = []
    with open(file_p, 'r') as F:
        while True:
            line = F.readline()
            if line == '':    # EOF
                break
            if line == '\n':  # skip blank lines
                continue
            ann = line.strip().split('\t')
            anns.append(ann)
    header_list = ["pmid", "off_a", "off_b", "name", "type", "id", 's_i']
    anns = pd.DataFrame(anns, columns=header_list)
    return anns
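# Usage sketch with a synthetic annotation file (hypothetical values;
# the seven tab-separated columns follow header_list above).
with open('toy.ann', 'w') as f:
    f.write('12345\t0\t7\taspirin\tChemical\tMESH:D001241\t0\n')
    f.write('\n')  # blank lines are skipped
    f.write('12345\t20\t28\theadache\tDisease\tMESH:D006261\t1\n')
df = read_ann('toy.ann')
print(df[['name', 'type']])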