def output_radio_data_by_day_multi_month( data_file_1='/Users/ngillani/data/radio/2018_08_single_callsign_show_data.json', data_file_2='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json', start_date=datetime.date(2018, 8, 15), end_date=datetime.date(2018, 9, 15), output_file='data/radio_data_by_day_mid_aug_mid_sept.csv'): radio_data = [read_dict(data_file_1), read_dict(data_file_2)] cities_to_ids = {} all_data = {'id': [], 'text': []} ref_date = datetime.date(2018, 8, 15) all_days = set() for j in range(0, len(radio_data)): for i, s in enumerate(radio_data[j]): print(j, i) # if i == 100: break curr_date = datetime.datetime.utcfromtimestamp( s['segment_start_global']).date() if curr_date < start_date or curr_date > end_date: continue day_id = (curr_date - ref_date).days all_days.add(day_id) all_data['id'].append(day_id) all_data['text'].append(s['denorm_content']) print(all_days) df = pd.DataFrame(data=all_data) df.to_csv(output_file, index=False)
def get_image_files(image_dir, check=False): t = time.time() chinese_dict = read_dict(FLAGS.dict_text) words = list(chinese_dict.keys()) count = 0 image_tuples = [] for f in os.listdir(image_dir): try: if not f.endswith(('.gif', '.jpg', '.png')): continue fp = os.path.join(image_dir, f) if not os.path.isabs(fp): fp = os.path.abspath(fp) if not os.path.exists(fp): continue if check: Image.open(fp) #cv2.imread(fp) label = f.split('_')[1] if is_valid_char(label, words): os.remove(fp) continue if len(label) == 0: os.remove(fp) continue image_tuples.append((fp, label)) count += 1 except Exception as e: print("fn: %s, error: %s" % (fp, e)) os.remove(fp) te = time.time() - t print("cost time:%f, count:%d" % (te, len(image_tuples))) return image_tuples
def train_translation_matrix(source_file, target_file, dict_file, out_file): """Trains a translation matrix between the source and target languages, using the words in dict_file as anchor points and writing the translation matrix to out_file Note that the source language file and target language file must be in the word2vec C ASCII format :param source_file: The name of the source language file :param target_file: The name of the target language file :param dict_file: The name of the file with the bilingual dictionary :param out_file: The name of the file to write the translation matrix to """ log.info("Reading the training data") train_data = read_dict(dict_file) #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words, target_words = zip(*train_data) log.info("Reading: %s" % source_file) source_sp = Space.build(source_file, set(source_words)) source_sp.normalize() log.info("Reading: %s" % target_file) target_sp = Space.build(target_file, set(target_words)) target_sp.normalize() log.debug('Words in the source space: %s' % source_sp.row2id) log.debug('Words in the target space: %s' % target_sp.row2id) log.info("Learning the translation matrix") log.info("Training data: %s" % str(train_data)) tm = train_tm(source_sp, target_sp, train_data) log.info("Printing the translation matrix") np.savetxt(out_file, tm)
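# Note: the translation-matrix snippets above rely on a read_dict() helper that
# returns the seed dictionary as (source_word, target_word) pairs, since
# train_data is later unpacked with zip(*train_data). A minimal sketch of such
# a reader, assuming one whitespace-separated word pair per line (an
# illustrative assumption, not the original utility):
def read_dict_pairs(dict_file):
    """Read a bilingual seed dictionary into a list of (source, target) pairs."""
    pairs = []
    with open(dict_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 2:
                pairs.append((parts[0], parts[1]))
    return pairs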
def output_radio_data_by_geo( data_file='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json', output_file='data/radio_data_by_city_and_state.csv', output_mapping_file='data/radio_by_city_and_state_mapping.json'): radio_data = read_dict(data_file) cities_to_ids = {} all_data = {'id': [], 'text': []} for i, s in enumerate(radio_data): print(i) geo = s['city'] + ', ' + s['state'] if geo not in cities_to_ids: cities_to_ids[geo] = len(cities_to_ids) all_data['id'].append(cities_to_ids[geo]) all_data['text'].append(s['denorm_content']) df = pd.DataFrame(data=all_data) df.to_csv(output_file, index=False) f = open(output_mapping_file, 'w') f.write(json.dumps(cities_to_ids, indent=4)) f.close()
def output_radio_data_by_day( data_file='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json', output_file='data/radio_data_by_day.csv'): radio_data = read_dict(data_file) cities_to_ids = {} all_data = {'id': [], 'text': []} ref_date = datetime.date(2018, 9, 1) all_days = set() for i, s in enumerate(radio_data): print(i) # if i == 100: break curr_date = datetime.datetime.utcfromtimestamp( s['segment_start_global']).date() day_id = (curr_date - ref_date).days all_days.add(day_id) all_data['id'].append(day_id) all_data['text'].append(s['denorm_content']) print(all_days) df = pd.DataFrame(data=all_data) df.to_csv(output_file, index=False)
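# For reference, day_id above is simply the offset in days from ref_date; for
# example, a segment broadcast on 2018-09-04 gets day_id 3:
import datetime
assert (datetime.date(2018, 9, 4) - datetime.date(2018, 9, 1)).days == 3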
def get_image_files2(image_dir, check=False): t = time.time() im_names = [] # glob.glob(os.path.join(image_dir, '*.{jpg,png,gif}')) for ext in ('*.png', '*.jpg', '*.gif'): im_names.extend(glob.glob(os.path.join(image_dir, ext))) chinese_dict = read_dict(FLAGS.dict_text) words = list(chinese_dict.keys()) count = 0 image_tuples = [] for im_name in im_names: try: if not os.path.exists(im_name): continue if check: Image.open(im_name) # cv2.imread(fp) label = os.path.basename(im_name).split('_')[1] if is_valid_char(label, words): os.remove(im_name) continue if len(label) == 0: os.remove(im_name) continue image_tuples.append((im_name, label)) count += 1 except Exception as e: print("fn: %s, error: %s" % (im_name, e)) os.remove(im_name) te = time.time() - t print("cost time:%f, count:%d" % (te, len(image_tuples))) return image_tuples
def train_wrapper(seed_fn, source_fn, target_fn, reverse=False, mx_path=None, train_size=5000): logging.info("Training...") seed_trans = read_dict(seed_fn, reverse=reverse) #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words = set(seed_trans.keys()) target_words = set().union(*seed_trans.values()) source_sp = Space.build(source_fn, lexicon=source_words) source_sp.normalize() target_sp = Space.build(target_fn, lexicon=target_words) target_sp.normalize() logging.info("Learning the translation matrix") tm, used_for_train = train_tm(source_sp, target_sp, seed_trans, train_size) mx_path = default_output_fn(mx_path, seed_fn, source_fn, target_fn) logging.info("Saving the translation matrix to {}".format(mx_path)) np.save('{}.npy'.format(mx_path), tm) pickle.dump(used_for_train, open('{}.train_wds'.format(mx_path), mode='wb')) return tm, used_for_train
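# Artifacts written by train_wrapper() can be reloaded later; a small usage
# sketch mirroring the np.save / pickle.dump calls above (the 'tm' path prefix
# is illustrative, not from the original code):
import pickle
import numpy as np

tm = np.load('tm.npy')
with open('tm.train_wds', 'rb') as f:
    used_for_train = pickle.load(f)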
def _read_token(self): token_file = os.path.join(utils.get_user_home(), TOKEN_FILENAME) if os.path.exists(token_file): res = utils.read_dict(token_file) if res: self.id = res.get('douban_user_id') self.tk = res.get('access_token') return self.is_authorized()
def output_radio_data_by_day_multi_month_for_continuous_context( data_file_1='/Users/ngillani/data/radio/2018_08_single_callsign_show_data.json', data_file_2='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json', knot_size=3, start_date=datetime.date(2018, 8, 15), end_date=datetime.date(2018, 9, 15), output_file='data/radio_data_by_day_mid_aug_mid_sept_continuous.csv'): radio_data = [read_dict(data_file_1), read_dict(data_file_2)] # radio_data = [read_dict(data_file_1)] cities_to_ids = {} all_data = {'id': [], 'attr': [], 'standardized_attr': [], 'text': []} ref_date = datetime.date(2018, 8, 15) all_days = set() for j in range(0, len(radio_data)): for i, s in enumerate(radio_data[j]): print(j, i) # if i == 10000: break curr_date = datetime.datetime.utcfromtimestamp( s['segment_start_global']).date() if curr_date < start_date or curr_date > end_date: continue day_id = (curr_date - ref_date).days all_days.add(day_id) all_data['attr'].append(day_id) all_data['text'].append(s['denorm_content']) all_days = list(all_days) all_days_mean = np.mean(all_days) all_days_std = np.std(all_days) knots = list( range(np.min(all_days), np.max(all_days) + knot_size, knot_size)) for d in all_data['attr']: all_data['standardized_attr'].append( float(d - all_days_mean) / all_days_std) all_data['id'].append(get_knot_id(knots, d)) df = pd.DataFrame(data=all_data) df.to_csv(output_file, index=False) print(knots)
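# get_knot_id() is not defined in this excerpt; a plausible sketch (a
# hypothetical helper, not the author's code) assigns each day to the index of
# the knot interval [knots[k], knots[k + 1]) that contains it:
def get_knot_id(knots, day_id):
    """Return the index of the knot interval that contains day_id."""
    for k in range(len(knots) - 1):
        if knots[k] <= day_id < knots[k + 1]:
            return k
    return len(knots) - 2  # clamp values at or beyond the last knot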
def api_lookup(): n = int(request.args.get('n', 5)) assert n > 0, 'No pronunciations requested' voice = request.args.get('voice', None) profile = request_to_profile(request, profiles_dirs) ps_config = profile.speech_to_text['pocketsphinx'] espeak_config = profile.text_to_speech['espeak'] word = request.data.decode('utf-8').strip().lower() assert len(word) > 0, 'No word to look up' logging.debug('Getting pronunciations for %s' % word) # Load base and custom dictionaries base_dictionary_path = profile.read_path(ps_config['base_dictionary']) custom_path = profile.read_path(ps_config['custom_words']) word_dict = {} for word_dict_path in [base_dictionary_path, custom_path]: if os.path.exists(word_dict_path): with open(word_dict_path, 'r') as dictionary_file: utils.read_dict(dictionary_file, word_dict) result = utils.lookup_word(word, word_dict, profile, n=n) # Get phonemes from eSpeak espeak_command = ['espeak', '-q', '-x'] if voice is None: if 'voice' in espeak_config: # Use profile voice voice = espeak_config['voice'] elif 'language' in profile.json: # Use language default voice voice = profile.json['language'] espeak_command.extend(['-v', voice, word]) logging.debug(espeak_command) result['espeak_phonemes'] = subprocess.check_output( espeak_command).decode() return jsonify(result)
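# The pocketsphinx snippets call utils.read_dict(dictionary_file, word_dict) to
# merge a CMU-style pronunciation dictionary into word_dict in place. A minimal
# sketch of that behavior, assuming one "WORD  P H O N E M E S" entry per line
# with optional "WORD(2)" variant markers (an assumption about the format, not
# the original implementation):
def read_pronunciation_dict(dictionary_file, word_dict):
    """Merge word -> [pronunciations] entries from an open CMU-style dictionary file."""
    for line in dictionary_file:
        line = line.strip()
        if not line or line.startswith(';;'):
            continue
        parts = line.split(None, 1)
        if len(parts) != 2:
            continue
        word, pronunciation = parts
        if '(' in word:  # strip "(2)", "(3)" style variant markers
            word = word[:word.index('(')]
        word_dict.setdefault(word, []).append(pronunciation.strip())
    return word_dict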
def main(sys_argv): try: opts, argv = getopt.getopt(sys_argv[1:], "ho:", ["help", "output="]) except getopt.GetoptError as err: print(str(err)) usage() sys.exit(1) out_file = "./tm" for opt, val in opts: if opt in ("-o", "--output"): out_file = val elif opt in ("-h", "--help"): usage(0) else: usage(1) if len(argv) == 3: source_file = argv[1] target_file = argv[2] dict_file = argv[0] else: print("Wrong number of arguments") usage(1) print("Reading the training data") train_data = read_dict(dict_file) #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words, target_words = zip(*train_data) print("Reading: %s" % source_file) source_sp = Space.build(source_file, set(source_words)) source_sp.normalize() print("Reading: %s" % target_file) target_sp = Space.build(target_file, set(target_words)) target_sp.normalize() print("Learning the translation matrix") tm = train_tm(source_sp, target_sp, train_data) print("Printing the translation matrix") np.savetxt("%s.txt" % out_file, tm)
def __init__(self, root, dictionary_file): """ Contains the game logic variable initialisation. root -- the root window, dictionary_file -- the path to the file which contains the word dictionary. """ self.root = root self.gameBoard = ['-' * GRID_SIZE] * GRID_SIZE self.gameBoard[GRID_SIZE // 2] = '-' * (GRID_SIZE // 2) + '0' + '-' * (GRID_SIZE // 2) self.lg = LetterGenerator() self.sm = ScoreManager() self.dictionary = utils.read_dict(dictionary_file) self.active_player = 0 self.player1 = {'name': "Player1", 'letters': self.lg.draw(MAX_HAND_SIZE), 'score': 0} self.player2 = {'name': "Player2", 'letters': self.lg.draw(MAX_HAND_SIZE), 'score': 0}
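# utils.read_dict() in the game snippet above is assumed to load the playable
# word list; a minimal sketch under that assumption (one word per line,
# case-folded here purely for illustration):
def read_word_list(dictionary_file):
    """Load the set of valid words from a plain-text file, one word per line."""
    with open(dictionary_file, 'r', encoding='utf-8') as f:
        return {line.strip().lower() for line in f if line.strip()}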
plt.show() # shut down coord.request_stop() coord.join(threads) def write_dict(): cs = open("resource/gb2312_list.txt", 'r').read() index = 134 with open("resource/new_dic2.txt", 'a') as f: for c in cs: f.write("%d\t%c\n" % (index, c)) index = index + 1 #python gen_record.py --dataset_name=train --dataset_dir=out --dataset_nums=1024 --output_dir=datasets/train if __name__ == '__main__': chinese_dict = read_dict(FLAGS.dict_text) # make_tfrecord2(chinese_dict, FLAGS.dataset_name, FLAGS.dataset_nums) # write_dict() # words = open("resource/gb2312_list.txt", 'r').read() # print(words) parse_tfrecord_file() # # import datasets # print(getattr(datasets, "my_data")) pass
if len(argv) == 4: tm_file = argv[0] test_file = argv[1] source_file = argv[2] target_file = argv[3] else: print("Wrong number of arguments") usage(1) print("Loading the translation matrix") tm = np.loadtxt(tm_file) print("Reading the test data") test_data = read_dict(test_file) # in the _source_ space, we only need to load vectors for the words in test. # semantic spaces may contain additional words, ALL words in the _target_ # space are used as the search space source_words, _ = zip(*test_data) source_words = set(source_words) print("Reading: %s" % source_file) if not additional: source_sp = Space.build(source_file, source_words) else: # read all the words in the space lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, comments=None, usecols=(0,)).flatten()) # the max number of additional+test elements is bounded by the size
out_file = val elif opt in ("-h", "--help"): usage(0) else: usage(1) if len(argv) == 3: source_file = argv[1] target_file = argv[2] dict_file = argv[0] else: print("Wrong number of arguments") usage(1) print("Reading the training data") train_data = read_dict(dict_file) print(train_data) #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words, target_words = zip(*train_data) print("Reading: %s" % source_file) source_sp = Space.build(source_file, set(source_words)) source_sp.normalize() print("Reading: %s" % target_file) target_sp = Space.build(target_file, set(target_words)) target_sp.normalize() print("Learning the translation matrix") tm = train_tm(source_sp, target_sp, train_data)
args['train_data'] = 'data/cw' # training data path args['test_data'] = 'data/cw' # test data path args['batch_size'] = 64 # number of samples per training batch args['epoch'] = 10 # number of training epochs args['hidden_dim'] = 100 # dimensionality of the LSTM cell output args['optimizer'] = 'Adam' # optimizer used to minimize the loss args['lr'] = 0.001 # learning rate args['clip'] = 5.0 # gradient clipping threshold args['dropout'] = 0.5 # keep probability for dropout args['update_embedding'] = True # whether to update the embedding during training; after initialization, setting this to True lets the embedding be fine-tuned args['embedding_dim'] = 100 # embedding dimensionality args['shuffle'] = True # whether to shuffle the data each time it is fed into the LSTM for training # load the vocabulary that maps each character to an id; it was built from the training data word2id = read_dict(os.path.join('.', args['train_data'], 'word2id.pkl')) # randomly initialize the embedding embeddings = init_embedding(word2id, args['embedding_dim']) # set the model output path model_path = 'BLCM3' output_path = os.path.join('.', model_path) if not os.path.exists(output_path): os.makedirs(output_path) summary_path = os.path.join(output_path, "summaries") if not os.path.exists(summary_path): os.makedirs(summary_path) model_path = os.path.join(output_path, "checkpoints/") #if not os.path.exists(model_path): #os.makedirs(model_path)
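# init_embedding() is not defined in this excerpt; a common choice (assumed
# here, not taken from the original code) is a uniformly random matrix with one
# row per vocabulary entry, which can then be fine-tuned when
# args['update_embedding'] is True:
import numpy as np

def init_embedding(word2id, embedding_dim):
    """Randomly initialize an embedding matrix of shape (vocab_size, embedding_dim)."""
    embedding = np.random.uniform(-0.25, 0.25, (len(word2id), embedding_dim))
    return np.float32(embedding)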
TEST_FILENAME_TO_SEGMENT_DATA = os.path.join(DIR_PATH, 'data/filename_to_segment_ids.csv') parser = argparse.ArgumentParser() parser.add_argument("file_path", type=Path) p = parser.parse_args() # Load model if p.file_path.exists(): model = load_model(p.file_path.as_posix()) model.summary() else: exit("The given file path does not exist: %s" % p.file_path) # Data generator for test input_dim = 400 partition = read_dict(PARTITION_PATH) test_generator = BreakfastActionTestDataGenerator(partition['testing'], batch_size=1, input_dim=input_dim) # Predict using model (returns probabilities) print("Getting predictions...") predictions = model.predict_generator(test_generator, use_multiprocessing=True, workers=4, verbose=2) model_name = p.file_path.as_posix().split("runs/", 1)[1] # model name will have the .hdf5 extension timestr = time.strftime("_%Y%m%d_%H%M%S") # Save raw predictions
checkpoint_filename = "./runs/frame-simple-dnn-{epoch:02d}-{val_categorical_accuracy:.2f}.hdf5" checkpoint = ModelCheckpoint(checkpoint_filename, save_best_only=True, mode='min', monitor='val_loss', verbose=1) callbacks_list = [checkpoint] # Compile model model.compile(adagrad, loss='categorical_crossentropy', metrics=['categorical_accuracy']) model.summary() # Load indices for training, testing, and validation partition = read_dict(PARTITION_PATH) # Load labels labels = read_dict(VIDEO_LABELS_PATH) # Data generators for train/validation training_generator = BreakfastActionTrainDataGenerator(partition['training'], labels=labels, batch_size=batch_size, input_dim=input_dim, output_dim=output_dim, shuffle=True) validation_generator = BreakfastActionTrainDataGenerator( partition['validation'], labels=labels, batch_size=batch_size,
def train(profile): stt_config = profile.speech_to_text intent_config = profile.intent # Load sentence templates, write examples sentences_ini_path = profile.read_path(stt_config['sentences_ini']) # Load from ini file and write to examples file words_needed = set() sentences_by_intent = defaultdict(list) grammars_dir = profile.write_dir(stt_config['grammars_dir']) with open(sentences_ini_path, 'r') as sentences_ini_file: grammar_paths = jsgf_utils.make_grammars(sentences_ini_file, grammars_dir) # intent -> sentence templates tagged_sentences = jsgf_utils.generate_sentences(grammar_paths) for intent_name, intent_sents in tagged_sentences.items(): for intent_sent in intent_sents: # Template -> untagged sentence + entities sentence, entities = utils.extract_entities(intent_sent) # Split sentence into words (tokens) sentence, tokens = sanitize_sentence(sentence, profile.training) sentences_by_intent[intent_name].append((sentence, entities)) # Collect all used words words_needed.update(tokens) # Load base and custom dictionaries ps_config = stt_config['pocketsphinx'] base_dictionary_path = profile.read_path(ps_config['base_dictionary']) custom_path = profile.read_path(ps_config['custom_words']) word_dict = {} for word_dict_path in [base_dictionary_path, custom_path]: if os.path.exists(word_dict_path): with open(word_dict_path, 'r') as dictionary_file: utils.read_dict(dictionary_file, word_dict) # Add words from wake word if using pocketsphinx if profile.wake.get('system') == 'pocketsphinx': wake_config = profile.wake['pocketsphinx'] wake_keyphrase = wake_config['keyphrase'] _, wake_tokens = sanitize_sentence(wake_keyphrase, profile.training) words_needed.update(wake_tokens) # Check for unknown words unknown_words = words_needed - word_dict.keys() unknown_path = profile.read_path(ps_config['unknown_words']) if len(unknown_words) > 0: with open(unknown_path, 'w') as unknown_file: for word in unknown_words: result = utils.lookup_word(word, word_dict, profile, n=1) pronounces = result['pronunciations'] phonemes = ' '.join(pronounces) # Dictionary uses upper-case letters if stt_config.get('dictionary_upper', False): word = word.upper() else: word = word.lower() print(word.lower(), phonemes, file=unknown_file) raise RuntimeError('Training failed due to %s unknown word(s)' % len(unknown_words)) elif os.path.exists(unknown_path): # Remove unknown dictionary os.unlink(unknown_path) # Write out dictionary with only the necessary words (speeds up loading) dictionary_path = profile.write_path(ps_config['dictionary']) with open(dictionary_path, 'w') as dictionary_file: for word in sorted(words_needed): for i, pronounce in enumerate(word_dict[word]): if i < 1: print(word, pronounce, file=dictionary_file) else: print('%s(%s)' % (word, i+1), pronounce, file=dictionary_file) logging.debug('Wrote %s word(s) to %s' % (len(words_needed), dictionary_path)) # Repeat sentences so that all intents will contain the same number balance_sentences = profile.training.get('balance_sentences', True) if balance_sentences: # Use least common multiple lcm_sentences = utils.lcm(*(len(sents) for sents in sentences_by_intent.values())) else: lcm_sentences = 0 # no repeats # Write sentences to text file sentences_text_path = profile.write_path(stt_config['sentences_text']) with open(sentences_text_path, 'w') as sentences_text_file: num_sentences = 0 for intent_name, intent_sents in sentences_by_intent.items(): num_repeats = max(1, lcm_sentences // len(intent_sents)) for sentence, slots in intent_sents: for i in range(num_repeats): 
print(sentence, file=sentences_text_file) num_sentences = num_sentences + 1 logging.debug('Wrote %s sentence(s) to %s' % (num_sentences, sentences_text_path)) # Generate ARPA language model lm = train_speech_recognizer(profile) # Save to profile lm_path = profile.write_path(ps_config['language_model']) with open(lm_path, 'w') as lm_file: lm_file.write(lm) # Expand sentences for intent recognizer intent_system = profile.intent.get('system', 'fuzzywuzzy') if intent_system == 'rasa': rasa_config = profile.intent[intent_system] # Use rasaNLU examples_md_path = profile.write_path(rasa_config['examples_markdown']) with open(examples_md_path, 'w') as examples_md_file: for intent_name, intent_sents in tagged_sentences.items(): # Rasa Markdown training format print('## intent:%s' % intent_name, file=examples_md_file) for intent_sent in intent_sents: print('-', intent_sent, file=examples_md_file) print('', file=examples_md_file) # Train rasaNLU project_dir = profile.write_dir(rasa_config['project_dir']) project_name = rasa_config['project_name'] rasa_config_path = profile.read_path(rasa_config['config']) train_intent_recognizer(examples_md_path, rasa_config_path, project_dir, project_name) else: fuzzy_config = profile.intent[intent_system] # Use fuzzywuzzy examples_path = profile.write_path(fuzzy_config['examples_json']) examples = intent.make_examples(profile, sentences_by_intent) with open(examples_path, 'w') as examples_file: json.dump(examples, examples_file, indent=4)
index = [i for i in range(len(iterm)) if iterm[i] == 1] if len(index) == 0: result.append(['unk']) elif len(index) == 1: result.append([label[index[0]]]) else: temp = [label[i] for i in index] result.append(['|'.join(temp)]) return result if __name__ == "__main__": args = get_args() df_test = pd.read_csv(args["test_file"]) test_data = df_test["content"].tolist() label = read_dict(args["labeldict"])["label"] pred = main(test_data, args, len(label)) print("pred: ", pred[:2]) pred_encoder = label_encoder(pred, label) print("label encoder: ", pred_encoder[:3]) pred_df = pd.DataFrame(pred_encoder, columns=["pred"]) df_test = pd.concat([df_test, pred_df], axis=1) #df_test.append(pred_df, ignore_index=True) df_test = df_test[["label", "pred", "content"]] df_test.to_csv("./output/test_predict.csv", index=False, header=True, encoding="utf-8-sig") # test_data = ["昨天18:30,陕西宁强县胡家坝镇向家沟村三组发生山体坍塌,5人被埋。当晚,3人被救出,其中1人在医院抢救无效死亡," # "2人在送医途中死亡。今天凌晨,另外2人被发现,已无生命迹象。"]
Count the number of single-character words Args: word_list: list, the list of words Return: count: int, the number of single-character words """ count = 0 for word in word_list: if len(word) == 1: count += 1 return count if __name__ == '__main__': words_dict = read_dict('./data/assign/dic_ce.txt') max_len = len(max(words_dict, key=len)) test = "我正在上自然语言处理课。" segment = MaxSegmentation(words_dict, test, max_len) fstart = time.time() f_result = segment.ForwardMM() fend = time.time() print("ForwardMM: {}, running time: {} s".format(f_result, str(fend - fstart))) rstart = time.time() r_result = segment.ReverseMM() rend = time.time() print("ReverseMM: {}, running time: {} s".format(r_result, str(rend - rstart))) bistart = time.time() bi_result = segment.BMM(f_result, r_result)
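# The MaxSegmentation class is not shown above; a minimal sketch of forward
# maximum matching (the idea behind ForwardMM), assuming words_dict supports
# membership tests and max_len is the length of the longest dictionary entry:
def forward_maximum_matching(words_dict, text, max_len):
    """Greedily segment text left to right, preferring the longest dictionary match."""
    result, i = [], 0
    while i < len(text):
        for size in range(min(max_len, len(text) - i), 0, -1):
            candidate = text[i:i + size]
            if size == 1 or candidate in words_dict:
                result.append(candidate)
                i += size
                break
    return result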
def main(): args = get_args() train_df = pd.read_csv(args["train_file"]) train_df = shuffle(train_df) train_datas = train_df["content"].tolist() train_label_total = train_df["label"].unique().tolist() print("total data size: {}".format(len(train_datas))) # get label dict, building it from the training labels if it does not exist yet label_list = read_dict(args["labeldict"])["label"] if os.path.exists(args["labeldict"]) else [] if not os.path.exists(args["labeldict"]): for label in train_label_total: if "|" in label: temp = label.split("|") for item in temp: if item not in label_list: label_list.append(item) else: if label not in label_list: label_list.append(label) print("label cate size: {}".format(len(label_list))) label_dict = {"label": label_list} with open(args["labeldict"], "w", encoding="utf-8") as f: f.write(json.dumps(label_dict, ensure_ascii=False, indent=4)) # label encoder train_labels = label_encoder(train_df["label"].tolist(), label_list) train_data, val_data, train_label, val_label = train_test_split( train_datas, train_labels, test_size=0.2, random_state=0) print("train data size: {}".format(len(train_data))) print("val data size: {}".format(len(val_data))) tokenizer = get_tokenizer(args["bert_model_name"], args["pretrain_model_path"]) train_x, train_y = get_model_data(train_data, train_label, tokenizer, args["max_length"]) val_x, val_y = get_model_data(val_data, val_label, tokenizer, args["max_length"]) model = create_model(args["bert_model_name"], len(label_list)) if not os.path.exists(args["model_path"]): os.makedirs(args["model_path"]) if not os.path.exists(args["pbmodel_path"]): os.makedirs(args["pbmodel_path"]) # save the best model during training as a TensorFlow pb (SavedModel) checkpoint callbacks = [ tf.keras.callbacks.ModelCheckpoint( # Path where to save the model # The two parameters below mean that we will overwrite # the current checkpoint if and only if # the `val_loss` score has improved. # The saved model name will include the current epoch. filepath=args["model_path"], # {epoch} save_best_only=True, # Only save a model if `val_loss` has improved. monitor='val_auc', # 'accuracy', verbose=1, mode='max') ] model.fit(train_x, train_y, epochs=args["epoch"], verbose=1, batch_size=args["batch_size"], callbacks=callbacks, validation_data=(val_x, val_y), validation_batch_size=args["batch_size"]) model_path = os.path.join("./output/model/", "mulclassifition.h5") model.save_weights(model_path) tf.keras.models.save_model(model, args["pbmodel_path"], save_format="tf", overwrite=True)
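# label_encoder() is shared with the prediction script above; judging from the
# decoder there (which looks for 1s and joins label names with '|'), a
# plausible multi-hot implementation for the training side looks like this
# (a sketch, not the original code):
def multi_hot_encode(labels, label_list):
    """Turn '|'-joined label strings into multi-hot vectors over label_list."""
    encoded = []
    for label in labels:
        vec = [0] * len(label_list)
        for item in str(label).split("|"):
            if item in label_list:
                vec[label_list.index(item)] = 1
        encoded.append(vec)
    return encoded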
if len(argv) == 4: tm_file = argv[0] test_file = argv[1] source_file = argv[2] target_file = argv[3] else: print("Wrong number of arguments") usage(1) print("Loading the translation matrix") tm = np.loadtxt(tm_file) print("Reading the test data") test_data = read_dict(test_file) #in the _source_ space, we only need to load vectors for the words in test. #semantic spaces may contain additional words, ALL words in the _target_ #space are used as the search space source_words, _ = zip(*test_data) source_words = set(source_words) print("Reading: %s" % source_file) if not additional: source_sp = Space.build(source_file, source_words) else: #read all the words in the space lexicon = set( np.loadtxt(source_file, skiprows=1,
#! /usr/bin/env python3 import csv from utils import read_dict, write_md filename = "dict.csv" rows = read_dict(filename) write_md("dict.md", rows)
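# write_md() from utils is not shown; a minimal sketch that renders the rows as
# a Markdown table, assuming rows is a list of equal-length rows whose first
# entry is the header (an assumption about the CSV layout, not the original helper):
def write_md_table(out_path, rows):
    """Write rows to out_path as a simple Markdown table."""
    with open(out_path, 'w', encoding='utf-8') as f:
        header, *body = rows
        f.write('| ' + ' | '.join(header) + ' |\n')
        f.write('|' + ' --- |' * len(header) + '\n')
        for row in body:
            f.write('| ' + ' | '.join(str(cell) for cell in row) + ' |\n')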
PARTITION_PATH = os.path.join(DIR_PATH, 'data/segment_partition.csv') parser = argparse.ArgumentParser() parser.add_argument("file_path", type=Path) p = parser.parse_args() # Load model if p.file_path.exists(): model = load_model(p.file_path.as_posix()) model.summary() else: exit("The given file path does not exist: %s" % p.file_path) # Data generator for test input_dim = 400 partition = read_dict(PARTITION_PATH) test_generator = BreakfastActionTestDataGenerator(partition['testing'], batch_size=1, input_dim=input_dim) # Predict using model (returns probabilities) print("Getting predictions...") predictions = model.predict_generator(test_generator, use_multiprocessing=True, workers=4, verbose=2) # Save raw predictions model_name = p.file_path.as_posix().split("runs/", 1)[1] # model name will have the .hdf5 extension timestr = time.strftime("%Y%m%d_%H%M%S") print("Writing predictions...")
def main(sys_argv): try: opts, argv = getopt.getopt(sys_argv[1:], "ho:c:", ["help", "output=", "correction="]) except getopt.GetoptError as err: print(str(err)) usage() sys.exit(1) out_file = "./translated_vecs" additional = None for opt, val in opts: if opt in ("-o", "--output"): out_file = val elif opt in ("-c", "--correction"): try: additional = int(val) except ValueError: usage(1) elif opt in ("-h", "--help"): usage(0) else: usage(1) if len(argv) == 4: tm_file = argv[0] test_file = argv[1] source_file = argv[2] target_file = argv[3] else: # print(str(err)) usage(1) print("Loading the translation matrix") tm = np.loadtxt(tm_file) print("Reading the test data") test_data = read_dict(test_file) #in the _source_ space, we only need to load vectors for the words in test. #semantic spaces may contain additional words, ALL words in the _target_ #space are used as the search space source_words, _ = zip(*test_data) source_words = set(source_words) print("Reading: %s" % source_file) if not additional: source_sp = Space.build(source_file, source_words) else: #read all the words in the space lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, comments=None, usecols=(0,)).flatten()) #the max number of additional+test elements is bounded by the size #of the lexicon additional = min(additional, len(lexicon) - len(source_words)) #we sample additional elements that are not already in source_words random.seed(100) lexicon = random.sample(list(lexicon.difference(source_words)), additional) #load the source space source_sp = Space.build(source_file, source_words.union(set(lexicon))) source_sp.normalize() print("Reading: %s" % target_file) target_sp = Space.build(target_file) target_sp.normalize() print("Translating") #translates all the elements loaded in the source space mapped_source_sp = apply_tm(source_sp, tm) print("Retrieving translations") test_data = get_valid_data(source_sp, target_sp, test_data) #turn test data into a dictionary (a word can have multiple translations) gold = collections.defaultdict(set) for k, v in test_data: gold[k].add(v) score(mapped_source_sp, target_sp, gold, additional) print("Printing mapped vectors: %s" % out_file) np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat) np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
checkpoint_filename = "./runs/segment-lstm-{epoch:02d}-{val_categorical_accuracy:.2f}.hdf5" checkpoint = ModelCheckpoint(checkpoint_filename, monitor='val_loss', save_best_only=True, mode='min', verbose=1) callbacks_list = [checkpoint] # Compile model (use default Adam optimizer) model.compile('adam', loss='categorical_crossentropy', metrics=['categorical_accuracy']) model.summary() # Load indices for training, testing, and validation partition = read_dict(PARTITION_PATH) # Load labels labels = read_dict(SEGMENT_LABELS_PATH) # Data generators for train/validation training_generator = BreakfastActionTrainDataGenerator(partition['training'], labels=labels, batch_size=batch_size, input_dim=input_dim, output_dim=output_dim, shuffle=True) validation_generator = BreakfastActionTrainDataGenerator( partition['validation'], labels=labels, batch_size=batch_size,
def main(sys_argv): try: opts, argv = getopt.getopt(sys_argv[1:], "ho:c:l:m:1:2:t:a:v:", [ "help", "output=", "correction=", "levenshtein=", "matrix=", "1=", "2=", "topK=", "alpha=", "verbosity=" ]) except getopt.GetoptError as err: print(str(err)) usage() sys.exit(1) out_file = "./translated_vecs" additional = None levcosts = {} for opt, val in opts: # print(opt+'='+val) if opt in ("-o", "--output"): out_file = val elif opt in ("-l", "--levenshtein"): levcosts = u.readcosts(val) elif opt in ("-m", "--matrix"): tm_file = val elif opt == '-1': source_file = val elif opt == '-2': target_file = val elif opt in ("-c", "--correction"): try: additional = int(val) except ValueError: print("additional: %s" % val) usage(1) elif opt in ("-t", "--topK"): try: u.topK = int(val) except ValueError: print("topK: %s" % val) usage(1) elif opt in ("-v", "--verbosity"): try: u.verbosity = int(val) except ValueError: print("verbosity: %s" % val) usage(1) elif opt in ("-a", "--alpha"): try: u.alpha = float(val) except ValueError: print("alpha: %s" % val) usage(1) elif opt in ("-h", "--help"): usage(0) else: print("Unknown option: -%s %s" % (opt, val)) usage(1) if len(argv) == 1: test_file = argv[0] else: print('Unused arguments:') print(argv) usage(1) #if u.verbosity>0: # always log the parameters in the output sys.stdout.write(sys_argv[0] + " ") for opt, val in opts: sys.stdout.write(opt + " " + val + " ") print(test_file) if u.verbosity > 1: print("Loading the translation matrix %s " % tm_file) tm = np.loadtxt(tm_file) if u.verbosity > 1: print("Reading the test data %s " % test_file) test_data = u.read_dict(test_file) #in the _source_ space, we only need to load vectors for the words in test. #semantic spaces may contain additional words, ALL words in the _target_ #space are used as the search space source_words, _ = zip(*test_data) source_words = set(source_words) if u.verbosity > 1: print("Reading: %s" % source_file) if not additional: source_sp = Space.build(source_file, source_words) else: #read all the words in the space with io.open(source_file, 'r', encoding='utf8') as f: lexicon = set([l.split(' ')[0] for l in f]) # lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, # comments=None, usecols=(0,)).flatten()) #the max number of additional+test elements is bounded by the size #of the lexicon additional = min(additional, len(lexicon) - len(source_words)) #we sample additional elements that are not already in source_words random.seed(100) if additional > 0: lexicon = random.sample(list(lexicon.difference(source_words)), additional) #load the source space source_sp = Space.build(source_file, source_words.union(set(lexicon))) source_sp.normalize() if u.verbosity > 1: print("Reading: %s" % target_file) target_sp = Space.build(target_file) target_sp.normalize() if u.verbosity > 1: print("Retrieving translations") test_data = u.get_valid_data(source_sp, target_sp, test_data) #turn test data into a dictionary (a word can have multiple translations) gold = collections.defaultdict(set) for k, v in test_data: gold[k].add(v) if u.verbosity > 1: print("Translating") #translates all the elements loaded in the source space source_sp = u.apply_tm(source_sp, tm) u.score(source_sp, target_sp, gold, additional, levcosts) print("Printing mapped vectors: %s" % out_file) np.savetxt("%s.vecs.txt" % out_file, source_sp.mat) # np.savetxt("%s.wds.txt" % out_file, source_sp.id2row, fmt="%s") # no utf8 with open("%s.wds.txt" % out_file, "w") as outf: for s in source_sp.id2row: print(s, file=outf)
line = line.strip() if (len(line) == 0) or line.startswith('#'): continue # skip blanks and comments parts = line.split(' ', maxsplit=1) phonemes[parts[0]] = parts[1] # Load dictionaries dictionary_files = config['training']['dictionary_files'] word_dict = defaultdict(set) for dict_path in dictionary_files: if not os.path.exists(dict_path): continue logging.debug('Loading dictionary from %s' % dict_path) read_dict(dict_path, word_dict) # --------------------------------------------------------------------- # Create web server app = Flask('rhasspy', template_folder=os.path.join('web', 'templates')) app.secret_key = str(uuid4()) # Automatically reload template files if they're changed on disk. # Used for debugging/development. app.config['TEMPLATES_AUTO_RELOAD'] = True # --------------------------------------------------------------------- # Static Routes # ---------------------------------------------------------------------
usage(0) else: usage(1) if len(argv) == 5: source_file = argv[1] target_file = argv[2] test_file = argv[3] model = eval(argv[4]) dict_file = argv[0] else: print("Wrong number of arguments") usage(1) print("Reading the training data") train_data = read_dict(dict_file) print(train_data) #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words, target_words = zip(*train_data) print("Reading: %s" % source_file) source_sp = Space.build(source_file, set(source_words)) source_sp.normalize() print("Reading: %s" % target_file) target_sp = Space.build(target_file, set(target_words)) target_sp.normalize() print("Learning the translation matrix") tm = train_tm_model(source_sp, target_sp, train_data, model)
elif opt in ("-h", "--help"): usage(0) else: usage(1) if len(argv) == 3: source_file = argv[1] target_file = argv[2] dict_file = argv[0] else: print("Wrong number of arguments") usage(1) print("Reading the training data") train_data = read_dict(dict_file) #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words, target_words = zip(*train_data) print("Reading: %s" % source_file) source_sp = Space.build(source_file, set(source_words)) source_sp.normalize() print("Reading: %s" % target_file) target_sp = Space.build(target_file, set(target_words)) target_sp.normalize() print("Learning the translation matrix") print("Training data: %s" % str(train_data))