def get_embs_labels(k, allembs, allcnts, reverse_dict, plot_only):
    embs_list = []
    labels_list = []
    norm_cnts, mask = filter_words(allcnts, 0.25)
    is_untrained_embs = np.count_nonzero(norm_cnts) == 0
    for i in range(k):
        embs = []
        labels = []
        for j in plot_only:
            if mask[j][i] or is_untrained_embs:
                embs.append(allembs[i][j, :])
        embs_list.append(embs)
        if k == 1:
            for j in plot_only:
                if mask[j][i] or is_untrained_embs:
                    labels.append(
                        reverse_dict[j] + '_{}'.format(int(allcnts[j][i])))
        else:
            for j in plot_only:
                if mask[j][i] or is_untrained_embs:
                    labels.append(
                        reverse_dict[j] + '_' + str(i)
                        + '_{}'.format(int(allcnts[j][i])))
        labels_list += labels
    return embs_list, labels_list
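# --- Illustrative usage sketch (added for clarity, not part of the original
# source). It shows one way get_embs_labels() could feed a t-SNE projection and
# a labelled scatter plot. The stand-in filter_words() below and all of the
# dummy data are hypothetical placeholders, not the original project's helpers.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


def filter_words(allcnts, frac):
    # Hypothetical stand-in: normalise counts per embedding matrix and keep
    # words whose normalised count reaches `frac`.
    norm_cnts = np.asarray(allcnts, dtype=float)
    norm_cnts = norm_cnts / np.maximum(norm_cnts.max(axis=0, keepdims=True), 1)
    return norm_cnts, norm_cnts >= frac


if __name__ == "__main__":
    vocab_size, emb_dim, k = 100, 64, 1
    allembs = [np.random.randn(vocab_size, emb_dim)]       # k embedding matrices
    allcnts = np.random.randint(1, 500, (vocab_size, k))   # word counts per matrix
    reverse_dict = {j: "word{}".format(j) for j in range(vocab_size)}
    plot_only = range(50)                                   # plot the first 50 words

    embs_list, labels = get_embs_labels(k, allembs, allcnts, reverse_dict, plot_only)
    low_dim = TSNE(n_components=2, perplexity=5, init="pca",
                   random_state=0).fit_transform(np.array(embs_list[0]))
    plt.scatter(low_dim[:, 0], low_dim[:, 1])
    for label, (x, y) in zip(labels, low_dim):
        plt.annotate(label, xy=(x, y), fontsize=6)
    plt.show()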
def main():
    parser = get_parser()
    args = parser.parse_args()

    if args.doc:
        print __doc__
        sys.exit()

    g = geosearchclass.GeoSearchClass()

    if args.params:
        print 'Using parameters from ' + str(args.params)
        # turn parameter file into dictionary
        g.set_params_from_file(args.params)

    if args.address:
        print "Finding geocoordinates for address:\n{}".format(args.address)
        coords = geo_converter.get_geocoords_from_address(args.address)
        if coords:
            g.latitude = coords[0]
            print "Found this latitude:"
            print g.latitude
            g.longitude = coords[1]
            print "Found this longitude:"
            print g.longitude
        else:
            print "Failed to find coordinates. Exiting."
            sys.exit()

    if args.input:
        text = utils.load_file(args.input)
        tokens = utils.tokenize_normal_words(text)
        for_poem = utils.filter_words(tokens)
    else:
        for_poem = get_default_words()

    if args.markov:
        if args.input:
            raise StandardError("Can only input a single text file. "
                                "Use --markov <your_text_file.txt>")
        else:
            text = utils.load_file(args.markov)
            # ngram = ngrams.make_ngram(text, 2)
            ngram = ngrams.make_bigram_trigram_dictionary(text)
            formatted_poem = create_poem(g, for_poem, ngram)
    else:
        formatted_poem = create_poem(g, for_poem)

    if args.output:
        print '\nwriting formatted poem to ' + str(args.output)
        output_file = args.output
    else:
        print "\nwriting formatted poem to poem.txt"
        output_file = "poem.txt"
    utils.save_file(output_file, formatted_poem)
async def inline_handler(inline_query: types.InlineQuery):
    text = inline_query.query.lower()
    if (
        not text
        or inline_query.from_user.id not in VIP
        and (await amt_donated(inline_query.from_user.id)) < 10
    ):
        await inline_query.answer(
            [
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a classic game",
                    description="/startclassic@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startclassic@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a hard mode game",
                    description="/starthard@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/starthard@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a chaos game",
                    description="/startchaos@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startchaos@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a chosen first letter game",
                    description="/startcfl@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startcfl@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a banned letters game",
                    description="/startbl@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startbl@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a required letter game",
                    description="/startrl@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startrl@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start an elimination game",
                    description="/startelim@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startelim@on9wordchainbot"),
                ),
            ],
            is_personal=not text,
        )
        return
    if any(c not in ascii_lowercase for c in text):
        await inline_query.answer(
            [
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="A query can only consist of alphabets",
                    description="Try a different query",
                    input_message_content=types.InputTextMessageContent(r"¯\\_(ツ)\_/¯"),
                )
            ],
            is_personal=True,
        )
        return
    res = []
    for i in filter_words(starting_letter=text[0]):
        if i.startswith(text):
            i = i.capitalize()
            res.append(
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title=i,
                    input_message_content=types.InputTextMessageContent(i),
                )
            )
            if len(res) == 50:  # Max 50 results
                break
    if not res:  # No results
        res.append(
            types.InlineQueryResultArticle(
                id=str(uuid4()),
                title="No results found",
                description="Try a different query",
                input_message_content=types.InputTextMessageContent(r"¯\\_(ツ)\_/¯"),
            )
        )
    await inline_query.answer(res, is_personal=True)
def train(config, sample_validation_batches):
    source_language = config.get('src_language')
    target_language = config.get('trg_language')
    EOS_token = config.get('EOS_token')
    PAD_token = config.get('PAD_token')
    SOS_token = config.get('SOS_token')
    train_iter = config.get('train_iter')
    val_iter = config.get('val_iter')
    writer_path = config.get('writer_path')
    writer_train_path = get_or_create_dir(writer_path, 'train')
    writer_val_path = get_or_create_dir(writer_path, 'val')
    writer_train = SummaryWriter(log_dir=writer_train_path)
    writer_val = SummaryWriter(log_dir=writer_val_path)
    epochs = config.get('epochs')
    training = config.get('training')
    eval_every = training.get('eval_every')
    sample_every = training.get('sample_every')
    use_attention = config.get('use_attention')
    step = 1
    for epoch in range(epochs):
        print(f'Epoch: {epoch+1}/{epochs}')
        save_weights(config)
        for i, training_batch in enumerate(train_iter):
            loss = train_batch(config, training_batch)
            writer_train.add_scalar('loss', loss, step)

            if step == 1 or step % eval_every == 0:
                val_lengths = 0
                val_losses = 0
                reference_corpus = []
                translation_corpus = []
                for val_batch in val_iter:
                    val_loss, translations = evaluate_batch(config, val_batch)
                    val_lengths += 1
                    val_losses += val_loss
                    val_batch_trg, _ = val_batch.trg
                    _, batch_size = val_batch_trg.shape
                    references = map(
                        lambda i: torch2words(target_language, val_batch_trg[:, i]),
                        range(batch_size))
                    references = map(
                        lambda words: [list(filter_words(words, SOS_token, EOS_token, PAD_token))],
                        references)
                    reference_corpus.extend(references)
                    translations = map(
                        lambda translation: list2words(target_language, translation),
                        translations)
                    translations = map(
                        lambda words: list(filter_words(words, SOS_token, EOS_token, PAD_token)),
                        translations)
                    translation_corpus.extend(translations)
                bleu = compute_bleu(reference_corpus, translation_corpus)
                val_loss = val_losses / val_lengths
                writer_val.add_scalar('bleu', bleu, step)
                writer_val.add_scalar('loss', val_loss, step)

            if step % sample_every == 0:
                val_batch = sample_validation_batches(1)
                val_batch_src, val_lengths_src = val_batch.src
                val_batch_trg, _ = val_batch.trg
                s0 = val_lengths_src[0].item()
                _, translations, attention_weights = evaluate_batch(
                    config, val_batch, True)
                source_words = torch2words(source_language, val_batch_src[:, 0])
                target_words = torch2words(target_language, val_batch_trg[:, 0])
                translation_words = list(
                    filter(lambda word: word != PAD_token,
                           list2words(target_language, translations[0])))
                if use_attention and sum(attention_weights.shape) != 0:
                    attention_figure = visualize_attention(
                        source_words[:s0], translation_words,
                        with_cpu(attention_weights))
                    writer_val.add_figure('attention', attention_figure, step)
                text = get_text(source_words, target_words, translation_words,
                                SOS_token, EOS_token, PAD_token)
                writer_val.add_text('translation', text, step)

            step += 1
    save_weights(config)
def main():
    args = check_argv()
    feat_type = "mfcc"
    list_dir = path.join("lists", args.language)
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    feat_dir = path.join(feat_type, args.language)
    if not path.isdir(feat_dir):
        os.makedirs(feat_dir)

    # All ground truth word segments with pronunciations
    for subset in ["dev"]:  #, "eval", "train"]:

        list_fn = path.join(list_dir, subset + ".all_gt_words.list")
        pronunciations_fn = path.join(list_dir, subset + ".prons")

        # Read forced alignments and obtain pronunciations
        word_fa_fn = path.join(
            gp_alignments_dir, args.language, subset + ".ctm")
        phone_fa_fn = path.join(
            # gp_alignments_dir, args.language, subset + ".phone.ctm"
            gp_alignments_dir, args.language, subset + ".phone.ipa.ctm")
        if not path.isfile(phone_fa_fn):
            print("Warning: IPA pronunciations not found")
            phone_fa_fn = path.join(
                gp_alignments_dir, args.language, subset + ".phone.ctm")
        pronunciations_dict = pronunciations_from_fa(word_fa_fn, phone_fa_fn)

        # Write pronunciation list
        if not path.isfile(pronunciations_fn):
            print("Writing:", pronunciations_fn)
            with codecs.open(pronunciations_fn, "w", "utf-8") as f:
                for segment_key in sorted(pronunciations_dict):
                    f.write(segment_key + " " +
                            ",".join(pronunciations_dict[segment_key]) + "\n")
        else:
            print("Using existing file:", pronunciations_fn)

        # Write word list
        if not path.isfile(list_fn):
            print("Writing:", list_fn)
            with codecs.open(list_fn, "w", "utf-8") as f:
                for segment_key in sorted(pronunciations_dict):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", list_fn)

        # Write individual phone list
        phone_list_fn = path.join(list_dir, subset + ".phone.list")
        if not path.isfile(phone_list_fn):
            utils.filter_words(
                phone_fa_fn, phone_list_fn, min_frames=5, min_chars=0)
        else:
            print("Using existing file:", phone_list_fn)

        # Filter phones
        print("Reading:", phone_list_fn)
        phone_segment_keys = []
        with codecs.open(phone_list_fn, "r", "utf-8") as f:
            for line in f:
                phone_segment_keys.append(line.strip())
        phone_filtered_keys = filter_segment_keys(
            phone_segment_keys, n_max_tokens=5000)
        phone_filtered_list_fn = path.join(
            list_dir, subset + ".filter1_phone.list")
        print("Writing:", phone_filtered_list_fn)
        if not path.isfile(phone_filtered_list_fn):
            with codecs.open(phone_filtered_list_fn, "w", "utf-8") as f:
                for segment_key in sorted(phone_filtered_keys):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", phone_filtered_list_fn)

        # Extract phone segments from the MFCC NumPy archives
        input_npz_fn = path.join(
            "..", "features", feat_type, args.language,
            args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir, args.language.lower() + "." + subset +
            ".filter1_phone.npz")
        if not path.isfile(output_npz_fn):
            utils.segments_from_npz(
                input_npz_fn, phone_filtered_list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

        if args.analyse:
            import matplotlib.pyplot as plt
            import numpy as np

            # Most common words
            labels = [i.split("_")[0] for i in pronunciations_dict]
            counter = Counter(labels)
            print("No. word types:", len(counter))
            print("No. word tokens:", len(labels))
            print("Most common words:", counter.most_common(10))

            # Histogram of word count
            counts = list(counter.values())
            plt.figure()
            plt.hist(counts, 50)
            plt.yscale("log")
            plt.ylabel("No. of types with this many tokens")
            plt.xlabel("No. of tokens")

            # # Temp
            # # Most common words
            # labels = [i.split("_")[0] for i in filtered_keys]
            # counter = Counter(labels)
            # print("No. word types:", len(counter))
            # print("No. word tokens:", len(labels))
            # print("Most common words:", counter.most_common(10))
            # # Histogram of word count
            # counts = counter.values()
            # plt.figure()
            # plt.hist(counts, 50)
            # plt.yscale("log")
            # plt.ylabel("No. of types with this many tokens")
            # plt.xlabel("No. of tokens")

            plt.show()

        # Filter 1
        print("Applying filter 1")
        n_min_tokens_per_type = 10
        n_max_tokens_per_type = 25
        filtered_keys = filter_segment_keys(
            list(pronunciations_dict), n_min_tokens_per_type,
            n_max_tokens_per_type)
        print("No. tokens:", len(filtered_keys))
        print("No. types:", len(set([i.split("_")[0] for i in filtered_keys])))
        filtered_list_fn = path.join(list_dir, subset + ".filter1_gt.list")
        print("Writing:", filtered_list_fn)
        if not path.isfile(filtered_list_fn):
            with codecs.open(filtered_list_fn, "w", "utf-8") as f:
                for segment_key in sorted(filtered_keys):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", filtered_list_fn)

        # Extract word segments from the MFCC NumPy archives
        input_npz_fn = path.join(
            "..", "features", feat_type, args.language,
            args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir, args.language.lower() + "." + subset + ".filter1_gt.npz")
        if not path.isfile(output_npz_fn):
            utils.segments_from_npz(
                input_npz_fn, filtered_list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)
import utils
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import warnings

warnings.filterwarnings("ignore")

# Load the trained Word2Vec model
# (model_file, processed_data_file, train_num and w2v_dim are assumed to be
# defined in the surrounding configuration)
model = Word2Vec.load(model_file)

# Read the data
df_all = pd.read_csv(processed_data_file, encoding='utf-8', nrows=train_num)
contents = list(df_all['Query_List'].values)
words = [[word for word in content.split(' ')] for content in contents]
words_list = utils.filter_words(words)

print('Building word2vec features...')
w2v_feat = np.zeros((len(words_list), w2v_dim))      # w2v_dim = 300, the word-vector dimensionality
w2v_feat_avg = np.zeros((len(words_list), w2v_dim))
i = 0
for words in words_list:
    num = 0
    for word in words:
        vec = model[word]          # model.wv[word] in gensim >= 4.0
        w2v_feat[i, :] += vec      # sum the vectors of all words in the same document
        num += 1
    w2v_feat_avg[i, :] = w2v_feat[i, :] / num  # average of the document's word vectors
    i += 1
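# --- Illustrative follow-on sketch (not part of the original script): the
# averaged document vectors can be fed straight into a scikit-learn classifier.
# The target column name 'label' below is a placeholder; the original data's
# label column is not shown above.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

y = df_all['label'].values                      # hypothetical target column
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, w2v_feat_avg, y, cv=5, scoring='f1_macro')
print('5-fold macro-F1: {:.4f}'.format(scores.mean()))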
def main():
    args = check_argv()
    feat_type = "mfcc"

    # RAW FEATURES

    # Extract MFCCs for the different sets
    feat_dir = path.join(feat_type, args.language)
    if not path.isdir(feat_dir):
        os.makedirs(feat_dir)
    for subset in ["dev", "eval", "train"]:
        raw_feat_fn = path.join(
            feat_dir, args.language.lower() + "." + subset + ".npz")
        if not path.isfile(raw_feat_fn):
            print("Extracting MFCCs:", subset)
            extract_features_for_subset(
                args.language, subset, feat_type, raw_feat_fn)
        else:
            print("Using existing file:", raw_feat_fn)
    # assert False

    # GROUND TRUTH WORD SEGMENTS

    list_dir = path.join("lists", args.language)
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    for subset in ["dev", "eval", "train"]:

        # Create a ground truth word list (at least 50 frames and 5 characters)
        fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm")
        list_fn = path.join(list_dir, subset + ".gt_words.list")
        if not path.isfile(list_fn):
            if args.language == "KO":
                min_frames = 26
                min_chars = 3
            elif args.language == "TH":
                min_frames = 38
                min_chars = 2
            elif args.language == "VN":
                min_frames = 30
                min_chars = 4
            else:
                min_frames = 50
                min_chars = 5
            utils.filter_words(
                fa_fn, list_fn, min_frames=min_frames, min_chars=min_chars)
        else:
            print("Using existing file:", list_fn)

        # Extract word segments from the MFCC NumPy archives
        input_npz_fn = path.join(
            feat_dir, args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir, args.language.lower() + "." + subset + ".gt_words.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for ground truth word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # UTD-DISCOVERED WORD SEGMENTS

    # Change Enno Hermann's pair file to the appropriate format
    enno_pairs_fn = path.join(
        "..", "data", args.language,
        # "pairs_sw_utd.train"
        "pairs_sw_utd_plp_vtln.train")
    if not path.isfile(enno_pairs_fn):
        # This might not be an evaluation language
        return
    pairs_fn = path.join("lists", args.language, "train.utd_pairs.list")
    if not path.isfile(pairs_fn):
        utils.format_enno_pairs(enno_pairs_fn, pairs_fn)
    else:
        print("Using existing file:", pairs_fn)
    list_fn = path.join("lists", args.language, "train.utd_terms.list")
    if not path.isfile(list_fn):
        print("Reading:", pairs_fn)
        terms = set()
        with codecs.open(pairs_fn, "r", "utf-8") as pairs_f:
            for line in pairs_f:
                term1, term2 = line.strip().split(" ")
                terms.add(term1)
                terms.add(term2)
        print("Writing:", list_fn)
        with codecs.open(list_fn, "w", "utf-8") as list_f:
            for term in sorted(terms):
                list_f.write(term + "\n")
    else:
        print("Using existing file:", list_fn)

    # Extract UTD segments
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir, args.language.lower() + ".train.utd_terms.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # UTD SEGMENTS THAT HAVE BEEN PARTIALLY FIXED

    # Write list with fixed labels and segments
    fixed_labels_list_fn = path.join(
        "lists", args.language, "train.utd_terms.fixed_labels.list")
    fixed_segs_list_fn = path.join(
        "lists", args.language, "train.utd_terms.fixed_segs.list")
    fixed_labels_segs_list_fn = path.join(
        "lists", args.language, "train.utd_terms.fixed_labels_segs.list")
    if (not path.isfile(fixed_labels_list_fn)
            or not path.isfile(fixed_labels_segs_list_fn)
            or not path.isfile(fixed_segs_list_fn)):

        # Read UTD terms
        utd_list_fn = path.join(
            "lists", args.language, "train.utd_terms.list")
        print("Reading:", utd_list_fn)
        # overlap_dict[speaker_utt][(start, end)] is a tuple of
        # (label, (start, end), overlap, cluster_label)
        overlap_dict = {}
        with codecs.open(utd_list_fn, "r", "utf-8") as utd_list_f:
            for line in utd_list_f:
                term, speaker, utt, start_end = line.strip().split("_")
                start, end = start_end.split("-")
                start = int(start)
                end = int(end)
                if not speaker + "_" + utt in overlap_dict:
                    overlap_dict[speaker + "_" + utt] = {}
                overlap_dict[speaker + "_" + utt][(start, end)] = (
                    "label", (0, 0), 0, term)

        # Read forced alignments
        # NOTE: subset is still "train" here, left over from the loop above
        fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm")
        print("Reading:", fa_fn)
        fa_dict = {}
        with codecs.open(fa_fn, "r", "utf-8") as fa_f:
            for line in fa_f:
                utt_key, _, start, duration, label = line.strip().split()
                start = float(start)
                duration = float(duration)
                end = start + duration
                start_frame = int(round(start * 100))
                end_frame = int(round(end * 100))
                if (label != "<unk>" and label != "sil" and label != "?"
                        and label != "spn"):
                    if not utt_key in fa_dict:
                        fa_dict[utt_key] = {}
                    fa_dict[utt_key][start_frame, end_frame] = label

        # Find ground truth terms with maximal overlap
        print("Getting ground truth terms with maximal overlap:")
        for utt_key in tqdm(fa_dict):
            # print(utt_key)
            if utt_key not in overlap_dict:
                continue
            for (fa_start, fa_end) in fa_dict[utt_key]:
                for (utd_start, utd_end) in overlap_dict[utt_key]:
                    overlap = get_overlap(utd_start, utd_end, fa_start, fa_end)
                    if overlap == 0:
                        continue
                    if (overlap >
                            overlap_dict[utt_key][(utd_start, utd_end)][2]):
                        overlap_dict[utt_key][(utd_start, utd_end)] = (
                            fa_dict[utt_key][(fa_start, fa_end)],
                            (fa_start, fa_end), overlap,
                            overlap_dict[utt_key][(utd_start, utd_end)][3])

        # Write list with fixed labels
        if not path.isfile(fixed_labels_list_fn):
            print("Writing:", fixed_labels_list_fn)
            with codecs.open(fixed_labels_list_fn, "w", "utf-8") as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][0]
                        overlap = (
                            overlap_dict[utt_key][(utd_start, utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, utd_start, utd_end))
        else:
            print("Using existing file:", fixed_labels_list_fn)

        # Write list with fixed labels and segment intervals
        if not path.isfile(fixed_labels_segs_list_fn):
            print("Writing:", fixed_labels_segs_list_fn)
            with (codecs.open(
                    fixed_labels_segs_list_fn, "w", "utf-8")) as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][0]
                        fa_start, fa_end = (overlap_dict[utt_key][(
                            utd_start, utd_end)][1])
                        overlap = (
                            overlap_dict[utt_key][(utd_start, utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, fa_start, fa_end))
        else:
            print("Using existing file:", fixed_labels_segs_list_fn)

        # Write list with fixed segment intervals
        if not path.isfile(fixed_segs_list_fn):
            print("Writing:", fixed_segs_list_fn)
            with (codecs.open(fixed_segs_list_fn, "w", "utf-8")) as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][3]
                        fa_start, fa_end = (overlap_dict[utt_key][(
                            utd_start, utd_end)][1])
                        overlap = (
                            overlap_dict[utt_key][(utd_start, utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, fa_start, fa_end))
        else:
            print("Using existing file:", fixed_segs_list_fn)
    else:
        print("Using existing file:", fixed_labels_list_fn)
        print("Using existing file:", fixed_segs_list_fn)
        print("Using existing file:", fixed_labels_segs_list_fn)

    # Extract UTD with fixed labels
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir, args.language.lower() + ".train.utd_terms.fixed_labels.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels")
        utils.segments_from_npz(
            input_npz_fn, fixed_labels_list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract UTD with fixed segment intervals
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir, args.language.lower() + ".train.utd_terms.fixed_segs.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed segment intervals")
        utils.segments_from_npz(
            input_npz_fn, fixed_segs_list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract UTD with fixed labels and segment intervals
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir,
        args.language.lower() + ".train.utd_terms.fixed_labels_segs.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels and segment "
              "intervals")
        utils.segments_from_npz(
            input_npz_fn, fixed_labels_segs_list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)