def get_batch_gen(maxlen1, maxlen2, vocab_fpath, batch_size, shuffle=False):
    """Build training / evaluation mini-batches from one hard-coded sample.

    maxlen1 / maxlen2: accepted for signature compatibility; not used here.
    vocab_fpath: vocabulary file path (string).
    batch_size: scalar.
    shuffle: boolean.

    Returns:
        batches: dataset of mini-batches
        num_batches: number of mini-batches
        num_samples: number of samples (always 1 here)
    """
    # A single tokenized code snippet serves as both source and target.
    original = """ ▁function ( b ) ▁{ ▁var ▁c ▁= ▁b . browser . supports Class List (), ▁a ▁= ▁b . dom ; ▁a . addClass ▁= ▁function ( b , ▁e ) ▁{ ▁if ▁( c ) ▁return ▁b . classList . add ( e ); ▁a . hasClass ( b , ▁e ) ▁|| ▁( b . className ▁+= ▁" ▁" ▁+ ▁e ); ▁}; ▁a . removeClass ▁= ▁function ( a , ▁b ) ▁{ ▁if ▁( c ) ▁return ▁a . classList . remove ( b ); ▁a . className ▁= ▁a . className . replace ( """
    sample = original.strip()
    sents1 = [sample]
    sents2 = [sample]
    # Wrap the sentence lists into a tf.data pipeline.
    batches = input_fn(sents1, sents2, vocab_fpath, batch_size, shuffle=shuffle)
    # Derive the batch count from the sample count and batch size.
    num_batches = calc_num_batches(len(sents1), batch_size)
    return batches, num_batches, len(sents1)
def get_batch(fpath1, fpath2, maxlen1, maxlen2, vocab_fpath, batch_size, shuffle=False):
    """Produce training / evaluation mini-batches.

    fpath1: source file path. string.
    fpath2: target file path. string.
    maxlen1: source sentence maximum length. scalar.
    maxlen2: target sentence maximum length. scalar.
    vocab_fpath: vocabulary file path. string.
    batch_size: scalar.
    shuffle: boolean.

    Returns:
        batches: dataset of mini-batches
        num_batches: number of mini-batches
        num_samples: total number of samples
    """
    src_sents, tgt_sents = load_data(fpath1, fpath2, maxlen1, maxlen2)
    dataset = input_fn(src_sents, tgt_sents, vocab_fpath, batch_size, shuffle=shuffle)
    n_batches = calc_num_batches(len(src_sents), batch_size)
    return dataset, n_batches, len(src_sents)
def get_batch(fpath, maxlen1, maxlen2, vocab_fpath, batch_size, gpu_nums, shuffle=False):
    """Produce training / evaluation mini-batches for a QA-style task.

    fpath: data file path. string.
    maxlen1 / maxlen2: maximum question / evidence lengths. scalars.
    vocab_fpath: vocabulary file path. string.
    batch_size: per-GPU batch size. scalar.
    gpu_nums: number of GPUs; the effective batch is batch_size * gpu_nums.
    shuffle: boolean.

    Returns:
        batches: dataset of mini-batches
        num_batches: number of mini-batches
        num_samples: total number of questions
    """
    questions, evidences, labels = _load_data(fpath, maxlen1, maxlen2)
    dataset = _input_fn(questions, evidences, labels, vocab_fpath,
                        batch_size, gpu_nums, maxlen1, maxlen2, shuffle=shuffle)
    # Batch count is computed against the global (multi-GPU) batch size.
    n_batches = calc_num_batches(len(questions), batch_size * gpu_nums)
    return dataset, n_batches, len(questions)
def get_batch(fpath1, fpath2, maxlen1, maxlen2, vocab_fpath, batch_size, shuffle=False):
    """Fetch training / evaluation mini-batches.

    fpath1: source file path. string.
    fpath2: target file path. string.
    maxlen1: source sentence maximum length. scalar.
    maxlen2: target sentence maximum length. scalar.
    vocab_fpath: vocabulary file path. string.
    batch_size: scalar.
    shuffle: boolean.

    Returns:
        batches: dataset generator object produced by input_fn()
        num_batches: number of batches implied by the sample count and batch_size
        num_samples: total number of samples
    """
    src_sents, tgt_sents = load_data(fpath1, fpath2, maxlen1, maxlen2)
    # input_fn() wraps the sentence lists into a dataset generator.
    dataset = input_fn(src_sents, tgt_sents, vocab_fpath, batch_size, shuffle=shuffle)
    n_batches = calc_num_batches(len(src_sents), batch_size)
    return dataset, n_batches, len(src_sents)
def get_batch(csv_path, batch_size, vocabs=vocabs, shuffle=True):
    """Read (epitope, cdr3) pairs from a CSV file and batch them.

    csv_path: path to a CSV with `epitope` and `cdr3` columns.
    batch_size: scalar.
    vocabs: vocabulary object (defaults to the module-level `vocabs`).
    shuffle: boolean.

    Returns: (batches, num_batches, num_samples).
    """
    frame = pd.read_csv(csv_path)
    # Coerce both columns to plain Python strings.
    epitopes = [str(e) for e in frame.epitope]
    cdr3s = [str(c) for c in frame.cdr3]
    dataset = input_fn(epitopes, cdr3s, vocabs, batch_size, shuffle=shuffle)
    n_batches = calc_num_batches(len(epitopes), batch_size)
    return dataset, n_batches, len(epitopes)
def get_batch_for_inference(epitopes, batch_size, vocabs=vocabs, shuffle=True):
    """Batch epitopes for inference.

    epitopes: list of str. The same list is fed as both sides of the
    pipeline since no target sequences exist at inference time.
    batch_size: scalar.
    vocabs: vocabulary object (defaults to the module-level `vocabs`).
    shuffle: boolean.

    Returns: (batches, num_batches, num_samples).
    """
    dataset = input_fn(epitopes, epitopes, vocabs, batch_size, shuffle=shuffle)
    n_batches = calc_num_batches(len(epitopes), batch_size)
    return dataset, n_batches, len(epitopes)
def get_batch(fpath, task_type, input_indices, vocabs, context, batch_size, shuffle=False):
    """Load instances from `fpath` and batch them (more standardized, recommended).

    fpath: data file path.
    task_type: task identifier passed through to input_fn.
    input_indices: indices selecting the input fields.
    vocabs: vocabulary object(s).
    context: extra context passed through to input_fn.
    batch_size: scalar.
    shuffle: boolean.

    Returns: (batches, num_batches, num_samples).
    """
    instances = load_data(fpath)
    dataset = input_fn(instances, task_type, input_indices, vocabs,
                       context, batch_size, shuffle)
    n_batches = calc_num_batches(len(instances), batch_size)
    return dataset, n_batches, len(instances)
def get_batch(fpath1, fpath2, maxlen1, maxlen2, vocab_fpath, batch_size, shuffle=False):
    """Load parallel data from `fpath1`/`fpath2` and return mini-batches.

    Returns: (batches, num_batches, num_samples).
    """
    src_sents, tgt_sents = load_data(fpath1, fpath2, maxlen1, maxlen2)
    dataset = input_fn(src_sents, tgt_sents, vocab_fpath, batch_size, shuffle=shuffle)
    return dataset, calc_num_batches(len(src_sents), batch_size), len(src_sents)
def get_batch_sim(fpath, maxlen1, maxlen2, vocab_fpath, batch_size, shuffle=False):
    """Build mini-batches for a DSSM-style similarity task.

    fpath: data file path containing sentence pairs with similarity scores.
    maxlen1 / maxlen2: maximum sentence lengths. scalars.
    vocab_fpath: vocabulary file path. string.
    batch_size: scalar.
    shuffle: boolean.

    Returns: (batches, num_batches, num_samples).
    """
    left_sents, right_sents, scores = load_data2(fpath, maxlen1, maxlen2)
    dataset = input_fn_sim(left_sents, right_sents, scores, vocab_fpath,
                           batch_size, shuffle=shuffle)
    n_batches = calc_num_batches(len(left_sents), batch_size)
    return dataset, n_batches, len(left_sents)
def get_batch_sup(features, batch_size, shuffle=True):
    """Yield supervised mini-batches of (inputs, lengths, labels).

    features: tuple of three array-likes (inputs_a, a_lens, related_labels)
        that support numpy fancy indexing and slicing.
    batch_size: scalar.
    shuffle: boolean; when True the three arrays are permuted in lockstep.

    Yields: (inputs_a, a_lens, related_labels) slices per batch; the last
    batch may be smaller than batch_size.
    """
    inputs_a, a_lens, related_labels = features
    total = len(inputs_a)
    n_batches = calc_num_batches(total, batch_size)
    if shuffle:
        # One shared permutation keeps samples, lengths, and labels aligned.
        perm = np.random.permutation(np.arange(total))
        inputs_a = inputs_a[perm]
        a_lens = a_lens[perm]
        related_labels = related_labels[perm]
    for b in range(n_batches):
        lo = b * batch_size
        hi = min(lo + batch_size, total)
        yield (inputs_a[lo:hi], a_lens[lo:hi], related_labels[lo:hi])
def get_batch_unsup(features, batch_size, shuffle=True):
    """Yield unsupervised mini-batches of (original, augmented) inputs.

    features: tuple of four array-likes (ori_input, ori_lens, aug_input,
        aug_lens) that support numpy fancy indexing and slicing.
    batch_size: scalar.
    shuffle: boolean; when True all four arrays are permuted in lockstep.

    Yields: (ori_input, ori_lens, aug_input, aug_lens) slices per batch;
    the last batch may be smaller than batch_size.
    """
    ori_input, ori_lens, aug_input, aug_lens = features
    total = len(ori_input)
    n_batches = calc_num_batches(total, batch_size)
    if shuffle:
        # One shared permutation keeps originals and augmentations paired.
        perm = np.random.permutation(np.arange(total))
        ori_input = ori_input[perm]
        ori_lens = ori_lens[perm]
        aug_input = aug_input[perm]
        aug_lens = aug_lens[perm]
    for b in range(n_batches):
        lo = b * batch_size
        hi = min(lo + batch_size, total)
        yield (ori_input[lo:hi], ori_lens[lo:hi],
               aug_input[lo:hi], aug_lens[lo:hi])
def get_batch(fpath1, fpath2, maxlen1, maxlen2, vocab_fpath, paraphrased_fpath,
              batch_size, shuffle=False, paraphrase_type=0):
    """Build mini-batches with paraphrased pairs alongside the parallel data.

    fpath1 / fpath2: source / target file paths. strings.
    maxlen1 / maxlen2: maximum source / target sentence lengths. scalars.
    vocab_fpath: vocabulary file path. string.
    paraphrased_fpath: path to the paraphrased-pairs file. string.
    batch_size: scalar.
    shuffle: boolean.
    paraphrase_type: integer flag forwarded to input_fn.

    Returns: (batches, num_batches, num_samples).
    """
    src_sents, tgt_sents, paraphrased_pairs = load_data(
        fpath1, fpath2, paraphrased_fpath, maxlen1, maxlen2)
    dataset = input_fn(src_sents, tgt_sents, paraphrased_pairs, vocab_fpath,
                       batch_size, shuffle=shuffle, paraphrase_type=paraphrase_type)
    n_batches = calc_num_batches(len(src_sents), batch_size)
    return dataset, n_batches, len(src_sents)