def _log(self, msg: str):
    """Method to log detailed information of the actor's actions."""
    log(env=self.env, actor_name=self.__class__.__name__,
        condition=self.condition, msg=msg)
def main():
    device = torch.device(f'cuda:{args.gpu}')
    data_list = load_persona_chat()
    tokenizer = PolyEncoderTokenizer.from_pretrained()
    if args.poly:
        model = PretrainedPolyEncoder.from_pretrained()
    else:
        model = PretrainedBiEncoder.from_pretrained()
    history_size = model.opt['history_size']
    text_truncate = model.opt['text_truncate']
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    log(f'Loading LM model from {args.lm_model_dir}')
    lm_model = PolyEncoderLM.from_pretrained(checkpoint=args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False

    eval_lm_model = SentenceScorer(device)
    if args.fp16:
        from apex import amp
        model, lm_model = amp.initialize([model, lm_model])

    for data in data_list:
        query = '\n'.join(data[0] + data[1][-history_size:])
        candidates = data[2]
        truth = candidates[-1]
        query_ids = torch.tensor(
            tokenizer.encode(query, max_length=text_truncate),
            device=device).unsqueeze(0)
        candidates = torch.tensor(
            tokenizer.batch_encode_plus(
                candidates, pad_to_max_length=True)['input_ids'],
            device=device)
        output = model.forward(ctxt_input_ids=query_ids,
                               cand_input_ids=candidates)
        scores = output[0].squeeze()
        if args.nature:
            collision, score = gen_natural_collision(
                query, truth, model, tokenizer, device,
                lm_model, eval_lm_model, scores.max())
        else:
            collision, score = gen_aggressive_collision(
                query, scores.max(), model, tokenizer, device, lm_model)

        # get the rank of the collision among the true candidates
        scores = scores.cpu().tolist()
        scores = np.asarray([score] + scores)
        n = len(scores)
        ranks = np.empty(n)
        ranks[np.argsort(-scores)] = np.arange(n)
        lm_perp = eval_lm_model.perplexity(collision)
        msg = f'Input={query}\n' \
              f'Ground truth response={truth}\n' \
              f'Collision={collision}\n' \
              f'Collision similarity score={score}\n' \
              f'Rank={ranks[0]}\n' \
              f'LM perp={lm_perp.item()}\n'
        log(msg)
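# The rank computation above, in isolation: the collision score is prepended
# at index 0 and descending ranks are assigned via argsort. A toy standalone
# example (values are illustrative):
import numpy as np

scores = np.asarray([0.9, 0.2, 0.95, 0.5])    # index 0 is the collision
ranks = np.empty(len(scores))
ranks[np.argsort(-scores)] = np.arange(len(scores))
print(ranks[0])                                # 1.0 -> collision ranks second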
def writeCSV(self, df, file):
    """Write a CSV file.

    Args:
        df: DataFrame.
        file: Name of the CSV file.
    """
    log("INFO", "[writeCSV] Saving the CSV.")
    df.repartition(1).write.csv(file, mode='overwrite')
def wordCount(self, wordListDF):
    """Create a DataFrame with the word counts.

    Args:
        wordListDF: DataFrame with a column named 'word'.

    Returns:
        DataFrame: DataFrame containing 'word' and 'count'.
    """
    log("INFO", "[wordCount] Counting the words.")
    return wordListDF.groupBy('word').count()
def read_file_and_store_on_datalake(self, spark_session, output_base_dir):
    """
    Read a traditional JSON ([{"a":1, "b":2}, {...}]) file and store it
    on the data lake as partitioned Parquet.

    Parameters
    ----------
    spark_session: SparkSession
        The SparkSession of the application
    output_base_dir: str
        The data lake location
    """
    input_filename = self._args[2]
    file = self._input_data_dir + input_filename
    output_data_name = self._args[3]
    output_path = output_base_dir + "/" + output_data_name
    schema_json = self._args[4]
    date_ops = json.loads(self._args[5]) if len(self._args) == 6 else None

    # Parse the schema JSON into a Spark schema
    schema = get_schema(schema_json)
    log(spark_session).info("Schema: " + str(schema))

    try:
        df = spark_session.read.schema(schema).json(file)
        log(spark_session).info("Number of rows: " + str(df.count()))
        df = convert_date_using_data_ops_schema(df, date_ops)
        df = define_date_columns_in_df(df, date_ops)
        df = change_whitespace_on_columns_by_underscore(df)
        try:
            df \
                .write \
                .format("parquet") \
                .partitionBy("year", "month", "day") \
                .mode("append") \
                .save(output_path)
        except Exception as e:
            log(spark_session).error("Error on writing to the data lake... ")
            log(spark_session).error(e)
    except Exception as e:
        log(spark_session).error(e)
def readCSV(self, file):
    """Read a CSV file.

    Args:
        file: Name of the CSV file.

    Returns:
        DataFrame: Contents of the CSV.
    """
    log("INFO", f"[readCSV] Reading CSV file: {file}.")
    dfEnd = self.spark.read.csv(file, header=True)
    return dfEnd
def joinDataFrames(self, df, newRow):
    """Union two DataFrames, appending the new rows.

    Args:
        df: DataFrame.
        newRow: DataFrame with the rows to append.

    Returns:
        DataFrame: The combined DataFrame.
    """
    log("INFO", "[joinDataFrames] Unioning the DataFrames.")
    dfEnd = df.union(newRow)
    return dfEnd
def removePunctuation(self, column):
    """Remove punctuation from a DataFrame column.

    Args:
        column: Column from which to remove punctuation.

    Returns:
        Records without punctuation.
    """
    log("INFO", f"[removePunctuation] Removing punctuation from column: {column}")
    return trim(lower(regexp_replace(column, r'[^\sa-zA-Z0-9]', ''))).alias('value')
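# Quick sanity check for removePunctuation (a minimal sketch; assumes an
# active SparkSession named `spark` and this class instantiated as `wc`):
#
#   df = spark.createDataFrame([("Hello, World!",)], ["value"])
#   df.select(wc.removePunctuation(col("value"))).show()
#   # +-----------+
#   # |      value|
#   # +-----------+
#   # |hello world|
#   # +-----------+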
def writeCSV(self, df, file):
    """Write a CSV file.

    Args:
        df: DataFrame.
        file: Name of the CSV file.
    """
    log("INFO", "[writeCSV] Saving the CSV.")
    dfResultToCSV = df.withColumn("lista_pedidos",
                                  df['lista_pedidos'].cast("string"))
    dfResultToCSV.repartition(1).write.csv(file, mode='overwrite', header=True)
def readCSV(self, file):
    """Read a CSV file as plain text and strip punctuation.

    Args:
        file: Name of the CSV file.

    Returns:
        DataFrame: Contents of the CSV.
    """
    log("INFO", f"[readCSV] Reading CSV file: {file}")
    dfEnd = self.spark.read.text(file).select(
        self.removePunctuation(col('value')))
    return dfEnd
def main():
    device = torch.device(f'cuda:{args.gpu}')
    target_q_doc, query_scores, bm25_q_doc, best_query_sent, queries = \
        prepare_data_and_scores(args.model_name, args.data_name)
    model_path = os.path.join(args.model_dir, args.model_name)
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    model = BertForConcatNextSentencePrediction.from_pretrained(model_path)
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    log(f'Loading LM model from {args.lm_model_dir}')
    lm_model = BertForLM.from_pretrained(args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False

    eval_lm_model = SentenceScorer(device)
    if args.fp16:
        from apex import amp
        model, lm_model = amp.initialize([model, lm_model])

    for qid in queries:
        query = queries[qid]
        best = best_query_sent[qid]
        best_score = best[0]
        best_sent = ' '.join(best[1:])
        old_scores = query_scores[qid][::-1]
        if args.nature:
            collision, new_score, collision_cands = gen_natural_collision(
                query, best_sent, model, tokenizer, device,
                lm_model, best_score, eval_lm_model)
        else:
            collision, new_score, collision_cands = gen_aggressive_collision(
                query, best_sent, model, tokenizer, device,
                best_score, lm_model)

        lm_perp = eval_lm_model.perplexity(collision)
        msg = f'Query={query}\n' \
              f'Best true sentences={best_sent}\n' \
              f'Best similarity score={best_score}\n' \
              f'Collision={collision}\n' \
              f'Similarity score={new_score}\n' \
              f'LM perp={lm_perp.item()}\n'
        log(msg)

        if args.verbose:
            log('---Rank shifts for less relevant documents---')
            weighted_new_score = sum(BIRCH_ALPHAS) * new_score
            for did in bm25_q_doc[qid]:
                new_score = bm25_q_doc[qid][did] * BIRCH_GAMMA + \
                    weighted_new_score * (1 - BIRCH_GAMMA)
                old_rank, old_score = target_q_doc[qid][did]
                new_rank = 1000 - bisect.bisect_left(old_scores, new_score)
                log(f'Query id={qid}, Doc id={did}, '
                    f'old score={old_score:.2f}, new score={new_score:.2f}, '
                    f'old rank={old_rank}, new rank={new_rank}')
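# The verbose branch above interpolates each document's BM25 score with the
# (alpha-weighted) collision sentence score. A minimal standalone sketch of
# that arithmetic; the weights below stand in for BIRCH_ALPHAS/BIRCH_GAMMA
# and are illustrative, not the values used in the run:
import bisect

birch_alphas = [1.0, 0.5, 0.1]   # per-sentence interpolation weights (assumed)
birch_gamma = 0.6                # BM25 vs. BERT trade-off (assumed)

bm25_score = 11.3                # hypothetical BM25 score of one document
collision_score = 0.97           # similarity score of the collision sentence
old_scores = sorted([9.8, 10.4, 11.0, 12.1])  # ascending top-1000 scores

new_score = bm25_score * birch_gamma + \
    sum(birch_alphas) * collision_score * (1 - birch_gamma)
new_rank = 1000 - bisect.bisect_left(old_scores, new_score)
print(new_score, new_rank)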
def countOrders(self, df):
    """Compute the number of orders per customer.

    Args:
        df: DataFrame.

    Returns:
        DataFrame.
    """
    log("INFO", "[countOrders] Counting the number of orders per customer.")
    dfEnd = (df.groupBy(col('codigo_cliente'))
             .agg(count('data_pedido').alias('numero_pedidos'))
             .orderBy("numero_pedidos", ascending=False))
    return dfEnd
def perturb_logits(
        unpert_logits,
        stepsize=0.01,
        target_model_wrapper=None,
        num_iterations=3,
        kl_scale=0.01,
        temperature=1.0,
        device="cuda",
        verbose=False,
        logit_mask=0.,
):
    # initialize the perturbation accumulator
    grad_accumulator = np.zeros(unpert_logits.shape, dtype=np.float32)
    perturbation = to_var(grad_accumulator, device=device)
    optimizer = torch.optim.Adam([perturbation], lr=stepsize)

    # accumulate perturbations for num_iterations
    for i in range(num_iterations):
        optimizer.zero_grad()

        # compute token probabilities from the perturbed logits
        logits = unpert_logits * temperature + perturbation + logit_mask
        probs = torch.softmax(logits / temperature, -1)
        unpert_probs = torch.softmax(unpert_logits, -1)

        loss = torch.scalar_tensor(0.0).to(device)
        loss_list = []
        if target_model_wrapper is not None:
            # loss from the target (attacked) model
            discrim_loss = target_model_wrapper(probs)
            if verbose and i % 2 == 0:
                log(f"Iteration {i + 1}, pplm_discrim_loss: "
                    f"{discrim_loss.data.cpu().numpy()}")
            loss += discrim_loss
            loss_list.append(discrim_loss)

        if kl_scale > 0.0:
            # KL term keeping the perturbed distribution close to the LM's
            unpert_probs = unpert_probs + SMALL_CONST * (
                unpert_probs <= SMALL_CONST).float().to(device).detach()
            correction = SMALL_CONST * (
                probs <= SMALL_CONST).float().to(device).detach()
            corrected_probs = probs + correction.detach()
            kl_loss = kl_scale * (
                corrected_probs * (corrected_probs / unpert_probs).log()).sum()
            loss += kl_loss

        # compute gradients and update the perturbation
        loss.backward()
        optimizer.step()

    # apply the accumulated perturbation to the logits
    pert_logits = unpert_logits * temperature + perturbation
    return pert_logits
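# Minimal usage sketch for perturb_logits (hedged: assumes SMALL_CONST,
# to_var and log are in scope as in this module; the toy wrapper below is
# hypothetical, standing in for the classifier_loss closures used later):
#
#   logits = lm_model(input_ids)[0][:, -1, :]          # next-token logits
#
#   def toy_target(probs):
#       # e.g. pull probability mass onto one hypothetical target token id
#       return -torch.log(probs[:, 2000] + 1e-12).mean()
#
#   pert = perturb_logits(logits, stepsize=0.05,
#                         target_model_wrapper=toy_target,
#                         num_iterations=3, kl_scale=0.01)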
def createDataFrameWords(self, df):
    """Create a DataFrame with one word per row.

    Args:
        df: DataFrame.

    Returns:
        DataFrame: DataFrame containing the words.
    """
    log("INFO",
        "[createDataFrameWords] Creating a DataFrame with one word per row.")
    dfWords = (df.select(explode(split(df.value, ' ')).alias('word'))
               .where(col('word') != ''))
    return dfWords
def read_data_from_socket_and_store_on_datalake(self, spark):
    """
    Read JSON records from a socket stream and store them on the data lake
    as Parquet.

    Parameters
    ----------
    spark: SparkSession
        The SparkSession of the application
    """
    streaming_name = self._args[1]
    schema_json = self._args[2]
    date_ops = json.loads(self._args[3]) if len(self._args) == 4 else None
    output_dir = get_path_by_day(self._output_base_dir, streaming_name)

    # Parse the schema JSON into a Spark schema
    schema = get_schema(schema_json)
    log(spark).info("Schema: " + str(schema))

    try:
        df = spark \
            .readStream \
            .format("socket") \
            .option("host", "localhost") \
            .option("port", 9999) \
            .load()
        df = df \
            .select(from_json("value", schema).alias("json_data")) \
            .selectExpr("json_data.*")
        df.printSchema()
        if date_ops:
            df = convert_date_using_data_ops_schema(df, date_ops)
        try:
            # NOTE: Spark's file sinks also require a checkpoint location
            # (the "checkpointLocation" option or the
            # spark.sql.streaming.checkpointLocation conf).
            df.writeStream \
                .format("parquet") \
                .option("path", output_dir) \
                .trigger(processingTime=self._streaming_output_interval) \
                .start() \
                .awaitTermination()
        except Exception as e:
            log(spark).error("Error on writing to the data lake... ")
            log(spark).error(e)
    except Exception as e:
        log(spark).error(e)
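# Local test harness (a hedged sketch): the socket source above connects to
# localhost:9999, so something must be serving newline-delimited JSON there
# (the classic alternative is `nc -lk 9999`). A small standalone feeder:
import json
import socket
import time

server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(("localhost", 9999))
server.listen(1)
conn, _ = server.accept()            # Spark connects as the client
for i in range(10):
    record = {"a": i, "b": i * 2}    # fields must match the schema JSON
    conn.sendall((json.dumps(record) + "\n").encode("utf-8"))
    time.sleep(1)
conn.close()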
def filterAgeUnderThirty(self, df):
    """Filter customers under 30 years old who bought on Black Friday.

    Args:
        df: DataFrame.

    Returns:
        DataFrame.
    """
    log("INFO",
        "[filterAgeUnderThirty] Filtering customers younger than 30.")
    dfStaging = df.dropDuplicates(['codigo_cliente'])
    dfStaging = dfStaging.filter(df.idade < 30)
    dfEnd = dfStaging.select("codigo_cliente", "idade")
    return dfEnd
def joinDataframes(self, dfOrders, dfCount, dfOrdersList, dfAge):
    """Join the four DataFrames on the customer code.

    Args:
        dfOrders: DataFrame with the filtered orders.
        dfCount: DataFrame with the order counts.
        dfOrdersList: DataFrame with the order lists.
        dfAge: DataFrame with the customer ages.

    Returns:
        DataFrame: The complete DataFrame.
    """
    log("INFO", "[joinDataframes] Joining the DataFrames.")
    innerJoinAgeOrders = dfOrders.join(dfCount, ["codigo_cliente"], "inner")
    innerJoinStaging = innerJoinAgeOrders.join(dfOrdersList,
                                               ["codigo_cliente"], "inner")
    innerJoinEnd = innerJoinStaging.join(dfAge, ["codigo_cliente"], "inner")
    return innerJoinEnd
def main():
    device = torch.device(f'cuda:{args.gpu}')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = ExtSummarizer(CONFIG, torch.load(PRESUMM_MODEL_PATH))
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False
    if args.fp16:
        model = amp.initialize(model)

    eval_lm_model = SentenceScorer(device)
    lm_model = BertForLM.from_pretrained(args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False

    data = load_ext_sum_data()
    for ex in data:
        ex = preprocess(ex)
        src, segs, clss, src_sent_labels, src_txt, tgt_txt = ex
        if int(len(src_sent_labels) * args.insert_pos) == 0:
            # too short to insert collision into the article
            continue

        truth = [src_txt[j] for j in range(len(src_sent_labels))
                 if src_sent_labels[j] == 1]
        truth = ' '.join(truth)
        if args.nature:
            collision, score, rank = gen_natural_collision(
                ex, model, tokenizer, device, lm_model, eval_lm_model)
        else:
            collision, score, rank = gen_aggressive_collision(
                ex, model, tokenizer, device, lm_model)

        lm_perp = eval_lm_model.perplexity(collision)
        msg = f'Ground truth summary={truth}\n' \
              f'Collision={collision}\n' \
              f'Score={score}\n' \
              f'Rank={rank}\n' \
              f'LM perp={lm_perp.item()}\n'
        log(msg)
def filterCustomerOrders(self, df):
    """Filter customers with more than two purchases on Black Friday days.

    Args:
        df: DataFrame.

    Returns:
        DataFrame.
    """
    log("INFO",
        "[filterCustomerOrders] Filtering customers with more than two "
        "purchases on Black Friday days.")
    dfStaging = (df.groupBy(col('codigo_cliente'))
                 .agg(count('data_pedido').alias('numero_pedidos'))
                 .orderBy("numero_pedidos", ascending=False))
    dfEnd = dfStaging.filter(dfStaging.numero_pedidos > 2)
    return dfEnd
def filterShoppingBlackFriday(self, df):
    """Filter the purchases made during the last three Black Fridays.

    Args:
        df: DataFrame.

    Returns:
        DataFrame.
    """
    log("INFO",
        "[filterShoppingBlackFriday] Filtering the Black Friday purchases.")
    bf_dates = [
        "2017-11-24", "2017-11-25", "2017-11-26",
        "2018-11-23", "2018-11-24", "2018-11-25",
        "2019-11-29", "2019-11-30", "2019-12-01"
    ]
    return df.filter(col("data_pedido_date").isin(bf_dates))
def createAgeDateColumns(self, df):
    """Create the customer age and order date columns.

    Args:
        df: DataFrame.

    Returns:
        DataFrame.
    """
    log("INFO",
        "[createAgeDateColumns] Creating the age and order date columns.")
    return (df
            .withColumn("data_pedido_date",
                        date_format(from_unixtime(col('data_pedido')),
                                    "yyyy-MM-dd"))
            .withColumn('idade',
                        (months_between(current_date(),
                                        col('data_nascimento_cliente')) /
                         12).cast(IntegerType())))
def createDataFrameLength(self, df, flag):
    """Create a DataFrame with the word lengths and apply a filter.

    Args:
        df: DataFrame.
        flag: 'smaller' keeps words of up to 10 characters,
            'bigger' keeps words longer than 10 characters.

    Returns:
        DataFrame.
    """
    log("INFO",
        "[createDataFrameLength] Creating a column with the length of "
        "each word.")
    dfStaging = df.withColumn("length", length("word"))
    if flag == "smaller":
        dfEnd = dfStaging.filter(length(col('word')) <= 10)
    elif flag == "bigger":
        dfEnd = dfStaging.filter(length(col('word')) > 10)
    else:
        dfEnd = None
    return dfEnd
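# End-to-end word-count sketch using the methods above (hedged: assumes this
# class is instantiated as `wc` with an active SparkSession on `wc.spark`):
#
#   dfText = wc.readCSV("input.csv")            # read + removePunctuation
#   dfWords = wc.createDataFrameWords(dfText)   # one word per row
#   dfShort = wc.createDataFrameLength(dfWords, "smaller")
#   dfCounts = wc.wordCount(dfShort.select("word"))
#   wc.writeCSV(dfCounts, "output_csv_dir")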
def createOrderList(self, df):
    """Create a column with the list of orders.

    The order list column is an array of arrays holding the order code
    and the order date.

    Args:
        df: DataFrame.

    Returns:
        DataFrame.
    """
    log("INFO", "[createOrderList] Creating the order list.")
    dfStaging = df.withColumn(
        "codigo_pedido_data",
        array(concat(col('codigo_pedido'), lit(', '),
                     col('data_pedido_date'))))
    dfStaging = dfStaging.drop('data_nascimento_cliente', 'data_pedido',
                               'codigo_pedido', 'idade')
    dfEnd = dfStaging.groupBy('codigo_cliente').agg(
        collect_list(col('codigo_pedido_data')).alias('lista_pedidos'))
    return dfEnd
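# How the Black Friday methods might compose end to end (a hedged sketch; the
# exact wiring of the intermediate frames is inferred from the method
# signatures, and the class instance is assumed to be named `bf`):
#
#   df = bf.createAgeDateColumns(bf.readCSV("pedidos.csv"))
#   dfBF = bf.filterShoppingBlackFriday(df)
#   dfOrders = bf.filterCustomerOrders(dfBF)     # customers with > 2 orders
#   dfCount = bf.countOrders(dfBF)               # order counts per customer
#   dfOrdersList = bf.createOrderList(dfBF)      # order lists per customer
#   dfAge = bf.filterAgeUnderThirty(dfBF)        # customers under 30
#   dfEnd = bf.joinDataframes(dfOrders, dfCount, dfOrdersList, dfAge)
#   bf.writeCSV(dfEnd, "resultado_csv")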
def main():
    device = torch.device(f'cuda:{args.gpu}')
    model_dir = os.path.join(args.model_dir, args.task_name.lower())
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    log(f'Loading model from {model_dir}')
    model = BertForConcatSequenceClassification.from_pretrained(model_dir)
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    eval_lm_model = SentenceScorer(device)
    lm_model = BertForLM.from_pretrained(args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False

    if args.fp16:
        from apex import amp
        model, lm_model = amp.initialize([model, lm_model])

    log(f'Loading data from {args.task_name.upper()}')
    data = glue_processors[args.task_name.lower()]().get_dev_examples(
        model_dir)
    n = 0
    for inputs in data:
        if inputs.label == '1':
            n += 1
            if args.nature:
                collision, score = gen_natural_collision(
                    inputs.text_a, inputs.text_b, model, tokenizer, device,
                    lm_model=lm_model, eval_lm_model=eval_lm_model)
            else:
                collision, score = gen_aggressive_collision(
                    inputs.text_a, model, tokenizer, device,
                    lm_model=lm_model)
            lm_perp = eval_lm_model.perplexity(collision)
            msg = f'Input={inputs.text_a}\n' \
                  f'Ground truth paraphrase={inputs.text_b}\n' \
                  f'Collision={collision}\n' \
                  f'Confidence of being paraphrase={score}\n' \
                  f'LM perp={lm_perp.item()}\n'
            log(msg)
def gen_aggressive_collision(inputs_a, inputs_b, model, tokenizer, device,
                             margin=None, lm_model=None):
    word_embedding = model.get_input_embeddings().weight.detach()
    if lm_model is not None:
        lm_word_embedding = lm_model.get_input_embeddings().weight.detach()

    vocab_size = word_embedding.size(0)
    input_mask = torch.zeros(vocab_size, device=device)
    filters = find_filters(inputs_a, model, tokenizer, device,
                           k=args.num_filters)
    best_ids = get_inputs_filter_ids(inputs_b, tokenizer)
    input_mask[best_ids] = -1e9
    remove_tokens = add_single_plural(inputs_a, tokenizer)
    if args.verbose:
        log(','.join(remove_tokens))
    remove_ids = tokenizer.convert_tokens_to_ids(remove_tokens)
    remove_ids.append(tokenizer.vocab['.'])
    input_mask[remove_ids] = -1e9
    # prevent outputting the num_filters neighbor words
    num_filters_ids = tokenizer.convert_tokens_to_ids(filters)
    input_mask[num_filters_ids] = -1e9
    sub_mask = get_sub_masks(tokenizer, device)

    input_ids = tokenizer.encode(inputs_a)
    input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
    seq_len = args.seq_len
    batch_input_ids = torch.cat([input_ids] * args.topk, 0)
    stopwords_mask = create_constraints(seq_len, tokenizer, device)

    def relaxed_to_word_embs(x):
        # convert relaxed inputs to word embeddings by softmax attention
        masked_x = x + input_mask + sub_mask
        if args.regularize:
            masked_x += stopwords_mask
        p = torch.softmax(masked_x / args.stemp, -1)
        x = torch.mm(p, word_embedding)
        # add the embedding for SEP
        x = torch.cat([x, word_embedding[tokenizer.sep_token_id].unsqueeze(0)])
        return p, x.unsqueeze(0)

    def get_lm_loss(p):
        x = torch.mm(p.detach(), lm_word_embedding).unsqueeze(0)
        return lm_model(inputs_embeds=x, one_hot_labels=p.unsqueeze(0))[0]

    # some constants
    sep_tensor = torch.tensor([tokenizer.sep_token_id] * args.topk,
                              device=device)
    batch_sep_embeds = word_embedding[sep_tensor].unsqueeze(1)
    labels = torch.ones((1,), dtype=torch.long, device=device)
    repetition_penalty = 1.0

    best_collision = None
    best_score = -1e9
    prev_score = -1e9
    collision_cands = []

    var_size = (seq_len, vocab_size)
    z_i = torch.zeros(*var_size, requires_grad=True, device=device)
    for it in range(args.max_iter):
        optimizer = torch.optim.Adam([z_i], lr=args.lr)
        for j in range(args.perturb_iter):
            optimizer.zero_grad()
            # relaxation
            p_inputs, inputs_embeds = relaxed_to_word_embs(z_i)
            # forward to BERT with relaxed inputs
            loss, cls_logits, _ = model(input_ids,
                                        inputs_embeds=inputs_embeds,
                                        next_sentence_label=labels)
            if margin is not None:
                loss += torch.sum(torch.relu(margin - cls_logits[:, 1]))
            if args.beta > 0.:
                lm_loss = get_lm_loss(p_inputs)
                loss = args.beta * lm_loss + (1 - args.beta) * loss
            loss.backward()
            optimizer.step()
            if args.verbose and (j + 1) % 10 == 0:
                log(f'It{it}-{j + 1}, loss={loss.item()}')

        # detach to free GPU memory
        z_i = z_i.detach()

        _, topk_tokens = torch.topk(z_i, args.topk)
        probs_i = torch.softmax(z_i / args.stemp, -1).unsqueeze(0).expand(
            args.topk, seq_len, vocab_size)

        output_so_far = None
        # beam search left to right
        for t in range(seq_len):
            t_topk_tokens = topk_tokens[t]
            t_topk_onehot = torch.nn.functional.one_hot(
                t_topk_tokens, vocab_size).float()
            next_clf_scores = []
            for j in range(args.num_beams):
                next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                               device=device) - 1e9
                if output_so_far is None:
                    context = probs_i.clone()
                else:
                    output_len = output_so_far.shape[1]
                    beam_topk_output = output_so_far[j].unsqueeze(0).expand(
                        args.topk, output_len)
                    beam_topk_output = torch.nn.functional.one_hot(
                        beam_topk_output, vocab_size)
                    context = torch.cat([
                        beam_topk_output.float(),
                        probs_i[:, output_len:].clone()
                    ], 1)
                context[:, t] = t_topk_onehot
                context_embeds = torch.einsum('blv,vh->blh', context,
                                              word_embedding)
                context_embeds = torch.cat(
                    [context_embeds, batch_sep_embeds], 1)
                clf_logits = model(input_ids=batch_input_ids,
                                   inputs_embeds=context_embeds)[0]
                clf_scores = clf_logits[:, 1].detach().float()
                next_beam_scores.scatter_(0, t_topk_tokens, clf_scores)
                next_clf_scores.append(next_beam_scores.unsqueeze(0))

            next_clf_scores = torch.cat(next_clf_scores, 0)
            next_scores = next_clf_scores + input_mask + sub_mask
            if args.regularize:
                next_scores += stopwords_mask[t]

            if output_so_far is None:
                next_scores[1:] = -1e9
            if output_so_far is not None and repetition_penalty > 1.0:
                lm_model.enforce_repetition_penalty_(
                    next_scores, 1, args.num_beams, output_so_far,
                    repetition_penalty)

            # re-organize to group the beams together
            # (we are keeping the top hypotheses across beams)
            next_scores = next_scores.view(
                1, args.num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
            next_scores, next_tokens = torch.topk(next_scores, args.num_beams,
                                                  dim=1, largest=True,
                                                  sorted=True)

            # next batch beam content
            next_sent_beam = []
            for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
                    zip(next_tokens[0], next_scores[0])):
                # get beam and token IDs
                beam_id = beam_token_id // vocab_size
                token_id = beam_token_id % vocab_size
                next_sent_beam.append((beam_token_score, token_id, beam_id))
            next_batch_beam = next_sent_beam

            # sanity check / prepare next batch
            assert len(next_batch_beam) == args.num_beams
            beam_tokens = torch.tensor([x[1] for x in next_batch_beam],
                                       device=device)
            beam_idx = torch.tensor([x[2] for x in next_batch_beam],
                                    device=device)

            # re-order batch
            if output_so_far is None:
                output_so_far = beam_tokens.unsqueeze(1)
            else:
                output_so_far = output_so_far[beam_idx, :]
                output_so_far = torch.cat(
                    [output_so_far, beam_tokens.unsqueeze(1)], dim=-1)

        pad_output_so_far = torch.cat(
            [output_so_far, sep_tensor[:args.num_beams].unsqueeze(1)], 1)
        concat_input_ids = torch.cat(
            [batch_input_ids[:args.num_beams], pad_output_so_far], 1)
        token_type_ids = torch.cat([
            torch.zeros_like(batch_input_ids[:args.num_beams]),
            torch.ones_like(pad_output_so_far)
        ], 1)
        clf_logits = model(input_ids=concat_input_ids,
                           token_type_ids=token_type_ids)[0]
        actual_clf_scores = clf_logits[:, 1]
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i].cpu().tolist())}'
                for i in sorter
            ]
            log(f'It={it}, margin={margin:.4f}, query={inputs_a} | ' +
                ' | '.join(decoded))

        valid_idx = sorter[0]
        valid = False
        for idx in sorter:
            valid, _ = valid_tokenization(output_so_far[idx], tokenizer)
            if valid:
                valid_idx = idx
                break

        # re-initialize z_i from the current best with label smoothing
        curr_best = output_so_far[valid_idx]
        next_z_i = torch.nn.functional.one_hot(curr_best, vocab_size).float()
        eps = 0.1
        next_z_i = (next_z_i * (1 - eps)) + \
            (1 - next_z_i) * eps / (vocab_size - 1)
        z_i = torch.nn.Parameter(torch.log(next_z_i), True)

        curr_score = actual_clf_scores[valid_idx].item()
        if valid and curr_score > best_score:
            best_score = curr_score
            best_collision = tokenizer.decode(curr_best.cpu().tolist())
        if curr_score <= prev_score:
            break
        prev_score = curr_score

    return best_collision, best_score, collision_cands
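# The continuous relaxation used in relaxed_to_word_embs, in isolation (a
# minimal sketch with a toy vocabulary; `stemp` plays the role of args.stemp):
import torch

vocab_size, seq_len, hidden = 10, 4, 8
word_embedding = torch.randn(vocab_size, hidden)
z = torch.zeros(seq_len, vocab_size, requires_grad=True)
stemp = 1.0

p = torch.softmax(z / stemp, -1)     # soft one-hot over the vocabulary
soft_embeds = p @ word_embedding     # differentiable "input embeddings"
loss = soft_embeds.sum()             # stand-in for the model loss
loss.backward()                      # gradients flow back into z
print(z.grad.shape)                  # torch.Size([4, 10])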
def read_file_and_store_on_postgresql(self, spark_session,
                                      postgresql_access_dict):
    """
    Read a traditional JSON ([{"a":1, "b":2}, {...}]) file and store it
    into a PostgreSQL database.

    Parameters
    ----------
    spark_session: SparkSession
        The SparkSession of the application
    postgresql_access_dict: dict
        The dictionary with the data necessary to access PostgreSQL.
        The JSON must contain {"database":..., "username":..., "password":...}
    """
    input_filename = self._args[2]
    file = self._input_data_dir + input_filename
    tbl_name = self._args[3]
    schema_json = self._args[4]
    date_ops = json.loads(self._args[5]) if len(self._args) == 6 else None

    # Parse the schema JSON into a Spark schema
    schema = get_schema(schema_json)
    log(spark_session).info("Schema: " + str(schema))

    try:
        rdd = spark_session \
            .sparkContext \
            .textFile(file) \
            .map(lambda x: json.loads(x)) \
            .flatMap(lambda x: x)
        df = spark_session.createDataFrame(rdd, schema)
        if date_ops:
            df = convert_date_using_data_ops_schema(df, date_ops)
        log(spark_session).info("Number of rows: " + str(df.count()))
        try:
            df \
                .write \
                .format("jdbc") \
                .option("url", "jdbc:postgresql:" +
                        postgresql_access_dict["database"]) \
                .option("dbtable", "public." + tbl_name) \
                .option("user", postgresql_access_dict["username"]) \
                .option("password", postgresql_access_dict["password"]) \
                .mode("overwrite") \
                .save()
        except Exception as e:
            log(spark_session).error("Error on writing to the database... ")
            log(spark_session).error(e)
    except Exception as e:
        log(spark_session).error(e)
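# Shape of the access dictionary expected above (a hedged sketch; all values
# are placeholders):
#
#   postgresql_access_dict = {
#       "database": "//localhost:5432/mydb",
#       "username": "spark_user",
#       "password": "********",
#   }
#
# Note the "database" value is concatenated into "jdbc:postgresql:" + ...,
# so it must carry whatever host/port/db suffix the JDBC URL needs.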
def gen_natural_collision(inputs_a, inputs_b, model, tokenizer, device,
                          lm_model, margin=None, eval_lm_model=None):
    input_mask = torch.zeros(tokenizer.vocab_size, device=device)
    filters = find_filters(inputs_a, model, tokenizer, device,
                           k=args.num_filters)
    best_ids = get_inputs_filter_ids(inputs_b, tokenizer)
    input_mask[best_ids] = -1e9
    num_filters_ids = tokenizer.convert_tokens_to_ids(filters)
    input_mask[num_filters_ids] = -1e9
    remove_tokens = add_single_plural(inputs_a, tokenizer)
    if args.verbose:
        log(','.join(remove_tokens))
    remove_ids = tokenizer.convert_tokens_to_ids(remove_tokens)
    input_mask[remove_ids] = -1e9
    input_mask[tokenizer.convert_tokens_to_ids(['.', '@', '='])] = -1e9
    unk_ids = tokenizer.encode('<unk>', add_special_tokens=False)
    input_mask[unk_ids] = -1e9
    filter_ids = [tokenizer.vocab[w] for w in tokenizer.vocab
                  if not w.isalnum()]
    first_mask = torch.zeros_like(input_mask)
    first_mask[filter_ids] = -1e9

    collision_init = tokenizer.convert_tokens_to_ids([BOS_TOKEN])
    start_idx = 1
    num_beams = args.num_beams
    repetition_penalty = 5.0
    curr_len = len(collision_init)

    # scores for each sentence in the beam
    beam_scores = torch.zeros((num_beams,), dtype=torch.float, device=device)
    beam_scores[1:] = -1e9

    output_so_far = torch.tensor([collision_init] * num_beams, device=device)
    past = None
    vocab_size = tokenizer.vocab_size
    topk = args.topk
    input_ids = tokenizer.encode(inputs_a)
    input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
    batch_input_ids = torch.cat([input_ids] * topk, 0)
    sep_tensor = torch.tensor([tokenizer.sep_token_id] * topk, device=device)
    is_first = True

    word_embedding = model.get_input_embeddings().weight.detach()
    batch_sep_embeds = word_embedding[sep_tensor].unsqueeze(1)
    batch_labels = torch.ones((num_beams,), dtype=torch.long, device=device)

    def classifier_loss(p, context):
        context = torch.nn.functional.one_hot(context, len(word_embedding))
        one_hot = torch.cat([context.float(), p.unsqueeze(1)], 1)
        x = torch.einsum('blv,vh->blh', one_hot, word_embedding)
        # add embeddings for SEP
        x = torch.cat([x, batch_sep_embeds[:num_beams]], 1)
        cls_loss = model(batch_input_ids[:num_beams],
                         inputs_embeds=x,
                         next_sentence_label=batch_labels)[0]
        return cls_loss

    best_score = -1e9
    best_collision = None
    collision_cands = []

    while (curr_len - start_idx) < args.seq_len:
        model_inputs = lm_model.prepare_inputs_for_generation(output_so_far,
                                                              past=past)
        outputs = lm_model(**model_inputs)
        present = outputs[1]
        # (batch_size * num_beams, vocab_size)
        next_token_logits = outputs[0][:, -1, :]
        lm_scores = torch.log_softmax(next_token_logits, dim=-1)

        if args.perturb_iter > 0:
            # perturb the LM logits towards the target model
            def target_model_wrapper(p):
                return classifier_loss(p,
                                       output_so_far.detach()[:, start_idx:])

            next_token_logits = perturb_logits(
                next_token_logits,
                args.lr,
                target_model_wrapper,
                num_iterations=args.perturb_iter,
                kl_scale=args.kl_scale,
                temperature=args.stemp,
                device=device,
                verbose=args.verbose,
                logit_mask=input_mask,
            )

        if repetition_penalty > 1.0:
            lm_model.enforce_repetition_penalty_(next_token_logits, 1,
                                                 num_beams, output_so_far,
                                                 repetition_penalty)
        next_token_logits = next_token_logits / args.stemp

        # (batch_size * num_beams, vocab_size)
        next_lm_scores = lm_scores + beam_scores[:, None].expand_as(lm_scores)
        _, topk_tokens = torch.topk(next_token_logits, topk)

        # get target model scores here
        next_clf_scores = []
        for i in range(num_beams):
            next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                           device=device) - 1e9
            if output_so_far.shape[1] > start_idx:
                curr_beam_topk = output_so_far[i, start_idx:].unsqueeze(
                    0).expand(topk, output_so_far.shape[1] - start_idx)
                # (topk, curr_len + next_token + sep)
                curr_beam_topk = torch.cat([
                    curr_beam_topk,
                    topk_tokens[i].unsqueeze(1),
                    sep_tensor.unsqueeze(1)
                ], 1)
            else:
                curr_beam_topk = torch.cat(
                    [topk_tokens[i].unsqueeze(1), sep_tensor.unsqueeze(1)], 1)
            concat_input_ids = torch.cat([batch_input_ids, curr_beam_topk], 1)
            token_type_ids = torch.cat([
                torch.zeros_like(batch_input_ids),
                torch.ones_like(curr_beam_topk),
            ], 1)
            clf_logits = model(input_ids=concat_input_ids,
                               token_type_ids=token_type_ids)[0]
            clf_scores = torch.log_softmax(clf_logits, -1)[:, 1].detach()
            next_beam_scores.scatter_(0, topk_tokens[i], clf_scores.float())
            next_clf_scores.append(next_beam_scores.unsqueeze(0))
        next_clf_scores = torch.cat(next_clf_scores, 0)

        if is_first:
            next_clf_scores += beam_scores[:, None].expand_as(lm_scores)
            next_clf_scores += first_mask
            is_first = False

        next_scores = (1 - args.beta) * next_clf_scores + \
            args.beta * next_lm_scores
        next_scores += input_mask

        # re-organize to group the beams together
        # (we are keeping the top hypotheses across beams)
        next_scores = next_scores.view(num_beams * vocab_size)
        next_lm_scores = next_lm_scores.view(num_beams * vocab_size)
        next_scores, next_tokens = torch.topk(next_scores, num_beams,
                                              largest=True, sorted=True)
        next_lm_scores = next_lm_scores[next_tokens]

        # next batch beam content
        next_sent_beam = []
        for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
                zip(next_tokens, next_lm_scores)):
            # get beam and token IDs
            beam_id = beam_token_id // vocab_size
            token_id = beam_token_id % vocab_size
            next_sent_beam.append((beam_token_score, token_id, beam_id))
        next_batch_beam = next_sent_beam

        # sanity check / prepare next batch
        assert len(next_batch_beam) == num_beams
        beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
        beam_tokens = output_so_far.new([x[1] for x in next_batch_beam])
        beam_idx = output_so_far.new([x[2] for x in next_batch_beam])

        # re-order batch
        output_so_far = output_so_far[beam_idx, :]
        output_so_far = torch.cat([output_so_far, beam_tokens.unsqueeze(1)],
                                  dim=-1)

        # sanity check
        pad_output_so_far = torch.cat([
            output_so_far[:, start_idx:],
            sep_tensor[:num_beams].unsqueeze(1)
        ], 1)
        concat_input_ids = torch.cat(
            [batch_input_ids[:num_beams], pad_output_so_far], 1)
        token_type_ids = torch.cat([
            torch.zeros_like(batch_input_ids[:num_beams]),
            torch.ones_like(pad_output_so_far)
        ], 1)
        clf_logits = model(input_ids=concat_input_ids,
                           token_type_ids=token_type_ids)[0]
        actual_clf_scores = clf_logits[:, 1]
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i, start_idx:].cpu().tolist())}'
                for i in sorter
            ]
            log(f'Margin={margin if margin else 0:.4f}, query={inputs_a} | ' +
                ' | '.join(decoded))

        if curr_len > args.min_len:
            valid_idx = sorter[0]
            valid = False
            for idx in sorter:
                valid, _ = valid_tokenization(output_so_far[idx, start_idx:],
                                              tokenizer)
                if valid:
                    valid_idx = idx
                    break
            curr_score = actual_clf_scores[valid_idx].item()
            curr_collision = tokenizer.decode(
                output_so_far[valid_idx, start_idx:].cpu().tolist())
            collision_cands.append((curr_score, curr_collision))
            if valid and curr_score > best_score:
                best_score = curr_score
                best_collision = curr_collision
            if args.verbose:
                lm_perp = eval_lm_model.perplexity(curr_collision)
                log(f'LM perp={lm_perp.item()}')

        # re-order internal states
        past = lm_model._reorder_cache(present, beam_idx)
        # update current length
        curr_len = curr_len + 1

    return best_collision, best_score, collision_cands
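# How the flat beam indices decode into (beam, token) pairs in the loops
# above: scores over all beams are flattened, topk is taken over the flat
# view, and the quotient/remainder recover the origin. A toy example:
import torch

vocab_size, num_beams = 10, 3
next_scores = torch.randn(num_beams, vocab_size).view(num_beams * vocab_size)
top_scores, top_ids = torch.topk(next_scores, num_beams)
for flat_id in top_ids:
    beam_id = flat_id // vocab_size    # which beam the hypothesis extends
    token_id = flat_id % vocab_size    # which token extends it
    print(int(beam_id), int(token_id))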
def gen_aggressive_collision(ex, model, tokenizer, device, lm_model=None):
    src, segs, clss, src_sent_labels, src_txt, tgt_txt = ex
    word_embedding = model.bert.model.get_input_embeddings().weight.detach()
    if lm_model is not None:
        lm_word_embedding = lm_model.get_input_embeddings().weight.detach()

    vocab_size = word_embedding.size(0)
    src_ids = torch.tensor(src, device=device)
    src_embeds = word_embedding[src_ids]
    sub_mask = get_sub_masks(tokenizer, device)
    input_mask = torch.zeros(vocab_size, device=device)
    src_tokens = [w for w in tokenizer.convert_ids_to_tokens(src)
                  if w.isalpha() and w not in STOPWORDS]
    input_mask[tokenizer.convert_tokens_to_ids(src_tokens)] = -1e9
    seq_len = args.seq_len
    stopwords_mask = create_constraints(seq_len, tokenizer, device)

    def relaxed_to_word_embs(x):
        # convert relaxed inputs to word embeddings by softmax attention
        masked_x = x + input_mask + sub_mask
        if args.regularize:
            masked_x += stopwords_mask
        p = torch.softmax(masked_x / args.stemp, -1)
        x = torch.mm(p, word_embedding)
        # add embeddings for CLS and SEP
        x = torch.cat([
            word_embedding[tokenizer.cls_token_id].unsqueeze(0),
            x,
            word_embedding[tokenizer.sep_token_id].unsqueeze(0)
        ])
        return p, x.unsqueeze(0)

    def get_lm_loss(p):
        x = torch.mm(p.detach(), lm_word_embedding).unsqueeze(0)
        return lm_model(inputs_embeds=x, one_hot_labels=p.unsqueeze(0))[0]

    # some constants
    sep_tensor = torch.tensor([tokenizer.sep_token_id] * args.topk,
                              device=device)
    batch_sep_emb = word_embedding[sep_tensor].unsqueeze(1)
    cls_tensor = torch.tensor([tokenizer.cls_token_id] * args.topk,
                              device=device)
    batch_cls_emb = word_embedding[cls_tensor].unsqueeze(1)

    label = int(len(clss) * args.insert_pos)
    labels = torch.tensor([label], device=device)
    batch_prefix_ids, batch_prefix_emb, batch_src_ids, batch_src_emb, \
        mask_cls, batch_segs, batch_new_clss = get_input_constant(
            label, seq_len, src_ids, src_embeds, segs, clss, device)
    prefix_embeds = batch_prefix_emb[0]
    src_embeds = batch_src_emb[0]
    type_token_ids = batch_segs[0]
    new_clss = batch_new_clss[0]
    loss_fn = torch.nn.CrossEntropyLoss()

    best_collision = None
    best_score = -1e9
    best_rank = -1
    prev_score = -1e9

    var_size = (seq_len, vocab_size)
    z_i = torch.zeros(*var_size, requires_grad=True, device=device)
    for it in range(args.max_iter):
        optimizer = torch.optim.Adam([z_i], lr=args.lr)
        for j in range(args.perturb_iter):
            optimizer.zero_grad()
            # relaxation
            p_inputs, inputs_embeds = relaxed_to_word_embs(z_i)
            # forward to BERT with relaxed inputs
            inputs_embeds = torch.cat([
                prefix_embeds.unsqueeze(0),
                inputs_embeds,
                src_embeds.unsqueeze(0)
            ], 1)
            scores = model(None, type_token_ids, new_clss, None, mask_cls,
                           inputs_embeds, output_logits=True)
            loss = loss_fn(scores, labels)
            scores = scores.squeeze()
            loss += torch.max(scores) - scores[label]
            if args.beta > 0.:
                lm_loss = get_lm_loss(p_inputs)
                loss = args.beta * lm_loss + (1 - args.beta) * loss
            loss.backward()
            optimizer.step()
            if args.verbose and (j + 1) % 10 == 0:
                log(f'It{it}-{j + 1}, loss={loss.item()}')

        # detach to free GPU memory
        z_i = z_i.detach()

        _, topk_tokens = torch.topk(z_i, args.topk)
        probs_i = torch.softmax(z_i / args.stemp, -1).unsqueeze(0).expand(
            args.topk, seq_len, vocab_size)

        output_so_far = None
        # beam search left to right
        for t in range(seq_len):
            t_topk_tokens = topk_tokens[t]
            t_topk_onehot = torch.nn.functional.one_hot(
                t_topk_tokens, vocab_size).float()
            next_clf_scores = []
            for j in range(args.num_beams):
                next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                               device=device) - 1e9
                if output_so_far is None:
                    context = probs_i.clone()
                else:
                    output_len = output_so_far.shape[1]
                    beam_topk_output = output_so_far[j].unsqueeze(0).expand(
                        args.topk, output_len)
                    beam_topk_output = torch.nn.functional.one_hot(
                        beam_topk_output, vocab_size)
                    context = torch.cat([
                        beam_topk_output.float(),
                        probs_i[:, output_len:].clone()
                    ], 1)
                context[:, t] = t_topk_onehot
                context_emb = torch.einsum('blv,vh->blh', context,
                                           word_embedding)
                context_emb = torch.cat(
                    [batch_cls_emb, context_emb, batch_sep_emb], 1)
                inputs_emb = torch.cat(
                    [batch_prefix_emb, context_emb, batch_src_emb], 1)
                scores = model(None, batch_segs, batch_new_clss, None,
                               mask_cls, inputs_emb, output_logits=True)
                clf_scores = scores[:, label].detach().float()
                next_beam_scores.scatter_(0, t_topk_tokens, clf_scores)
                next_clf_scores.append(next_beam_scores.unsqueeze(0))

            next_clf_scores = torch.cat(next_clf_scores, 0)
            next_scores = next_clf_scores + input_mask + sub_mask
            if args.regularize:
                next_scores += stopwords_mask[t]
            if output_so_far is None:
                next_scores[1:] = -1e9

            # re-organize to group the beams together
            # (we are keeping the top hypotheses across beams)
            next_scores = next_scores.view(
                1, args.num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
            next_scores, next_tokens = torch.topk(next_scores, args.num_beams,
                                                  dim=1, largest=True,
                                                  sorted=True)

            # next batch beam content
            next_sent_beam = []
            for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
                    zip(next_tokens[0], next_scores[0])):
                # get beam and token IDs
                beam_id = beam_token_id // vocab_size
                token_id = beam_token_id % vocab_size
                next_sent_beam.append((beam_token_score, token_id, beam_id))
            next_batch_beam = next_sent_beam

            # sanity check / prepare next batch
            assert len(next_batch_beam) == args.num_beams
            beam_tokens = torch.tensor([x[1] for x in next_batch_beam],
                                       device=device)
            beam_idx = torch.tensor([x[2] for x in next_batch_beam],
                                    device=device)

            # re-order batch
            if output_so_far is None:
                output_so_far = beam_tokens.unsqueeze(1)
            else:
                output_so_far = output_so_far[beam_idx, :]
                output_so_far = torch.cat(
                    [output_so_far, beam_tokens.unsqueeze(1)], dim=-1)

        pad_output_so_far = torch.cat([
            cls_tensor[:args.num_beams].unsqueeze(1),
            output_so_far,
            sep_tensor[:args.num_beams].unsqueeze(1)
        ], 1)
        concat_input_ids = torch.cat([
            batch_prefix_ids[:args.num_beams],
            pad_output_so_far,
            batch_src_ids[:args.num_beams]
        ], 1)
        actual_scores = model.forward(concat_input_ids,
                                      batch_segs[:args.num_beams],
                                      batch_new_clss[:args.num_beams],
                                      None, mask_cls, None).squeeze()
        actual_clf_scores = actual_scores[:, label].detach()
        top_scores, top_labels = torch.topk(actual_scores,
                                            actual_scores.shape[-1])
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i].cpu().tolist())}'
                for i in sorter
            ]
            log(f'It={it}, margin={top_scores[:, 2].max().item()} | ' +
                ' | '.join(decoded))

        valid_idx = sorter[0]
        valid = False
        for idx in sorter:
            valid, _ = valid_tokenization(output_so_far[idx], tokenizer)
            if valid:
                valid_idx = idx
                break

        # re-initialize z_i from the current best with label smoothing
        curr_best = output_so_far[valid_idx]
        next_z_i = torch.nn.functional.one_hot(curr_best, vocab_size).float()
        eps = 0.1
        next_z_i = (next_z_i * (1 - eps)) + \
            (1 - next_z_i) * eps / (vocab_size - 1)
        z_i = torch.nn.Parameter(torch.log(next_z_i), True)

        curr_score = actual_clf_scores[valid_idx].item()
        curr_collision = tokenizer.decode(curr_best.cpu().tolist())
        curr_rank = (top_labels[valid_idx] == label).nonzero().squeeze().item()
        if valid and curr_score > best_score:
            best_score = curr_score
            best_collision = curr_collision
            best_rank = curr_rank
        if prev_score == curr_score:
            break
        prev_score = curr_score

    return best_collision, best_score, best_rank
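# The label-smoothed re-initialization of z_i used above, in isolation (a
# standalone toy example): the one-hot of the current best tokens is smoothed
# so each row stays a valid distribution whose log is a finite logit matrix.
import torch

vocab_size = 10
curr_best = torch.tensor([3, 7])                    # current best token ids
one_hot = torch.nn.functional.one_hot(curr_best, vocab_size).float()
eps = 0.1
smoothed = one_hot * (1 - eps) + (1 - one_hot) * eps / (vocab_size - 1)
z = torch.nn.Parameter(torch.log(smoothed), True)   # logits favoring curr_best
print(torch.softmax(z, -1).argmax(-1))              # tensor([3, 7])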
def gen_natural_collision(ex, model, tokenizer, device, lm_model,
                          eval_lm_model=None):
    src, segs, clss, src_sent_labels, src_txt, tgt_txt = ex
    word_embedding = model.bert.model.get_input_embeddings().weight.detach()

    collision_init = tokenizer.convert_tokens_to_ids([BOS_TOKEN])
    start_idx = 1
    num_beams = args.num_beams
    repetition_penalty = 5.0
    curr_len = len(collision_init)

    # scores for each sentence in the beam
    beam_scores = torch.zeros((num_beams,), dtype=torch.float, device=device)
    beam_scores[1:] = -1e9

    output_so_far = torch.tensor([collision_init] * num_beams, device=device)
    past = None
    vocab_size = tokenizer.vocab_size
    topk = args.topk
    src_ids = torch.tensor(src, device=device)
    src_embeds = word_embedding[src_ids]
    sub_mask = get_sub_masks(tokenizer, device)
    filter_ids = [tokenizer.vocab[w] for w in tokenizer.vocab
                  if not w.isalnum()]
    first_mask = torch.zeros_like(sub_mask)
    first_mask[filter_ids] = -1e9
    input_mask = torch.zeros(vocab_size, device=device)
    src_tokens = [w for w in tokenizer.convert_ids_to_tokens(src)
                  if w.isalpha() and w not in STOPWORDS]
    input_mask[tokenizer.convert_tokens_to_ids(src_tokens)] = -1e9
    input_mask[tokenizer.convert_tokens_to_ids(['.', '@', '='])] = -1e9
    unk_ids = tokenizer.encode('<unk>', add_special_tokens=False)
    input_mask[unk_ids] = -1e9

    sep_tensor = torch.tensor([tokenizer.sep_token_id] * topk, device=device)
    cls_tensor = torch.tensor([tokenizer.cls_token_id] * topk, device=device)
    is_first = True
    batch_sep_emb = word_embedding[sep_tensor].unsqueeze(1)
    batch_cls_emb = word_embedding[cls_tensor].unsqueeze(1)

    label = int(len(clss) * args.insert_pos)
    labels = torch.tensor([label] * num_beams, device=device)
    loss_fn = torch.nn.CrossEntropyLoss()

    def classifier_loss(p, context, pre_emb, src_emb, type_token_ids,
                        new_clss, mask):
        context = torch.nn.functional.one_hot(context, len(word_embedding))
        one_hot = torch.cat([context.float(), p.unsqueeze(1)], 1)
        x = torch.einsum('blv,vh->blh', one_hot, word_embedding)
        # add embeddings for CLS and SEP
        x = torch.cat(
            [batch_cls_emb[:num_beams], x, batch_sep_emb[:num_beams]], 1)
        inputs_embeds = torch.cat([pre_emb, x, src_emb], 1)
        scores = model(None, type_token_ids, new_clss, None, mask,
                       inputs_embeds, output_logits=True)
        loss = loss_fn(scores, labels)
        loss += torch.mean(torch.max(scores, 1)[0] - scores[:, label])
        return loss

    best_collision = None
    best_score = -1e9
    best_rank = -1

    while curr_len < args.seq_len:
        seq_len = curr_len - start_idx + 1
        batch_prefix_ids, batch_prefix_emb, batch_src_ids, batch_src_emb, \
            mask_cls, batch_segs, batch_new_clss = get_input_constant(
                label, seq_len, src_ids, src_embeds, segs, clss, device)
        model_inputs = lm_model.prepare_inputs_for_generation(output_so_far,
                                                              past=past)
        outputs = lm_model(**model_inputs)
        present = outputs[1]
        # (batch_size * num_beams, vocab_size)
        next_token_logits = outputs[0][:, -1, :]
        lm_scores = torch.log_softmax(next_token_logits, dim=-1)
        next_lm_scores = lm_scores + beam_scores[:, None].expand_as(lm_scores)

        if args.perturb_iter > 0:
            # perturb the LM logits towards the target model
            def target_model_wrapper(p):
                return classifier_loss(p,
                                       output_so_far.detach()[:, start_idx:],
                                       batch_prefix_emb[:num_beams],
                                       batch_src_emb[:num_beams],
                                       batch_segs[:num_beams],
                                       batch_new_clss[:num_beams],
                                       mask_cls)

            next_token_logits = perturb_logits(
                next_token_logits,
                args.lr,
                target_model_wrapper,
                num_iterations=args.perturb_iter,
                kl_scale=args.kl_scale,
                temperature=args.stemp,
                device=device,
                verbose=args.verbose,
                logit_mask=input_mask,
            )

        if repetition_penalty > 1.0:
            lm_model.enforce_repetition_penalty_(next_token_logits, 1,
                                                 num_beams, output_so_far,
                                                 repetition_penalty)
        next_token_logits = next_token_logits / args.stemp

        # (batch_size * num_beams, vocab_size)
        _, topk_tokens = torch.topk(next_token_logits, topk)

        # get target model scores here
        next_clf_scores = []
        for i in range(num_beams):
            next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                           device=device) - 1e9
            if output_so_far.shape[1] > start_idx:
                curr_beam_topk = output_so_far[i, start_idx:].unsqueeze(
                    0).expand(topk, output_so_far.shape[1] - start_idx)
                # (topk, curr_len + next_token + sep)
                curr_beam_topk = torch.cat([
                    cls_tensor.unsqueeze(1),
                    curr_beam_topk,
                    topk_tokens[i].unsqueeze(1),
                    sep_tensor.unsqueeze(1)
                ], 1)
            else:
                curr_beam_topk = torch.cat([
                    cls_tensor.unsqueeze(1),
                    topk_tokens[i].unsqueeze(1),
                    sep_tensor.unsqueeze(1)
                ], 1)
            concat_input_ids = torch.cat(
                [batch_prefix_ids, curr_beam_topk, batch_src_ids], 1)
            scores = model(concat_input_ids, batch_segs, batch_new_clss,
                           None, mask_cls, None)
            clf_scores = torch.log_softmax(scores, -1)[:, label].detach()
            next_beam_scores.scatter_(0, topk_tokens[i], clf_scores)
            next_clf_scores.append(next_beam_scores.unsqueeze(0))
        next_clf_scores = torch.cat(next_clf_scores, 0)

        if is_first:
            next_clf_scores += beam_scores[:, None].expand_as(lm_scores)
            next_clf_scores += first_mask
            is_first = False

        next_scores = (1 - args.beta) * next_clf_scores + \
            args.beta * next_lm_scores
        next_scores += input_mask

        # re-organize to group the beams together
        # (we are keeping the top hypotheses across beams)
        next_scores = next_scores.view(num_beams * vocab_size)
        next_lm_scores = next_lm_scores.view(num_beams * vocab_size)
        next_scores, next_tokens = torch.topk(next_scores, num_beams,
                                              largest=True, sorted=True)
        next_lm_scores = next_lm_scores[next_tokens]

        # next batch beam content
        next_sent_beam = []
        for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
                zip(next_tokens, next_lm_scores)):
            # get beam and token IDs
            beam_id = beam_token_id // vocab_size
            token_id = beam_token_id % vocab_size
            next_sent_beam.append((beam_token_score, token_id, beam_id))
        next_batch_beam = next_sent_beam

        # sanity check / prepare next batch
        assert len(next_batch_beam) == num_beams
        beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
        beam_tokens = output_so_far.new([x[1] for x in next_batch_beam])
        beam_idx = output_so_far.new([x[2] for x in next_batch_beam])

        # re-order batch
        output_so_far = output_so_far[beam_idx, :]
        output_so_far = torch.cat([output_so_far, beam_tokens.unsqueeze(1)],
                                  dim=-1)

        # sanity check
        pad_output_so_far = torch.cat([
            cls_tensor[:num_beams].unsqueeze(1),
            output_so_far[:, start_idx:],
            sep_tensor[:num_beams].unsqueeze(1)
        ], 1)
        concat_input_ids = torch.cat([
            batch_prefix_ids[:num_beams],
            pad_output_so_far,
            batch_src_ids[:num_beams]
        ], 1)
        actual_scores = model.forward(concat_input_ids,
                                      batch_segs[:num_beams],
                                      batch_new_clss[:num_beams],
                                      None, mask_cls, None)
        top_scores, top_labels = torch.topk(actual_scores,
                                            actual_scores.shape[-1])
        actual_clf_scores = actual_scores[:, label].detach()
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i, start_idx:].cpu().tolist())}'
                for i in sorter
            ]
            log(f'Margin={top_scores[:, 2].max().item()} | ' +
                ' | '.join(decoded))

        # re-order internal states
        past = lm_model._reorder_cache(present, beam_idx)
        # update current length
        curr_len = curr_len + 1

        if curr_len > args.min_len:
            valid_idx = sorter[0]
            valid = False
            for idx in sorter:
                valid, _ = valid_tokenization(output_so_far[idx, start_idx:],
                                              tokenizer)
                if valid:
                    valid_idx = idx
                    break
            curr_score = actual_clf_scores[valid_idx].item()
            curr_collision = tokenizer.decode(
                output_so_far[valid_idx, start_idx:].cpu().tolist())
            curr_rank = (
                top_labels[valid_idx] == label).nonzero().squeeze().item()
            if valid and curr_score > best_score:
                best_score = curr_score
                best_collision = curr_collision
                best_rank = curr_rank
            if args.verbose:
                lm_perp = eval_lm_model.perplexity(curr_collision)
                log(f'LM perp={lm_perp.item()}')

    return best_collision, best_score, best_rank