def get_churn_prediction_by_days_embeddings(input_seq_dir, output_embds_file):
    """Write one embedding per (input file, day) for the churn-prediction task."""
    files = [os.path.join(input_seq_dir, e) for e in os.listdir(input_seq_dir)]
    n_batches = (len(files) + batch_size - 1) // batch_size
    with open(output_embds_file, 'w') as fout:
        for i in tqdm.tqdm(range(n_batches), desc=output_embds_file):
            batch_files = files[i * batch_size:(i + 1) * batch_size]
            batch_token_seq, batch_time_seq = [], []
            batch_indices = []
            for _file in batch_files:
                with open(_file, 'r', encoding='utf-8') as fin:
                    # Example input line:
                    # 2019-08-01:400347#0#2019-08-01-00:00:00 400616#0#2019-08-01-00:00:00 400177#0#2019-08-01-00:00:01
                    # i.e. <day>:<events>, where the first two '#'-separated fields of an
                    # event form its token and the last field is its timestamp.
                    for l in fin:
                        day = l[:l.index(':')]
                        events = l[l.index(':') + 1:].strip().split(' ')
                        # Keep only the most recent events, leaving room for SOS/EOS.
                        events = events[-(max_seq_len - 2):]
                        token_seq = ["#".join(e.split('#')[:2]) for e in events]
                        time_seq = [str2timestamp(e.split('#')[-1]) for e in events]
                        # Shift timestamps so the earliest event starts at 1.
                        min_t = min(time_seq)
                        time_seq = [t - min_t + 1 for t in time_seq]
                        token_seq = [SOS_SYMBOL] + token_seq + [EOS_SYMBOL]
                        time_seq = [0.0] + time_seq + [0.0]
                        # Left-pad both sequences to max_seq_len.
                        padding = [PAD_SYMBOL] * (max_seq_len - len(token_seq))
                        token_seq = padding + token_seq
                        time_seq = [0.0] * len(padding) + time_seq
                        token_seq = tokenizer.convert_tokens_to_ids(token_seq)
                        batch_token_seq.append(token_seq)
                        batch_time_seq.append(time_seq)
                        batch_indices.append("%s,%s" % (os.path.basename(_file), day))
            # Each file can contribute several days, so re-split the collected
            # sequences into sub-batches of at most batch_size before inference.
            with torch.no_grad():
                n_sub_batches = (len(batch_token_seq) + batch_size - 1) // batch_size
                for j in range(n_sub_batches):
                    _indices = batch_indices[j * batch_size:(j + 1) * batch_size]
                    _input_token_seq = batch_token_seq[j * batch_size:(j + 1) * batch_size]
                    _input_time_seq = batch_time_seq[j * batch_size:(j + 1) * batch_size]
                    outputs = model(
                        torch.tensor(_input_token_seq).to(device),
                        torch.tensor(_input_time_seq).to(device),
                        pos_mask)
                    outputs = outputs.tolist()
                    for idx, output in zip(_indices, outputs):
                        fout.write("%s,%s\n" % (idx, ','.join(map(str, output))))
def get_map_preload_embeddings(input_seq_dirs, output_embds_file):
    """Write one embedding per (input file, day) for the map-preload task."""
    # Collect (file path, day) pairs; the day label is the last '.'-separated
    # component of each sequence directory's name.
    files = []
    for _seq_dir in input_seq_dirs:
        if not os.path.isdir(_seq_dir):
            continue
        day = os.path.basename(_seq_dir).split('.')[-1]
        for e in os.listdir(_seq_dir):
            files.append((os.path.join(_seq_dir, e), day))
    n_batches = (len(files) + batch_size - 1) // batch_size
    with open(output_embds_file, 'w') as fout:
        for i in tqdm.tqdm(range(n_batches), desc=output_embds_file):
            batch_files = files[i * batch_size:(i + 1) * batch_size]
            batch_token_seq, batch_time_seq = [], []
            batch_indices = []
            for _file, day in batch_files:
                with open(_file, 'r', encoding='utf-8') as fin:
                    events = json.load(fin)
                # Keep only the most recent events, leaving room for SOS/EOS.
                events = events[-(max_seq_len - 2):]
                # The first '#'-separated field is the timestamp; the last two
                # fields form the event token.
                time_seq = [str2timestamp(e.split('#')[0].replace(' ', '-')) for e in events]
                token_seq = ['#'.join(e.split('#')[-2:]) for e in events]
                # Shift timestamps so the earliest event starts at 1.
                min_t = min(time_seq)
                time_seq = [t - min_t + 1 for t in time_seq]
                token_seq = [SOS_SYMBOL] + token_seq + [EOS_SYMBOL]
                time_seq = [0.0] + time_seq + [0.0]
                # Left-pad both sequences to max_seq_len.
                padding = [PAD_SYMBOL] * (max_seq_len - len(token_seq))
                token_seq = padding + token_seq
                time_seq = [0.0] * len(padding) + time_seq
                token_seq = tokenizer.convert_tokens_to_ids(token_seq)
                batch_token_seq.append(token_seq)
                batch_time_seq.append(time_seq)
                batch_indices.append("{},{}".format(os.path.basename(_file), day))
            with torch.no_grad():
                outputs = model(
                    torch.tensor(batch_token_seq).to(device),
                    torch.tensor(batch_time_seq).to(device),
                    pos_mask)
                outputs = outputs.tolist()
                for idx, output in zip(batch_indices, outputs):
                    fout.write("%s,%s\n" % (idx, ','.join(map(str, output))))
def get_bot_detection_embeddings(input_seq_dir, output_embds_file):
    """Write one embedding per input file for the bot-detection task."""
    files = [os.path.join(input_seq_dir, e) for e in os.listdir(input_seq_dir)]
    n_batches = (len(files) + batch_size - 1) // batch_size
    with open(output_embds_file, 'w') as fout:
        for i in tqdm.tqdm(range(n_batches), desc=output_embds_file):
            batch_files = files[i * batch_size:(i + 1) * batch_size]
            batch_token_seq, batch_time_seq = [], []
            batch_indices = []
            for _file in batch_files:
                with open(_file, 'r', encoding='utf-8') as fin:
                    events = json.load(fin)
                # Keep only the most recent events, leaving room for SOS/EOS.
                events = events[-(max_seq_len - 2):]
                time_seq = [str2timestamp(e["timestamp"].replace(' ', '-')) for e in events]
                # Each event token is "<log_id>#<design_id>", with design_id defaulting to 0.
                token_seq = ['%s#%s' % (e['log_id'], e.get('design_id', 0)) for e in events]
                # Shift timestamps so the earliest event starts at 1.
                min_t = min(time_seq)
                time_seq = [t - min_t + 1 for t in time_seq]
                token_seq = [SOS_SYMBOL] + token_seq + [EOS_SYMBOL]
                time_seq = [0.0] + time_seq + [0.0]
                # Left-pad both sequences to max_seq_len.
                padding = [PAD_SYMBOL] * (max_seq_len - len(token_seq))
                token_seq = padding + token_seq
                time_seq = [0.0] * len(padding) + time_seq
                token_seq = tokenizer.convert_tokens_to_ids(token_seq)
                batch_token_seq.append(token_seq)
                batch_time_seq.append(time_seq)
                batch_indices.append(os.path.basename(_file).replace(':', '-'))
            with torch.no_grad():
                outputs = model(
                    torch.tensor(batch_token_seq).to(device),
                    torch.tensor(batch_time_seq).to(device),
                    pos_mask)
                outputs = outputs.tolist()
                for idx, output in zip(batch_indices, outputs):
                    fout.write("%s,%s\n" % (idx, ','.join(map(str, output))))
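# Minimal driver sketch showing how the three extractors might be invoked.
# The paths below are hypothetical placeholders (not part of the original
# pipeline); it assumes `model`, `tokenizer`, `device`, and `pos_mask` have
# already been set up at module level as the functions above expect.
if __name__ == '__main__':
    # Hypothetical layout: one flat directory of per-account sequence files
    # for churn prediction and bot detection, and one directory per day for
    # map preload. Adjust to the actual dataset locations.
    get_churn_prediction_by_days_embeddings(
        'data/churn_prediction_seqs', 'churn_prediction_embeddings.csv')
    map_preload_dirs = [
        os.path.join('data/map_preload_seqs', d)
        for d in os.listdir('data/map_preload_seqs')
    ]
    get_map_preload_embeddings(map_preload_dirs, 'map_preload_embeddings.csv')
    get_bot_detection_embeddings(
        'data/bot_detection_seqs', 'bot_detection_embeddings.csv')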