Example 1
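All three examples rely on module-level names defined elsewhere in the source file. The following is a minimal sketch of the assumed shared setup; the concrete values, the tokenizer/model objects, and the timestamp format are assumptions inferred from how the functions below use them.

# Hypothetical shared setup (not part of the original listing); values are placeholders.
import json
import os
import time

import torch
import tqdm

batch_size = 64                    # assumed inference batch size
max_seq_len = 512                  # assumed maximum sequence length, including SOS/EOS
SOS_SYMBOL, EOS_SYMBOL, PAD_SYMBOL = "[SOS]", "[EOS]", "[PAD]"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def str2timestamp(s):
    # Assumed format "YYYY-MM-DD-HH:MM:SS", matching the timestamps seen in the examples.
    return time.mktime(time.strptime(s, "%Y-%m-%d-%H:%M:%S"))


# Assumed to be created elsewhere:
#   tokenizer - maps event tokens such as "400347#0" to vocabulary ids and exposes
#               convert_tokens_to_ids (as Hugging Face tokenizers do)
#   model     - a torch.nn.Module taking (token_ids, time_values, pos_mask) and
#               returning one embedding vector per input sequence
#   pos_mask  - a precomputed attention/positional mask tensor on `device`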
def get_churn_prediction_by_days_embeddings(input_seq_dir, output_embds_file):
    """Embed per-day churn-prediction event sequences and write them to `output_embds_file`.

    Each input file holds one line per day: "YYYY-MM-DD:" followed by
    space-separated events whose first two '#'-fields form the token and whose
    last '#'-field is the event timestamp.
    """
    files = [os.path.join(input_seq_dir, e) for e in os.listdir(input_seq_dir)]
    n_batches = (len(files) + batch_size - 1) // batch_size

    fout = open(output_embds_file, 'w')

    for i in tqdm.tqdm(range(n_batches), desc=output_embds_file):
        batch_files = files[i * batch_size:(i + 1) * batch_size]
        batch_token_seq, batch_time_seq = [], []
        batch_indices = []
        for _file in batch_files:
            with open(_file, 'r', encoding='utf-8') as fin:
                # Example line: 2019-08-01:400347#0#2019-08-01-00:00:00 400616#0#2019-08-01-00:00:00 400177#0#2019-08-01-00:00:01
                for line in fin:
                    day = line[:line.index(':')]
                    events = line[line.index(':') + 1:].strip().split(' ')
                    # Keep only the most recent (max_seq_len - 2) events, leaving room for SOS/EOS.
                    events = events[-(max_seq_len - 2):]

                    token_seq = ["#".join(e.split('#')[:2]) for e in events]
                    time_seq = [
                        str2timestamp(e.split('#')[-1]) for e in events
                    ]
                    min_t = min(time_seq)
                    # Shift times so the earliest event maps to 1; 0 is reserved for SOS/EOS and padding.
                    time_seq = [t - min_t + 1 for t in time_seq]

                    token_seq = [SOS_SYMBOL] + token_seq + [EOS_SYMBOL]
                    time_seq = [0.0] + time_seq + [0.0]
                    # Left-pad both sequences up to max_seq_len.
                    padding = [
                        PAD_SYMBOL for _ in range(max_seq_len - len(token_seq))
                    ]
                    token_seq = padding + token_seq
                    time_seq = [0] * len(padding) + time_seq

                    token_seq = tokenizer.convert_tokens_to_ids(token_seq)

                    batch_token_seq.append(token_seq)
                    batch_time_seq.append(time_seq)
                    batch_indices.append("%s,%s" %
                                         (os.path.basename(_file), day))

        with torch.no_grad():
            # A single input file can contain many days, so the collected sequences
            # are split again into model-sized sub-batches.
            n_sub_batches = (len(batch_token_seq) + batch_size -
                             1) // batch_size
            for j in range(n_sub_batches):
                _indices = batch_indices[j * batch_size:(j + 1) * batch_size]
                _input_token_seq = batch_token_seq[j * batch_size:(j + 1) *
                                                   batch_size]
                _input_time_seq = batch_time_seq[j * batch_size:(j + 1) *
                                                 batch_size]
                outputs = model(
                    torch.tensor(_input_token_seq).to(device),
                    torch.tensor(_input_time_seq).to(device), pos_mask)
                outputs = outputs.tolist()
                # One output line per (file, day): "<file_name>,<day>,<v1>,<v2>,...".
                for idx, output in zip(_indices, outputs):
                    fout.write("%s,%s\n" % (idx, ','.join(map(str, output))))
    fout.close()
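A hypothetical invocation (paths are illustrative, not from the original):

# Writes one line per (sequence file, day): "<file_name>,<day>,<v1>,<v2>,..."
get_churn_prediction_by_days_embeddings("data/churn_day_seqs",
                                        "churn_day_embeddings.csv")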
Example 2
def get_map_preload_embeddings(input_seq_dirs, output_embds_file):
    """Embed map-preload event sequences stored as JSON files inside per-day directories."""
    files = []
    for _seq_dir in input_seq_dirs:
        if not os.path.isdir(_seq_dir):
            continue
        # The day label is the last dot-separated piece of the directory name.
        day = os.path.basename(_seq_dir).split('.')[-1]
        for e in os.listdir(_seq_dir):
            files.append((os.path.join(_seq_dir, e), day))

    n_batches = (len(files) + batch_size - 1) // batch_size

    fout = open(output_embds_file, 'w')
    for i in tqdm.tqdm(range(n_batches), desc=output_embds_file):
        batch_files = files[i * batch_size:(i + 1) * batch_size]
        batch_token_seq, batch_time_seq = [], []
        batch_indices = []
        for _file in batch_files:
            with open(_file[0], 'r', encoding='utf-8') as fin:
                events = json.load(fin)
                # Keep only the most recent (max_seq_len - 2) events, leaving room for SOS/EOS.
                events = events[-(max_seq_len - 2):]

                # Each event string is '#'-separated: the first field is a timestamp
                # (its space is normalized to '-'); the last two fields form the token.
                time_seq = [
                    str2timestamp(e.split('#')[0].replace(' ', '-'))
                    for e in events
                ]
                token_seq = ['#'.join(e.split('#')[-2:]) for e in events]
                min_t = min(time_seq)
                # Shift times so the earliest event maps to 1; 0 is reserved for SOS/EOS and padding.
                time_seq = [t - min_t + 1 for t in time_seq]

                token_seq = [SOS_SYMBOL] + token_seq + [EOS_SYMBOL]
                time_seq = [0.0] + time_seq + [0.0]
                padding = [
                    PAD_SYMBOL for _ in range(max_seq_len - len(token_seq))
                ]
                token_seq = padding + token_seq
                time_seq = [0] * len(padding) + time_seq

                token_seq = tokenizer.convert_tokens_to_ids(token_seq)

                batch_token_seq.append(token_seq)
                batch_time_seq.append(time_seq)
                batch_indices.append("{},{}".format(os.path.basename(_file[0]),
                                                    _file[1]))

        with torch.no_grad():
            # Each file yields exactly one sequence, so no sub-batching is needed here.
            outputs = model(
                torch.tensor(batch_token_seq).to(device),
                torch.tensor(batch_time_seq).to(device), pos_mask)
            outputs = outputs.tolist()
            for idx, output in zip(batch_indices, outputs):
                fout.write("%s,%s\n" % (idx, ','.join(map(str, output))))
    fout.close()
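A hypothetical invocation; the directory layout is an assumption inferred from how the directory names and file contents are parsed above:

# Each directory name ends in a dot-separated day; each JSON file holds a list of
# '#'-separated event strings whose first field is a timestamp and whose last two
# fields form the token. Paths below are illustrative only.
get_map_preload_embeddings(
    ["data/map_preload.2019-08-01", "data/map_preload.2019-08-02"],
    "map_preload_embeddings.csv")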
Example 3
def get_bot_detection_embeddings(input_seq_dir, output_embds_file):
    """Embed bot-detection event sequences stored as JSON files, one sequence per file."""
    files = [os.path.join(input_seq_dir, e) for e in os.listdir(input_seq_dir)]
    n_batches = (len(files) + batch_size - 1) // batch_size

    fout = open(output_embds_file, 'w')

    for i in tqdm.tqdm(range(n_batches), desc=output_embds_file):
        batch_files = files[i * batch_size:(i + 1) * batch_size]
        batch_token_seq, batch_time_seq = [], []
        batch_indices = []
        for _file in batch_files:
            with open(_file, 'r', encoding='utf-8') as fin:
                events = json.load(fin)
                # Keep only the most recent (max_seq_len - 2) events, leaving room for SOS/EOS.
                events = events[-(max_seq_len - 2):]

                # Each event is a dict: "timestamp" gives the time, and the token is
                # "<log_id>#<design_id>" (design_id falls back to 0 when missing).
                time_seq = [
                    str2timestamp(e["timestamp"].replace(' ', '-'))
                    for e in events
                ]
                token_seq = [
                    '%s#%s' % (e['log_id'], e.get('design_id', 0))
                    for e in events
                ]
                min_t = min(time_seq)
                # Shift times so the earliest event maps to 1; 0 is reserved for SOS/EOS and padding.
                time_seq = [t - min_t + 1 for t in time_seq]

                token_seq = [SOS_SYMBOL] + token_seq + [EOS_SYMBOL]
                time_seq = [0.0] + time_seq + [0.0]
                padding = [
                    PAD_SYMBOL for _ in range(max_seq_len - len(token_seq))
                ]
                token_seq = padding + token_seq
                time_seq = [0] * len(padding) + time_seq

                token_seq = tokenizer.convert_tokens_to_ids(token_seq)

                batch_token_seq.append(token_seq)
                batch_time_seq.append(time_seq)
                batch_indices.append(os.path.basename(_file).replace(':', '-'))
        with torch.no_grad():
            outputs = model(
                torch.tensor(batch_token_seq).to(device),
                torch.tensor(batch_time_seq).to(device), pos_mask)
            outputs = outputs.tolist()
            for idx, output in zip(batch_indices, outputs):
                fout.write("%s,%s\n" % (idx, ','.join(map(str, output))))

    fout.close()
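A hypothetical invocation; the event-dict shape is inferred from the keys accessed above, and the values shown are placeholders:

# Each JSON file holds a list of dicts such as
#   {"timestamp": "2019-08-01 00:00:01", "log_id": 400347, "design_id": 12}
# Output lines are "<file_name>,<v1>,<v2>,...". Paths are illustrative only.
get_bot_detection_embeddings("data/bot_seqs", "bot_embeddings.csv")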