Example 1
    def __init__(self, lang, pool='pool', max_len=64, lang_code=250004):
        self.context = read_pkl(f'dataset/ckgc/{lang}/context.pkl')
        self.response = read_pkl(f'dataset/ckgc/{lang}/response.pkl')
        self.knowledge = read_pkl(f'dataset/ckgc/{lang}/knowledge.pkl')
        self.pool = [[int(item) for item in line[1:-1].split(',')]
                     for line in read_file(pool)]

        self.max_len = max_len
        self.lang_code = lang_code
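
The argument list above (lang, pool, max_len, lang_code) matches the CKGCTestData constructor used in the training examples below, so a minimal instantiation sketch, assuming this __init__ belongs to that class and the dataset/ckgc/<lang> files exist:

# Instantiation sketch; 250004 is the default mBART language code for English
# taken from the signature above.
test_dataset = CKGCTestData('en',
                            pool='dataset/ckgc/en/pool.txt',
                            max_len=64,
                            lang_code=250004)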
Example 2
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)
    parser.add_argument('-dialog2', type=str)
    parser.add_argument('-k', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-m', type=int)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()

    dialog_path = args.dialog
    dialog2_path = args.dialog2
    knowledge_path = args.k
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path
    language = args.language

    if knowledge_path != 'redis':
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path
    knowledge_pool = read_pkl(pool_path)

    dataset = DuoData(read_pkl(f'{dialog_path}/context.pkl'),
                      read_pkl(f'{dialog_path}/response.pkl'),
                      read_pkl(f'{dialog2_path}/context.pkl'),
                      read_pkl(f'{dialog2_path}/response.pkl'),
                      knowledge_pool, pool_size=pool_size, knowledge=knowledge, order=None,
                      max_len=max_len, lang_code=lang_code, curriculum=max_step)

    test_dataset = CKGCTestData(args.language, pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)

    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id

    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()

    optimizer = AdamW(generator.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        generator.load_state_dict({k.replace("module.", ""): v for k, v in torch.load(pretrained_path).items()})

    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        train_generator(generator, optimizer, dataset,
                        pad_idx=1, batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        predict, true = test_generator(generator, test_dataset, language, tokenizer,
                                       pad_idx=1, batch_size=batch_size, epoch=0, word_mask=None)
        logging.info(eval_all(predict, true))
        write_file(predict, f'{save_path}/predict/{cur_step}.txt')
        torch.save(generator.state_dict(), f'{save_path}/generator/{cur_step}.pt')
Example 3
#!/usr/bin/env python3
import numpy as np

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':

    loc = read_pkl('tmp/location.pkl')
    user_miss = read_pkl('tmp/user_miss_pair.pkl')

    user_miss_loc = {}

    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')

            checkins = checkins.split(',')
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]
            # checkins = [el for el in checkins if el[1] == '?' or loc[el[1]]['country'] == 'US']

            for i, checkin in enumerate(checkins):
                if checkin[1] != '?':
                    continue

                if user not in user_miss_loc:
                    user_miss_loc[user] = []

                if i != 0 and checkins[i - 1][1] != '?':
                    user_miss_loc[user].append(
                        (loc[checkins[i - 1][1]]['lat'],
                         loc[checkins[i - 1][1]]['lon']))
Example 4
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format=
        '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    )

    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)
    parser.add_argument('-k', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=1)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()

    dialog_path = args.dialog
    knowledge_path = args.k
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path
    language = args.language

    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0

    if knowledge_path != 'redis':
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path
    knowledge_pool = read_pkl(pool_path)

    dataset = Data(read_pkl(f'{dialog_path}/context.pkl'),
                   read_pkl(f'{dialog_path}/response.pkl'),
                   knowledge_pool,
                   pool_size=pool_size,
                   knowledge=knowledge,
                   order=None,
                   max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)

    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id

    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()
    if distributed:
        generator = torch.nn.parallel.DistributedDataParallel(
            generator,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(generator.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            generator.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            generator.load_state_dict({
                k.replace("module.", ""): v
                for k, v in torch.load(pretrained_path).items()
            })

    for epoch in range(100):
        if os.path.exists(f'{save_path}/generator/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                generator.load_state_dict(
                    torch.load(f'{save_path}/generator/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                generator.load_state_dict({
                    k.replace("module.", ""): v
                    for k, v in torch.load(f'{save_path}/generator/{epoch}.pt').items()
                })
            continue

        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_generator(generator,
                        optimizer,
                        dataset,
                        pad_idx=1,
                        batch_size=batch_size,
                        epoch=epoch,
                        distributed=distributed)

        if distributed:
            dist.barrier()
        if local_rank == 0:
            predict, true = test_generator(generator,
                                           test_dataset,
                                           language,
                                           tokenizer,
                                           pad_idx=1,
                                           batch_size=batch_size,
                                           epoch=epoch,
                                           word_mask=None)
            logging.info(eval_all(predict, true))
            write_file(predict, f'{save_path}/predict/{epoch}.txt')
            torch.save(generator.state_dict(),
                       f'{save_path}/generator/{epoch}.pt')
        if distributed:
            dist.barrier()
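
dist_init() is a project helper that is not shown in these examples; a minimal sketch of what such an initializer typically does under the standard torch.distributed launcher (an assumption, not the project's actual code):

import os

import torch
import torch.distributed as dist


def dist_init():
    # Assumes the script was started with torchrun / torch.distributed.launch,
    # which exports RANK, WORLD_SIZE and LOCAL_RANK in the environment.
    dist.init_process_group(backend='nccl', init_method='env://')
    torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', 0)))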
Example 5
#!/usr/bin/env python3
import numpy as np
from sklearn.preprocessing import normalize

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':

    user_checkins = read_pkl('tmp/user_checkins.pkl')
    loc_db = read_pkl('tmp/location.pkl')
    nodes = read_pkl('tmp/nodes.pkl')
    node_features = read_pkl('tmp/features.pkl')

    for i, node in enumerate(nodes):
        if node[-1] != '?':
            continue
        if np.sum(node_features[i][24:]) > 0:
            continue

        user = node[:-2]
        group_features = np.zeros((6, 1))

        for checkin in user_checkins[user]:
            if checkin in loc_db:
                g = loc_db[checkin]['group']
                group_features[g][0] += 1

        group_features = normalize(group_features, axis=0)

        for j in range(6):
            node_features[i][j + 24] = group_features[j][0]
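
For reference, normalize(..., axis=0) divides the (6, 1) count column above by its L2 norm; a self-contained toy check:

import numpy as np
from sklearn.preprocessing import normalize

counts = np.array([[3.0], [0.0], [4.0], [0.0], [0.0], [0.0]])
print(normalize(counts, axis=0).ravel())
# [0.6 0.  0.8 0.  0.  0. ]  -- the column divided by its L2 norm, 5.0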
Example 6
#!/usr/bin/env python3
import numpy as np

from sklearn.cluster import KMeans

from utils.io import read_pkl, distance

if __name__ == '__main__':

    loc_db = read_pkl('tmp/location.pkl')
    loc_in_checkins = {}

    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')

            checkins = checkins.split(',')
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]
            checkins = [
                el for el in checkins
                if el[1] != '?' and loc_db[el[1]]['country'] == 'US'
            ]

            for checkin in checkins:
                if checkin[1] not in loc_in_checkins:
                    loc_in_checkins[checkin[1]] = [
                        loc_db[checkin[1]]['lat'], loc_db[checkin[1]]['lon']
                    ]

    candidate = read_pkl('tmp/candidate.pkl')
Example 7
def main():
    logging.basicConfig(
        level=logging.INFO,
        format=
        '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    )

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', type=str)
    parser.add_argument('-o', type=str)
    parser.add_argument('-t', type=str)
    parser.add_argument('-m', type=str)
    parser.add_argument('--p', type=int, default=1)
    parser.add_argument('--redis', type=int, default=1)
    args = parser.parse_args()

    input_path = args.i
    output_path = args.o
    task = args.t
    processes = args.p
    method = args.m
    use_redis = args.redis

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    logging.info(f'Tokenize data, processes={processes}')

    if task == 'wiki':
        pool = multiprocessing.Pool(processes=processes)
        results = []
        file_num = len(all_file(input_path))
        step = max(file_num // processes, 1)  # avoid a zero step when processes > file_num
        for i in range(0, file_num, step):
            results.append(
                pool.apply_async(tokenize_wiki,
                                 (i, step, method, input_path, output_path)))
        pool.close()
        pool.join()

        if use_redis:
            import redis
            logging.info('Now build redis')
            data = []
            for i in range(1000):
                if os.path.exists(f'{output_path}/{i}.pkl'):
                    batch = read_pkl(f'{output_path}/{i}.pkl')
                    data.extend(batch)

            r = redis.StrictRedis(host='localhost', port=6379, db=0)
            pipe = r.pipeline()

            step = len(data) // 10
            for j, line in enumerate(data):
                key = str(j)
                value = pickle.dumps(line)
                pipe.set(key, value)
                if j % step == 0 and j != 0:
                    # print(j / len(data), 'execute')
                    pipe.execute()
            pipe.execute()
            # print('final execute done')
            # print('DONE!')

    else:
        context = [
            line[:-1].lower()
            for line in open(f'{input_path}/context.txt', encoding='utf-8')
        ]
        context_ids = do_multiprocessing(tokenize, context, processes)
        write_pkl(context_ids, f'{output_path}/context.pkl')

        response = [
            line[:-1].lower()
            for line in open(f'{input_path}/response.txt', encoding='utf-8')
        ]
        response_ids = do_multiprocessing(tokenize, response, processes)
        write_pkl(response_ids, f'{output_path}/response.pkl')

        if os.path.exists(f'{input_path}/knowledge.txt'):
            knowledge = [
                line[:-1].lower()
                for line in open(f'{input_path}/knowledge.txt',
                                 encoding='utf-8')
            ]
            knowledge_ids = do_multiprocessing(tokenize, knowledge, processes)
            write_pkl(knowledge_ids, f'{output_path}/knowledge.pkl')
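
The wiki branch above stores each tokenized line in Redis under the key str(j) with a pickled value; a minimal read-back sketch against the same local instance (key '0' is only an illustration):

import pickle

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
raw = r.get('0')  # value written above by pipe.set(str(j), pickle.dumps(line))
line = pickle.loads(raw) if raw is not None else None
print(line)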
Example 8
def get_test_mask():
    u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
    nodes = read_pkl('tmp/nodes.pkl')

    return [nodes.index(el) for el in u_m_pair]
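
nodes.index(el) rescans the whole node list for every pair, which is quadratic overall; an equivalent one-pass variant (a sketch, not from the source, assuming node ids are unique):

from utils.io import read_pkl


def get_test_mask_fast():
    u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
    nodes = read_pkl('tmp/nodes.pkl')
    pos = {node: i for i, node in enumerate(nodes)}  # node id -> row index
    return [pos[el] for el in u_m_pair]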
Example 9

def get_test_mask():
    u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
    nodes = read_pkl('tmp/nodes.pkl')

    return [nodes.index(el) for el in u_m_pair]


if __name__ == '__main__':
    root_path = sys.argv[1]

    if not os.path.isdir(root_path):
        os.makedirs(root_path)

    node_features = read_pkl('tmp/features.pkl')
    node_labels = read_pkl('tmp/labels.pkl')
    train_mask = read_pkl('tmp/train_mask.pkl')
    adj_matrix = sparse.load_npz('tmp/graph.npz')

    masks = get_k_fold_mask(idx_list=train_mask, folds=5)

    perf = []

    for i in range(5):
        model_path = root_path + '/' + str(i)
        os.makedirs(model_path)

        train_mask = [np.array(masks[j]) for j in range(5) if j != i]
        train_mask = np.concatenate(train_mask, axis=0)
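
get_k_fold_mask is a project helper whose implementation is not shown; one plausible version consistent with how masks[j] is used above (an assumption, not the original code):

def get_k_fold_mask(idx_list, folds):
    # Deal the training indices into `folds` roughly equal, disjoint chunks.
    return [list(idx_list[i::folds]) for i in range(folds)]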
Example 10
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format=
        '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    )

    parser = argparse.ArgumentParser()
    parser.add_argument('-q', type=str)
    parser.add_argument('-d', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()

    query_path = args.q
    document_path = args.d
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path

    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0

    logging.info(
        f'Load query from {query_path} and document from {document_path}')

    query = read_pkl(query_path)
    if document_path != 'redis':
        document = []
        for i in range(200):
            if os.path.exists(f'{document_path}/{i}.pkl'):
                document.extend(read_pkl(f'{document_path}/{i}.pkl'))
    else:
        document = document_path
    knowledge_pool = read_pkl(pool_path)

    dataset = Data(query,
                   query,
                   knowledge_pool,
                   pool_size=pool_size,
                   knowledge=document,
                   order=None,
                   max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)

    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()
    if distributed:
        retriever = torch.nn.parallel.DistributedDataParallel(
            retriever,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(retriever.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            retriever.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            retriever.load_state_dict({
                k.replace("module.", ""): v
                for k, v in torch.load(pretrained_path).items()
            })

    for epoch in range(100):
        if os.path.exists(f'{save_path}/retriever/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                retriever.load_state_dict(
                    torch.load(f'{save_path}/retriever/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                retriever.load_state_dict({
                    k.replace("module.", ""): v
                    for k, v in torch.load(f'{save_path}/retriever/{epoch}.pt').items()
                })
            continue

        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_retriever(retriever,
                        optimizer,
                        dataset,
                        pad_idx=1,
                        batch_size=batch_size,
                        epoch=epoch,
                        distributed=distributed)

        if distributed:
            dist.barrier()
        if local_rank == 0:
            ranks = test_retriever(retriever,
                                   test_dataset,
                                   pad_idx=1,
                                   batch_size=batch_size,
                                   epoch=epoch)
            write_file(ranks, f'{save_path}/ranks/{epoch}.txt')
            torch.save(retriever.state_dict(),
                       f'{save_path}/retriever/{epoch}.pt')
        if distributed:
            dist.barrier()
Example 11
#!/usr/bin/env python3
from pprint import pprint

import numpy as np
from scipy import sparse

from utils.io import read_pkl, save_pkl

loc_db = read_pkl('tmp/location.pkl')
candidate = read_pkl('tmp/candidate.pkl')
nodes = read_pkl('tmp/nodes.pkl')
tag2class = read_pkl('tmp/tag2class.pkl')


def get_train_class_weight():
    counter = {}

    for node in nodes:
        if node[-1] == '?':
            continue

        c = loc_db[node]['tag']

        if c not in counter:
            counter[c] = 0

        counter[c] += 1

    return counter
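
A quick way to inspect the resulting label distribution (a usage sketch, not part of the original file):

if __name__ == '__main__':
    counts = get_train_class_weight()
    pprint(sorted(counts.items(), key=lambda kv: -kv[1]))  # most frequent tags first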

Example 12
#!/usr/bin/env python3
import sys
import os

import numpy as np
from sklearn.metrics import accuracy_score

from utils.tfpkg.models import Evaluator
from utils.io import read_pkl
from utils.location import distance

model_path = sys.argv[1]

nodes = read_pkl('tmp/nodes.pkl')
loc_db = read_pkl('tmp/location.pkl')
candidate = read_pkl('tmp/candidate.pkl')
node_features = read_pkl('tmp/features.pkl')
node_labels = read_pkl('tmp/labels.pkl')
user_checkins = read_pkl('tmp/user_checkins.pkl')
user_miss_loc = read_pkl('tmp/user_miss_loc.pkl')
u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
categorical = read_pkl('tmp/categorical.pkl')


def top_k_accuracy(y_true, y_pred, k):
    total = y_true.shape[0]
    p = 0

    top_k_indices = np.argsort(y_pred, axis=1)[:, -k:]
    ground_truth = np.argmax(y_true, axis=1)

    for i in range(total):
        if ground_truth[i] in top_k_indices[i]:
            p += 1

    return p / total
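
Assuming the completion above (the return value is the fraction of rows whose true class appears among the k highest-scoring predictions), a toy check:

y_true = np.array([[0, 1, 0],
                   [1, 0, 0]])
y_pred = np.array([[0.2, 0.5, 0.3],
                   [0.1, 0.6, 0.3]])
print(top_k_accuracy(y_true, y_pred, k=2))  # 0.5: only the first row's label is in its top 2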
Example 13
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-q1', type=str)
    parser.add_argument('-q2', type=str)
    parser.add_argument('-d', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-m', type=int)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()

    query1_path = args.q1
    query2_path = args.q2
    document_path = args.d
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path

    logging.info(f'Load query from {query1_path}-{query2_path} and document from {document_path}')

    query1 = read_pkl(query1_path)
    query2 = read_pkl(query2_path)
    if document_path != 'redis':
        document = []
        for i in range(200):
            if os.path.exists(f'{document_path}/{i}.pkl'):
                document.extend(read_pkl(f'{document_path}/{i}.pkl'))
    else:
        document = document_path
    knowledge_pool = read_pkl(pool_path)

    dataset = DuoData(query1, query1, query2, query2, knowledge_pool, pool_size=pool_size, knowledge=document,
                      order=None, max_len=max_len, lang_code=lang_code, curriculum=max_step)
    test_dataset = CKGCTestData(args.language, pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)

    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()

    optimizer = AdamW(retriever.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        retriever.load_state_dict({k.replace("module.", ""): v for k, v in torch.load(pretrained_path).items()})

    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        train_retriever(retriever, optimizer, dataset, pad_idx=1, batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        ranks = test_retriever(retriever, test_dataset, pad_idx=1, batch_size=batch_size, epoch=0)
        write_file(ranks, f'{save_path}/ranks/{cur_step}.txt')
        torch.save(retriever.state_dict(), f'{save_path}/retriever/{cur_step}.pt')
Example 14
#!/usr/bin/env python3
from pprint import pprint

import numpy as np

from utils.io import read_pkl

loc_db = read_pkl('tmp/location.pkl')
categorical = read_pkl('tmp/categorical.pkl')
labels = read_pkl('tmp/labels.pkl')
nodes = read_pkl('tmp/nodes.pkl')
u_m_pair = read_pkl('tmp/user_miss_pair.pkl')


def get_test_mask():
    return [nodes.index(el) for el in u_m_pair]


if __name__ == '__main__':

    labels = np.argmax(labels, axis=1)
    test_mask = get_test_mask()

    counter = {}

    for place in nodes:
        if place[-1] == '?':
            continue

        c = loc_db[place]['tag']
Example 15
#!/usr/bin/env python3
import numpy as np

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':

    loc_db = read_pkl('tmp/location.pkl')
    tag2class = read_pkl('tmp/tag2class.pkl')

    candidate = {}

    with open('raw/candidate_100_places.txt', 'r') as f:
        lines = f.readlines()
        lines = [el.rstrip('\n') for el in lines]

    for place in lines:
        candidate[place] = loc_db[place]
        tag = candidate[place]['tag']
        label = tag if tag not in tag2class else tag2class[tag]
        loc_db[place]['class'] = label

    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')

            checkins = checkins.split(',')
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]

            for checkin in checkins:
                if checkin[1] == '?':
Example 16
#!/usr/bin/env python3
import numpy as np

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':

    loc_db = read_pkl('tmp/location.pkl')
    user_checkins = {}

    c = 0
    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')

            checkins = checkins.split(',')
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]
            checkins = [el for el in checkins if el[1] in loc_db]
            # checkins = [el for el in checkins if el[1] in loc_db and loc_db[el[1]]['country'] == 'US']

            if user not in user_checkins:
                user_checkins[user] = set()

            for checkin in checkins:
                user_checkins[user].add(checkin[1])

    save_pkl('tmp/user_checkins.pkl', user_checkins)
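
A quick sanity check of the saved mapping (a sketch; read_pkl from utils.io is the counterpart of save_pkl used throughout these examples):

user_checkins = read_pkl('tmp/user_checkins.pkl')
print(len(user_checkins), 'users,',
      sum(len(v) for v in user_checkins.values()), 'distinct (user, place) pairs')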