Example #1
def load_questions(p=defaultqp):
    tt = q.ticktock("question loader")
    tt.tick("loading questions")
    questions, queries = q.StringMatrix(), q.StringMatrix()
    xquestions, xqueries = q.StringMatrix(), q.StringMatrix()

    queries.tokenize = lambda x: x.split()
    xqueries.tokenize = lambda x: x.split()

    with open(p + ".train.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            questions.add(question)
            queries.add(query)

    questions.finalize()
    queries.finalize()

    with open(p + ".test.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            xquestions.add(question)
            xqueries.add(query)

    xquestions.finalize()
    xqueries.finalize()
    tt.tock("loaded questions")
    return (questions, queries), (xquestions, xqueries)
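A minimal usage sketch for the loader above (hypothetical; it assumes the qelos-style q.StringMatrix API used throughout these examples, where .matrix holds the padded id matrix and .D the token-to-id dictionary):

(questions, queries), (xquestions, xqueries) = load_questions()
# train and test live in separate StringMatrix objects, so each pair
# gets its own vocabulary and padding length
print(questions.matrix.shape, len(questions.D))
print(xquestions.matrix.shape, len(xquestions.D))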
Example #2
def load_questions_inone(p=defaultqp):
    tt = q.ticktock("question loader")
    tt.tick("loading questions")
    questions, queries = q.StringMatrix(), q.StringMatrix()
    qids = []
    queries.tokenize = lambda x: x.split()

    with open(p + ".train.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            questions.add(question)
            queries.add(query)
            qids.append(qid)

    tx_sep = len(qids)

    with open(p + ".test.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            questions.add(question)
            queries.add(query)
            qids.append(qid)

    questions.finalize()
    queries.finalize()

    return questions, queries, qids, tx_sep
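Here train and test share one vocabulary and one matrix, and tx_sep marks where the test rows begin. A sketch of splitting the combined output (same hypothetical API assumptions as above):

questions, queries, qids, tx_sep = load_questions_inone()
train_q = questions.matrix[:tx_sep]    # rows added from *.train.butd
test_q = questions.matrix[tx_sep:]     # rows added from *.test.butd
train_qids, test_qids = qids[:tx_sep], qids[tx_sep:]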
Example #3
def load_data(p="../../../data/lcquad-fql/"):
    trainp = os.path.join(p, "train.json")
    testp = os.path.join(p, "test.json")
    print(f"Loading data from: '{trainp}' (train) and '{testp}' (test)")
    traindata = json.load(open(trainp))
    print(f"Number of training examples: {len(traindata)}")
    testdata = json.load(open(testp))
    print(f"Number of test examples: {len(testdata)}")

    # process logical forms
    # parse to trees, replace entities with placeholders in queries
    tdata = [(example["question"],
              ent2placeholder(fql2tree(example["logical_form"])))
             for example in traindata]
    xdata = [(example["question"],
              ent2placeholder(fql2tree(example["logical_form"])))
             for example in testdata]

    # get node types that have children
    parentnodes = set()
    for (_, e) in tdata + xdata:
        que = [e]
        while len(que) > 0:
            head = que.pop(0)
            if head.children is not None:
                assert (len(head.children) > 0)
                parentnodes.add(head.name)
                que += head.children

    print(
        f"Types of nodes that have children ({len(parentnodes)}): \n{parentnodes}"
    )

    # build string matrices
    teststart = len(tdata)
    xsm = q.StringMatrix(indicate_start_end=True)
    ysm = q.StringMatrix(indicate_start=True)
    ysm.tokenize = lambda x: x.split()
    for question, l in tdata + xdata:
        xsm.add(question)
        ysm.add(l.to_transitions() + " <MASK>")
    xsm.finalize()
    ysm.finalize()

    tok2act = {}
    for tok in ysm.D:
        if tok == "<RED>":
            tok2act[ysm.D[tok]] = 2
        elif tok in parentnodes:
            tok2act[ysm.D[tok]] = 1
        else:
            tok2act[ysm.D[tok]] = 0
    return xsm, ysm, teststart, tok2act
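tok2act maps every id in the output vocabulary to a tree-transition action: 2 for the <RED> reduce token, 1 for tokens that open children, and 0 for leaves. A hypothetical sketch of reading one row back as actions:

xsm, ysm, teststart, tok2act = load_data()
row = ysm.matrix[0]                       # a training example; test rows start at teststart
actions = [tok2act[tokid] for tokid in row]
# 1 -> this token opens a subtree, 2 -> closes the current one, 0 -> leaf/other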
Example #4
def load_word_mat(
        origp="../../data/buboqa/data/processed_simplequestions_dataset/"):
    outp = os.path.join(origp, "all.pkl")
    generate = not os.path.exists(outp)  # rebuild only when no cached pickle exists
    if generate:
        trainp = os.path.join(origp, "train.txt")
        validp = os.path.join(origp, "valid.txt")
        testp = os.path.join(origp, "test.txt")
        trainlines = open(trainp, encoding="utf8").readlines()
        validlines = open(validp, encoding="utf8").readlines()
        testlines = open(testp, encoding="utf8").readlines()
        sm = q.StringMatrix()
        sm.tokenize = lambda x: x.split()

        i = 0
        for line in tqdm(trainlines):
            sm.add(line.split("\t")[5])
            i += 1
        devstart = i
        for line in tqdm(validlines):
            sm.add(line.split("\t")[5])
            i += 1
        teststart = i
        for line in tqdm(testlines):
            sm.add(line.split("\t")[5])

        sm.finalize()
        print(len(sm.D))
        print(sm[0])
        pkl.dump((sm.matrix, sm.D, (devstart, teststart)), open(outp, "wb"))
        return sm.matrix, sm.D, (devstart, teststart)
    else:
        wordmat, wordD, (devstart, teststart) = pkl.load(open(outp, "rb"))
        return wordmat, wordD, (devstart, teststart)
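With the return added to the generate branch, both paths yield the same triple, and devstart/teststart mark where the validation and test rows begin. A usage sketch (hypothetical):

wordmat, wordD, (devstart, teststart) = load_word_mat()
train_mat = wordmat[:devstart]
valid_mat = wordmat[devstart:teststart]
test_mat = wordmat[teststart:]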
Example #5
def build_entity_matrices(info):
    tt = q.ticktock("entity matrix builder")
    tt.tick("building")
    # build
    ids = []
    names = q.StringMatrix()
    names.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    nameschars = q.StringMatrix()
    aliases = q.StringMatrix()
    aliases.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    nameschars.tokenize = lambda x: " ".join(q.tokenize(x)
                                             ) if x != "<RARE>" else [x]
    typenames = q.StringMatrix()
    typenames.tokenize = lambda x: q.tokenize(
        x, preserve_patterns=['<[A-Z]+>']) if x != "<RARE>" else [x]
    types = q.StringMatrix()
    types.tokenize = lambda x: x
    notabletypenames = q.StringMatrix()
    notabletypenames.tokenize = lambda x: q.tokenize(
        x) if x != "<RARE>" else [x]
    for key, val in info.items():
        ids.append(key)
        name = list(val["name"])[0] if val["name"] is not None else "<RARE>"
        names.add(name)
        nameschars.add(name)
        alias = " <SEP> ".join(list(
            val["aliases"])) if val["aliases"] is not None else "<RARE>"
        aliases.add(alias)
        typename = " <SEP> ".join(list(
            val["typenames"])) if val["typenames"] is not None else "<RARE>"
        typenames.add(typename)
        typ = list(val["types"]) if val["types"] is not None else ["<RARE>"]
        types.add(typ)
        notabletypename = list(
            val["notabletypenames"]
        )[0] if val["notabletypenames"] is not None else "<RARE>"
        notabletypenames.add(notabletypename)
    tt.tock("built")

    tt.tick("finalizing")
    names.finalize()
    nameschars.finalize()
    aliases.finalize()
    typenames.finalize()
    notabletypenames.finalize()
    types.finalize()
    tt.tock("finalized")

    edic = dict(zip(ids, range(len(ids))))

    return edic, names, nameschars, aliases, typenames, notabletypenames, types
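edic maps each entity id to its row index, so all returned matrices are row-aligned. A lookup sketch (hypothetical; some_entity_id stands for any key of the info dict passed in):

edic, names, nameschars, aliases, typenames, notabletypenames, types = build_entity_matrices(info)
row = edic[some_entity_id]
name_ids = names.matrix[row]       # token ids of the entity's name
alias_ids = aliases.matrix[row]    # token ids of its "<SEP>"-joined aliases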
Example #6
def load_data(p="../../data/buboqa/data/processed_simplequestions_dataset/",
              relp="../../data/buboqa/data/rels.txt",
              typep="../../data/buboqa/data/ent2type.pkl",
              outp="../../data/buboqa/data/bertified_dataset_v2",
              ):
    tt = q.ticktock("dataloader")
    tt.tick("loading files")
    trainlines = open(p+"train.txt", encoding="utf8").readlines()
    devlines = open(p+"valid.txt", encoding="utf8").readlines()
    testlines = open(p+"test.txt", encoding="utf8").readlines()
    allrels = [x.strip() for x in open(relp).readlines()]
    ent2type = pkl.load(open(typep, "rb"))
    tt.tock("files loaded")
    tt.tick("splitting")
    trainlines = [line.strip().split("\t") for line in trainlines]
    devlines = [line.strip().split("\t") for line in devlines]
    testlines = [line.strip().split("\t") for line in testlines]
    tt.tock("splitted")

    tt.tick("doing some stats")
    stt = q.ticktock("datastats")
    trainrels = set([line[3] for line in trainlines])
    devrels = set([line[3] for line in devlines])
    testrels = set([line[3] for line in testlines])
    unkrels = set()
    for line in testlines:
        if line[3] not in trainrels:
            unkrels.add(line[0])

    stt.msg("{}/{} unique rels in test not in train ({})"
            .format(len(testrels-trainrels), len(testrels), len(trainrels)))
    stt.msg("{}/{} unique rels in devnot in train ({})"
            .format(len(devrels-trainrels), len(devrels), len(trainrels)))
    stt.msg("{} unique rels".format(len(trainrels | devrels | testrels)))
    stt.msg("{}/{} unkrel cases in test".format(len(unkrels), len(testlines)))

    # print(trainlines[3])

    tt.tick("creating word matrix")
    sm = q.StringMatrix(specialtoks=["<ENT>"], indicate_end=True)
    sm.tokenize = lambda x: x.split()
    wordborders = np.zeros((len(trainlines) + len(devlines) + len(testlines), 2), dtype="int64")

    def do_line(line_, i_):
        try:
            sm.add(line_[5])
            previo = "O"
            ioline = line_[6]
            if "[" in ioline or not "I" in ioline:
                print(ioline)
            ioline = ioline.replace("'", "").replace("[", "").replace("]", "").replace(",", "")
            io = ioline.split() + ["O"]
            k = 0
            for j in range(len(io)):
                if io[j] != previo:
                    if k > 1:
                        print(line_)
                    wordborders[i_, k] = j
                    previo = io[j]
                    k += 1
        except Exception as e:
            print(e)
            print(line_)

    i = 0
    for line in tqdm(trainlines):
        do_line(line, i)
        i += 1
    word_devstart = i
    for line in tqdm(devlines):
        do_line(line, i)
        i += 1
    word_teststart = i
    for line in tqdm(testlines):
        do_line(line, i)
        i += 1

    sm.finalize()
    print(len(sm.D))
    print(sm[0])
    tt.tock("created word matrix")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    def bertify(line):
        try:
            subj = line[1]
            subjtype = ent2type[subj] if subj in ent2type else "none"
            rel = line[3]
            sent = "[CLS] {} [SEP]".format(line[5].lower())
            span = "O {} O".format(line[6]).split()
            bertsent = []       #tokenizer.basic_tokenizer.tokenize(sent)
            unberter = []
            sent = sent.split()
            bertspan = []
            for i, (token, io) in enumerate(zip(sent, span)):
                berttokens = tokenizer.tokenize(token)
                bertsent += berttokens
                bertspan += [io] * len(berttokens)
                unberter += [i] * len(berttokens)
        except Exception as e:
            print(e)
            print(line)
            # raise e
        return bertsent, bertspan, rel, unberter, subjtype

    k = 1331
    ret = bertify(trainlines[k])
    print(tabulate(ret[0:2]))
    print(ret[2])
    print(tabulate([trainlines[k][5].split(), trainlines[k][6].split()]))

    tt.tick("bertifying")
    bert_tokens_train, bert_io_train, bert_rel_train, unberter_train, bert_type_train = zip(*[bertify(line) for line in trainlines])
    bert_tokens_dev,   bert_io_dev,   bert_rel_dev,   unberter_dev, bert_type_dev   = zip(*[bertify(line) for line in devlines])
    bert_tokens_test,  bert_io_test,  bert_rel_test,  unberter_test, bert_type_test  = zip(*[bertify(line) for line in testlines])
    tt.tock("bertified")

    print(tabulate([bert_tokens_train[3], bert_io_train[3], unberter_train[3]]))
    print(bert_rel_train[3])

    print("{} entities in train have 'none' type".format())

    # construct numpy matrix with ids in bert vocabulary
    # and also, numpy matrix with spans
    # and also, numpy vector of relations and dictionary
    tt.tick("creating token matrix")
    assert(tokenizer.convert_tokens_to_ids(["[PAD]"]) == [0])
    maxlen = max([max([len(x) for x in bert_toks]) for bert_toks
                  in [bert_tokens_train, bert_tokens_dev, bert_tokens_test]])
    print(maxlen)
    tokmat = np.zeros((len(bert_tokens_train) + len(bert_tokens_dev) + len(bert_tokens_test),
                       maxlen), dtype="int32")
    i = 0
    for bert_toks in [bert_tokens_train, bert_tokens_dev, bert_tokens_test]:
        for x in bert_toks:
            xids = tokenizer.convert_tokens_to_ids(x)
            tokmat[i, :len(xids)] = xids
            i += 1
    devstart = len(bert_tokens_train)
    teststart = len(bert_tokens_train) + len(bert_tokens_dev)

    assert word_devstart == devstart and word_teststart == teststart

    print(tokmat.shape)
    tt.tock("token matrix created")

    tt.tick("creating io matrix")
    iomat = np.zeros_like(tokmat)
    iobordersmat = np.zeros((tokmat.shape[0], 2), dtype="int32")
    i = 0
    for bert_io in [bert_io_train, bert_io_dev, bert_io_test]:
        for x in bert_io:
            xids = [1 if xe == "O" else 2 for xe in x]
            iomat[i, :len(xids)] = xids
            ioborders = []
            for j in range(1, len(xids)):
                if xids[j] != xids[j-1]:
                    ioborders.append(j)
            iobordersmat[i, :len(ioborders)] = ioborders
            i += 1
    tt.tock("io matrix created")

    # unbert mat
    unbertmat = np.zeros_like(tokmat)
    i = 0
    for unberter in [unberter_train, unberter_dev, unberter_test]:
        for unbert_i in unberter:
            unbertmat[i, :len(unbert_i)] = [xe+1 for xe in unbert_i]
            i += 1

    tt.tick("testing")
    test_i = 1331
    test_tokids = [xe for xe in tokmat[test_i] if xe != 0]
    test_ios = iomat[test_i, :len(test_tokids)]
    test_tokens = tokenizer.convert_ids_to_tokens(test_tokids)
    print(tabulate([test_tokens, test_ios]))
    print(iobordersmat[test_i])
    tt.tock("tested")

    tt.tick("doing relations")
    bert_rel_all = bert_rel_train + bert_rel_dev + bert_rel_test
    allrelwcounts = dict(zip(allrels, [0]*len(allrels)))
    for rel in bert_rel_train:
        allrelwcounts[rel] += 1
    allrelwcounts = sorted(allrelwcounts.items(), key=lambda x: x[1], reverse=True)
    print(allrelwcounts[0])
    tt.msg("{} total unique rels".format(len(allrelwcounts)))
    relD = dict(zip(allrels, range(len(allrels))))
    rels = [relD[xe] for xe in bert_rel_all]
    rels = np.array(rels).astype("int32")
    relcounts = [rel[1] for rel in allrelwcounts]
    relcounts = np.array(relcounts).astype("int32")
    tt.tock("done relations")

    np.savez(outp, wordmat=sm.matrix, worddic=sm.D, wordborders=wordborders,
             tokmat=tokmat, iomat=iomat, tokborders=iobordersmat,
             rels=rels, relD=relD, relcounts=relcounts, unbertmat=unbertmat,
             devstart=devstart, teststart=teststart)

    threshold = 2
    stt.msg("{} unique rels at least {} time(s) in train data".format(
        len([xe for xe in allrelwcounts if xe[1] > threshold]), threshold))
    rarerels = set([xe[0] for xe in allrelwcounts if xe[1] <= threshold])
    testrarecount = 0
    for rel in bert_rel_test:
        if rel in rarerels:
            testrarecount += 1
    stt.msg("{}/{} test examples affected by rare rel".format(
        testrarecount, len(bert_rel_test)
    ))

    tt.tick("reload")
    reloaded = np.load(outp + ".npz", allow_pickle=True)  # worddic/relD are pickled dicts
    _relD = reloaded["relD"].item()
    _tokmat = reloaded["tokmat"]
    print(reloaded["devstart"])
    tt.tock("reloaded")
Example #7
def gen_datasets(which="geo"):
    pprefix = "../data/"
    if which == "geo":
        pprefix = pprefix + "geoqueries/dong2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    elif which == "atis":
        pprefix += "atis/dong2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "dev.txt"
        testp = pprefix + "test.txt"
    elif which == "jobs":
        pprefix += "jobqueries/dong2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    else:
        raise q.SumTingWongException("unknown dataset")

    nlsm = q.StringMatrix(indicate_start_end=True)
    nlsm.tokenize = lambda x: x.split()
    flsm = q.StringMatrix(
        indicate_start_end=True if which == "jobs" else False)
    flsm.tokenize = lambda x: x.split()
    devstart, teststart, i = 0, 0, 0
    with open(trainp) as tf, open(validp) as vf, open(testp) as xf:
        for line in tf:
            line_nl, line_fl = line.strip().split("\t")
            line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
        devstart = i
        for line in vf:
            line_nl, line_fl = line.strip().split("\t")
            line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
        teststart = i
        for line in xf:
            line_nl, line_fl = line.strip().split("\t")
            line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
    nlsm.finalize()
    flsm.finalize()

    nlmat = torch.tensor(nlsm.matrix).long()
    flmat = torch.tensor(flsm.matrix).long()
    gold = torch.tensor(flsm.matrix[:, 1:]).long()
    gold = torch.cat([gold, torch.zeros_like(gold[:, 0:1])], 1)
    tds = torch.utils.data.TensorDataset(nlmat[:devstart], flmat[:devstart],
                                         gold[:devstart])
    vds = torch.utils.data.TensorDataset(nlmat[devstart:teststart],
                                         flmat[devstart:teststart],
                                         gold[devstart:teststart])
    xds = torch.utils.data.TensorDataset(nlmat[teststart:], flmat[teststart:],
                                         gold[teststart:])
    return (tds, vds, xds), nlsm.D, flsm.D
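The returned TensorDatasets plug directly into standard PyTorch data loading; for example (a sketch, the batch size is arbitrary):

(tds, vds, xds), nlD, flD = gen_datasets("geo")
trainloader = torch.utils.data.DataLoader(tds, batch_size=32, shuffle=True)
nl_batch, fl_batch, gold_batch = next(iter(trainloader))
print(nl_batch.shape, fl_batch.shape, gold_batch.shape)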
Example #8
def gen_datasets(which="geo"):
    pprefix = "../data/"
    if which == "geo":
        pprefix = pprefix + "geoqueries/jia2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    elif which == "atis":
        pprefix += "atis/jia2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "dev.txt"
        testp = pprefix + "test.txt"
    elif which == "jobs":
        assert(False) # jia didn't do jobs
        pprefix += "jobqueries"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    else:
        raise q.SumTingWongException("unknown dataset")

    nlsm = q.StringMatrix(indicate_start_end=True)
    nlsm.tokenize = lambda x: x.split()
    flsm = q.StringMatrix(indicate_start_end=True if which == "jobs" else False)
    flsm.tokenize = lambda x: x.split()
    devstart, teststart, i = 0, 0, 0
    trainwords = set()
    trainwordcounts = {}
    testwords = set()
    trainwords_fl = set()
    trainwordcounts_fl = {}
    testwords_fl = set()
    with open(trainp) as tf, open(validp) as vf, open(testp) as xf:
        for line in tf:
            line_nl, line_fl = line.strip().split("\t")
            line_fl = line_fl.replace("' ", "")
            # line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            trainwords |= set(line_nl.split())
            for word in set(line_nl.split()):
                if word not in trainwordcounts:
                    trainwordcounts[word] = 0
                trainwordcounts[word] += 1
            trainwords_fl |= set(line_fl.split())
            for word in set(line_fl.split()):
                if word not in trainwordcounts_fl:
                    trainwordcounts_fl[word] = 0
                trainwordcounts_fl[word] += 1
            i += 1
        devstart = i
        for line in vf:
            line_nl, line_fl = line.strip().split("\t")
            line_fl = line_fl.replace("' ", "")
            # line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
        teststart = i
        for line in xf:
            line_nl, line_fl = line.strip().split("\t")
            line_fl = line_fl.replace("' ", "")
            # line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            testwords |= set(line_nl.split())
            testwords_fl |= set(line_fl.split())
            i += 1
    nlsm.finalize()
    flsm.finalize()

    # region get gate sup
    gatesups = torch.zeros(flsm.matrix.shape[0], flsm.matrix.shape[1]+1, dtype=torch.long)
    for i in range(nlsm.matrix.shape[0]):
        nl_sent = nlsm[i].split()
        fl_sent = flsm[i].split()
        inid = False
        for j, fl_sent_token in enumerate(fl_sent):
            if re.match("_\w+id", fl_sent_token):
                inid = True
            elif fl_sent_token == ")":
                inid = False
            elif fl_sent_token == "(":
                pass
            else:
                if inid:
                    if fl_sent_token in nl_sent:
                        gatesups[i, j] = 1

    # endregion

    # region print analysis
    print("{} unique words in train, {} unique words in test, {} in test but not in train"
          .format(len(trainwords), len(testwords), len(testwords - trainwords)))
    print(testwords - trainwords)
    trainwords_once = set([k for k, v in trainwordcounts.items() if v < 2])
    print("{} unique words in train that occur only once ({} of them is in test)".format(len(trainwords_once), len(trainwords_once & testwords)))
    print(trainwords_once)
    trainwords_twice = set([k for k, v in trainwordcounts.items() if v < 3])
    print("{} unique words in train that occur only twice ({} of them is in test)".format(len(trainwords_twice), len(trainwords_twice & testwords)))
    rarerep = trainwords_once | (testwords - trainwords)
    print("{} unique rare representation words".format(len(rarerep)))
    print(rarerep)

    trainwords_fl_once = set([k for k, v in trainwordcounts_fl.items() if v < 2])
    rarerep_fl = trainwords_fl_once | (testwords_fl - trainwords_fl)
    print("{} unique rare rep words in logical forms".format(len(rarerep_fl)))
    print(rarerep_fl)
    # endregion

    # region create datasets
    nlmat = torch.tensor(nlsm.matrix).long()
    flmat = torch.tensor(flsm.matrix).long()
    gold = torch.tensor(flsm.matrix[:, 1:]).long()
    gold = torch.cat([gold, torch.zeros_like(gold[:, 0:1])], 1)
    tds = torch.utils.data.TensorDataset(nlmat[:devstart], flmat[:devstart], gold[:devstart], gatesups[:devstart][:, 1:])
    vds = torch.utils.data.TensorDataset(nlmat[devstart:teststart], flmat[devstart:teststart], gold[devstart:teststart])
    xds = torch.utils.data.TensorDataset(nlmat[teststart:], flmat[teststart:], gold[teststart:])
    # endregion
    return (tds, vds, xds), nlsm.D, flsm.D, rarerep, rarerep_fl
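Note that only the training dataset carries the extra gate-supervision tensor; validation and test items have three tensors. A sketch of unpacking the items (hypothetical):

(tds, vds, xds), nlD, flD, rarerep, rarerep_fl = gen_datasets("geo")
nl, fl, gold, gate = tds[0]      # train items: question, logical form, shifted gold, gate labels
nl_v, fl_v, gold_v = vds[0]      # validation/test items carry no gate supervision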
Example #9
def build_relation_matrices(info):
    tt = q.ticktock("relation matrix builder")
    tt.tick("building")

    # build
    ids = []
    names = q.StringMatrix()
    names.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    domains = q.StringMatrix()
    domains.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    ranges = q.StringMatrix()
    ranges.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    urlwords = q.StringMatrix()
    urlwords.tokenize = lambda x: q.tokenize(x, preserve_patterns=['<[A-Z]+>']
                                             ) if x != "<RARE>" else [x]
    urltokens = q.StringMatrix()
    urltokens.tokenize = lambda x: x
    domainids = q.StringMatrix()
    domainids.tokenize = lambda x: x
    rangeids = q.StringMatrix()
    rangeids.tokenize = lambda x: x

    for key, val in info.items():
        ids.append(key)
        name = list(val["name"])[0] if val["name"] is not None else "<RARE>"
        names.add(name)
        domain = list(val["domainname"]
                      )[0] if val["domainname"] is not None else "<RARE>"
        domains.add(domain)
        rangename = list(
            val["rangename"])[0] if val["rangename"] is not None else "<RARE>"
        ranges.add(rangename)
        rangeid = list(
            val["range"]) if val["range"] is not None else ["<RARE>"]
        rangeids.add(rangeid)
        domainid = list(
            val["domain"]) if val["domain"] is not None else ["<RARE>"]
        domainids.add(domainid)
        splits = key[1:].split(".")
        if splits[0] == "user":
            splits = splits[2:]
        while len(splits) < 3:
            splits = ["default"] + splits
        url = ".".join(splits)
        urlword = " <SEP> ".join(splits)
        urlwords.add(urlword)
        urltoken = [".".join(splits[:-2]), splits[-2], splits[-1]]
        urltokens.add(urltoken)
    tt.tock("built")

    tt.tick("finalizing")
    names.finalize()
    domains.finalize()
    ranges.finalize()
    rangeids.finalize()
    domainids.finalize()
    urlwords.finalize()
    urltokens.finalize()
    tt.tock("finalized")

    rdic = dict(zip(ids, range(len(ids))))

    return rdic, names, domains, ranges, domainids, rangeids, urlwords, urltokens
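A worked sketch of the URL handling above, on a hypothetical Freebase-style relation key (the leading character is stripped by key[1:]):

key = "/film.film.starring"                                  # hypothetical key
splits = key[1:].split(".")                                  # ["film", "film", "starring"]
urlword = " <SEP> ".join(splits)                             # "film <SEP> film <SEP> starring"
urltoken = [".".join(splits[:-2]), splits[-2], splits[-1]]   # ["film", "film", "starring"]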