Example #1
def docs(dataset_name):
    p = util.Progbar(target=(util.lines_in_file(directories.RAW +
                                                dataset_name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW +
                                               dataset_name)):
        p.update(i + 1)
        yield d
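The examples on this page all iterate over util.load_json_lines, which reads a JSON Lines file (one JSON object per line). Example #2 calls next() on its return value, so it is presumably a generator; a minimal sketch of such a helper, given as an assumption for illustration rather than the project's actual implementation:

import json

def load_json_lines(path):
    # Hypothetical sketch: yield one parsed JSON object per non-empty line.
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

Note that Example #3 below uses a separate, unprefixed load_json_lines that must return a list, since that code calls len() on the result and indexes into it.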
Example #2
def write_feature_names():
    util.write_pickle(
        {
            f: i
            for i, f in enumerate(
                next(util.load_json_lines(directories.RAW +
                                          'train'))["pair_feature_names"])
        }, directories.MISC + 'pair_feature_names.pkl')
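The pickle written above maps each pair-feature name to its column index in the pair-feature vectors. A hedged usage sketch for reading it back, assuming util.load_pickle is the counterpart of util.write_pickle (util.load_pickle does appear in Example #8):

# Hypothetical usage: look up the column index of a pair feature by name.
name_to_index = util.load_pickle(directories.MISC + 'pair_feature_names.pkl')
idx = name_to_index["some_feature_name"]  # "some_feature_name" is a placeholder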
Example #3
def main():

    '''
    The integration test runs against the input file and compares the results
    against the expected output file, while writing its own results.txt output.
    '''

    def process_item(data):
        db = CustomerDatabase()
        success, code = db.deposit(data["customer_id"], data["id"], data["time"], data["load_amount"])
        if code != 403:
            data = write_output(data["customer_id"], data["id"], success)
        else:
            data = dict(id=data["id"], customer_id=data["customer_id"], accepted=False)
        return data, code

    def write_output(customer_id, txn_id, status):
        data = dict(id=txn_id, customer_id=customer_id, accepted=status)
        logger.info(json.dumps(data))
        return data

    with open(RESULTS_FILE, 'w') as out:
        todos = load_json_lines(INPUT_FILE)
        answers = load_json_lines(OUTPUT_FILE)
        counter = 0
        bad_counter = 0
        assert len(todos) == 1000

        for to_do in todos:

            result, code = process_item(to_do)

            if code != 403:

                result["original_accepted"] = answers[counter]["accepted"]
                if result["accepted"] != answers[counter]["accepted"]:
                    logger.debug("FAIL")
                    bad_counter += 1
                else:
                    print("original =", result["original_accepted"], "found=", result["accepted"])

                counter += 1
        print("MISSED ITEMS: ", bad_counter)
Example #4
def load_gold(dataset_name):
    gold = {}
    mention_to_gold = {}
    for doc_gold in util.load_json_lines(directories.GOLD + dataset_name):
        did = int(next(iter(doc_gold)))
        gold[did] = doc_gold[str(did)]
        mention_to_gold[did] = {}
        for gold_cluster in doc_gold[str(did)]:
            for m in gold_cluster:
                mention_to_gold[did][m] = tuple(gold_cluster)
    return gold, mention_to_gold
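Each line of the GOLD file is therefore a JSON object with a single document-id key mapping to that document's list of gold clusters. A hedged sketch of how the returned structures might be used, with mention ids invented purely as placeholders:

# Hypothetical GOLD line: {"12": [[0, 3, 7], [1, 4]]}
gold, mention_to_gold = load_gold("train")
# gold[12]               -> [[0, 3, 7], [1, 4]]   (all gold clusters of document 12)
# mention_to_gold[12][3] -> (0, 3, 7)             (the cluster containing mention 3)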
Example #5
def explore_pairwise_features():
    pos_sum, neg_sum = np.zeros(9), np.zeros(9)
    pos_count, neg_count = 0, 0
    for i, d in enumerate(util.load_json_lines(directories.RAW + "train")):
        for key in d["labels"].keys():
            if d["labels"][key] == 1:
                pos_sum += d["pair_features"][key]
                pos_count += 1
            else:
                neg_sum += d["pair_features"][key]
                neg_count += 1
        print "positive counts", list(pos_sum)
        print "negative counts", list(neg_sum)
        print "feature odds", list(
            np.divide(pos_sum / pos_count,
                      (pos_sum / pos_count + neg_sum / neg_count)))
        print
Example #6
def write_node_data(nid_f, nids, infile, outfile):
    return util.write_json({nid_f(datum): datum for datum in util.load_json_lines(infile)
                            if nid_f(datum) in nids}, outfile)
Example #7
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    return util.logged_loop(util.load_json_lines(path),
                            util.LoopLogger(100000, util.lines_in_file(path), True))
Example #8
def build_dataset(vectors,
                  name,
                  tune_fraction=0.0,
                  reduced=False,
                  columns=None):
    doc_vectors = util.load_pickle(directories.MISC +
                                   name.replace("_reduced", "") +
                                   "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print "Building dataset", name
    p = util.Progbar(
        target=(2 if reduced else util.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else (tune_pairs, tune_mentions, tune_docs)

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(
                d["mentions"][mention_num], vectors,
                doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(),
                          key=lambda k:
                          (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1],
                           mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
Example #9
def write_node_data(nid_f, nids, infile, outfile):
    return util.write_json(
        {
            nid_f(datum): datum
            for datum in util.load_json_lines(infile) if nid_f(datum) in nids
        }, outfile)
Example #10
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    return util.logged_loop(
        util.load_json_lines(path),
        util.LoopLogger(100000, util.lines_in_file(path), True))