Example #1
def test_squad_to_h5py_dataset():
    corenlp = None
    try:
        port = get_free_port()
        corenlp = start_corenlp(port)

        test_dir = tempfile.mkdtemp()
        json_path = os.path.join(test_dir, 'data.json')
        h5_path = os.path.join(test_dir, 'data.h5')
        with open(json_path, 'w') as json_file:
            print(TEST_SQUAD_RAW_DATA, file=json_file)
        squad_to_h5py_dataset(json_path, h5_path,
                              "http://localhost:{}".format(port))
        with h5py.File(h5_path, 'r') as h5_file:
            vocab = Vocabulary.build(h5_file['text'], top_k=100)
        add_words_ids_to_squad(h5_path, vocab)

        dataset = SQuADDataset(h5_path, ('all', ))
        stream = dataset.get_example_stream()
        stream = dataset.apply_default_transformers(stream)
        example = next(stream.get_epoch_iterator(as_dict=True))
        answer_span = slice(example['answer_begins'][0],
                            example['answer_ends'][0])
        # Wrap map() in list() so the comparison also holds under Python 3,
        # where map() returns an iterator rather than a list.
        assert example['questions'].tolist() == list(map(vocab.word_to_id, [
            u'To', u'whom', u'did', u'the', u'Virgin', u'Mary', u'allegedly',
            u'appear', u'in', u'1858', u'in', u'Lourdes', u'France', u'?'
        ]))
        assert example['contexts'][answer_span].tolist() == list(map(
            vocab.word_to_id, [u'Saint', u'Bernadette', u'Soubirous']))
    finally:
        if corenlp and corenlp.returncode is None:
            corenlp.kill()
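
# The test above relies on a get_free_port() helper that is not shown here.
# A minimal sketch of how such a helper is commonly written (an assumption,
# not necessarily the repository's actual implementation): bind a socket to
# port 0 so the OS picks a free ephemeral port, read it back, then release it.
import socket

def get_free_port():
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(('localhost', 0))
        return sock.getsockname()[1]
    finally:
        sock.close()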
Example #2
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a vocabulary")
    parser.add_argument("--top-k",
                        type=int,
                        help="Top most frequent words to leave")
    parser.add_argument("--keys-only",
                        action='store_true',
                        help="Build vocab of all keys")
    parser.add_argument("--with-keys",
                        action='store_true',
                        help="Count keys and words in definitions")
    parser.add_argument("dictionary", help="Input dictionary")
    parser.add_argument("vocabulary", help="Output vocabulary")
    args = parser.parse_args()

    text = []
    if args.dictionary.endswith('.json'):
        text = collections.defaultdict(int)
    for f_name in args.dictionary.split(","):
        logging.info("Processing " + f_name)
        assert (f_name.endswith('.json'))
        logging.info(
            "Will build the vocabulary from definitions in a dictionary")
        with open(f_name, "r") as dict_file:
            dict_ = json.load(dict_file)
        for word, list_defs in dict_.items():
            if args.keys_only or args.with_keys:
                text[word] += 1
            if not args.keys_only:
                for def_ in list_defs:
                    for def_word in def_:
                        text[def_word] += 1

        logging.info("{} words".format(len(text)))

    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocabulary)
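
# Hypothetical usage sketch for the vocabulary builder above (the script and
# file names are illustrative assumptions; only the flags come from the
# argparse setup in main()). Equivalent to running:
#   python build_vocab.py --top-k 10000 dict.json vocab.txt
import sys

sys.argv = ['build_vocab.py', '--top-k', '10000', 'dict.json', 'vocab.txt']
main()
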
def main():
    parser = argparse.ArgumentParser(
        "Generate synthetic data and outputs in files")
    parser.add_argument("path",
                        type=str,
                        help="Top most frequent words to leave")
    parser.add_argument("n_primes", type=int, help="# of primes")
    parser.add_argument("n_non_primes", type=int, help="# of non-primes")
    parser.add_argument("features_size", type=int, help="Features size")
    parser.add_argument("markov_order", type=int, help="Markov order")
    parser.add_argument("n_sentences", type=int, help="# sentences")
    parser.add_argument("pc_train", type=float, help="% train sentences")
    parser.add_argument("pc_valid", type=float, help="% valid sentences")
    parser.add_argument("sample_temperature",
                        type=float,
                        default=1.0,
                        help="% valid sentences")
    parser.add_argument("min_sentence_len", type=int, default=6)
    parser.add_argument("max_sentence_len", type=int, default=20)
    parser.add_argument("min_def_len", type=int, default=6)
    parser.add_argument("max_def_len", type=int, default=20)

    args = parser.parse_args()

    print "Number of sentences:", args.n_sentences
    assert (0 < args.pc_train + args.pc_valid < 1)
    assert not os.path.exists(args.path)
    os.makedirs(args.path)
    args.pc_test = 1 - (args.pc_train + args.pc_valid)

    gen = FakeTextGenerator(args.n_primes, args.n_non_primes,
                            args.features_size, args.markov_order,
                            args.sample_temperature, args.min_def_len,
                            args.max_def_len)

    data = gen.create_corpus(args.n_sentences, args.min_sentence_len,
                             args.max_sentence_len, args.pc_train,
                             args.pc_valid)

    train_data, valid_data, test_data = data

    concat_sentences = lambda sentences: [' '.join(s) for s in sentences]
    train_data = concat_sentences(train_data)
    test_data = concat_sentences(test_data)
    valid_data = concat_sentences(valid_data)

    all_data = train_data + valid_data + test_data
    with temporary_content_path('\n'.join(all_data)) as path:
        vocab = Vocabulary.build(path, sort_by='lexicographical')
        vocab.save(os.path.join(args.path, "vocab.txt"))

    dict_json = json.dumps(gen.dictionary)
    write_data(os.path.join(args.path, "dict.json"), dict_json)

    write_data(os.path.join(args.path, "train.txt"), '\n'.join(train_data))
    write_data(os.path.join(args.path, "valid.txt"), '\n'.join(valid_data))
    write_data(os.path.join(args.path, "test.txt"), '\n'.join(test_data))

    args_json = json.dumps(vars(args), indent=4, sort_keys=True)
    write_data(os.path.join(args.path, "params.json"), args_json)

    write_data(os.path.join(args.path, "generator.p"), pickle.dumps(gen))
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--target_coverage_text",
                        type=float,
                        help="Target coverage of text")
    parser.add_argument("--target_coverage_def",
                        type=float,
                        help="Target coverage of def")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of def")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, default="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # The greedy solution is optimal here.
    # The greedy search is approximated a bit by adding words in frequency-rank
    # order; this is fine because the vocabularies are big.
    target_coverage_text = np.sum(
        vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(
        vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set([])

    # A binary search could be used here instead.
    for id in range(vocab_def.size() // args.step_size):
        for id2 in range(args.step_size):
            current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2))

        current_vocab_mod = set(current_vocab)

        current_coverage_def = 0.0
        current_coverage_text = 0.0

        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[
                vocab_text.word_to_id(w)]

        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise Exception("Perhaps try lower target coverage")

            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

        print(
            "After adding {} words I covered {} of def and {} of text occurences"
            .format(
                len(current_vocab_mod),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe rechecking shortlist works
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[vocab_text.word_to_id(
            w)]

    print(
        "Sanity check: after adding {} words I covered {} of def and {} of text occurences"
        .format(len(current_vocab),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word)
         for word in current_vocab})
    vocab_result.save(args.target)
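
# A toy illustration of the stopping criterion above (all numbers invented):
# with 100 total occurrences in the text vocabulary and a target coverage of
# 0.9, words keep being added until the shortlist accounts for at least
# 90 occurrences.
import numpy as np

toy_frequencies = np.array([50, 25, 15, 7, 3])
toy_target = np.sum(toy_frequencies) * 0.9
print(toy_target)  # 90.0
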
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--top-k",
                        type=int,
                        help="Top most frequent words to leave")
    parser.add_argument(
        "--vocab-text",
        default=None,
        help="Vocab corresponding to the main if text is a dictionary.")
    parser.add_argument(
        "--weight-dict-entries",
        action='store_true',
        help="Weight dict entries according to the freqs from a vocab.")
    parser.add_argument(
        "--exclude-top-k",
        type=int,
        help="Ignore definitions of a number of most frequent words")
    parser.add_argument(
        "text",
        help="The text to use. Can be a plain text file, an .h5 file, or a "
             ".json dictionary, in which case --vocab-text must be given as well.")
    parser.add_argument("vocab", help="Destination")
    args = parser.parse_args()

    text = []
    if args.vocab_text:
        text = collections.defaultdict(int)
        vocab_text = Vocabulary(args.vocab_text)
    for f_name in args.text.split(","):
        logging.info("Processing " + f_name)
        if f_name.endswith('.h5'):
            with h5py.File(f_name, 'r') as h5_file:
                if 'text' not in h5_file.keys():
                    print("Missing text field from " + f_name)
                    continue
                text.extend(h5_file['text'][:])
        elif f_name.endswith('.json'):
            logging.info(
                "Will build the vocabulary from definitions in a dictionary")
            with open(f_name, "r") as dict_file:
                dict_ = json.load(dict_file)
            for word, list_defs in dict_.items():
                text_vocab_id = vocab_text.word_to_id(word)

                if (text_vocab_id != vocab_text.unk
                        and text_vocab_id < args.exclude_top_k):
                    continue

                for def_ in list_defs:
                    for def_word in def_:
                        if args.weight_dict_entries:
                            text[def_word] += vocab_text.word_freq(word)
                        else:
                            text[def_word] += 1
        else:
            with open(f_name) as file_:

                def data():
                    for line in file_:
                        for word in line.strip().split():
                            try:
                                yield text_type(word, 'utf-8')
                            except Exception:
                                print("Skipped word " + word)

                text.extend(data())
        logging.info("{} words".format(len(text)))

    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocab)
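
# Hypothetical usage sketch for the builder above (script and file names are
# illustrative assumptions): when the text argument is a .json dictionary,
# --vocab-text must be supplied as well, as the help string notes.
import sys

sys.argv = ['build_dict_vocab.py',
            '--top-k', '50000',
            '--vocab-text', 'vocab_text.txt',
            '--exclude-top-k', '1000',
            'dict.json', 'vocab_def.txt']
main()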