Example #1
import argparse

# read_embedding_file and Vocabulary are project-specific helpers that this
# page does not show (a sketch of the assumed Vocabulary interface follows
# the example).


def main():
    parser = argparse.ArgumentParser(
        description=
        'Write the list of words in embeddings but not in dict vocabulary')
    parser.add_argument('embeddings', type=str)
    parser.add_argument('vocabulary', type=str)
    parser.add_argument('vocabulary_counts', type=str)
    parser.add_argument('absent_words', type=str)

    args = parser.parse_args()

    print "read first file {}".format(args.embeddings)
    embeddings = read_embedding_file(args.embeddings)
    print "read vocabulary file {}".format(args.vocabulary)
    vocabulary = Vocabulary(args.vocabulary)
    print "read vocabulary for counts estimation file {}".format(
        args.vocabulary_counts)
    vocabulary_counts = Vocabulary(args.vocabulary_counts)

    vocabulary = set(vocabulary.words)  # faster lookup

    absent_in_vocab = {w for w in embeddings if w not in vocabulary}
    print("Number of absent words in vocab", len(absent_in_vocab))
    # Most frequent absent words first.
    absent_in_vocab = sorted(absent_in_vocab,
                             key=lambda w: vocabulary_counts.word_freq(w),
                             reverse=True)

    with open(args.absent_words, 'w', encoding='utf-8') as f:
        for w in absent_in_vocab:
            f.write(w + '\n')
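
A hypothetical invocation (the script and file names are illustrative, not from the source):

python find_absent_words.py embeddings.txt vocab.txt vocab_counts.txt absent_words.txt

All three examples also lean on a project-specific Vocabulary class that this page does not show. The sketch below is only an assumption about the interface implied by the calls in these examples (words ordered by descending frequency, an unk fallback id, plain "word count" files); the real class ships with the surrounding project and may differ:

import collections


class Vocabulary(object):
    """Minimal sketch of the interface assumed by the examples on this page."""

    def __init__(self, path=None):
        self.words = []
        self.frequencies = []
        self._ids = {}
        self.unk = -1
        if path is not None:
            # Assumed file format: one "word count" (or bare "word") per line,
            # most frequent words first.
            with open(path, encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        continue
                    word, _, count = line.strip().partition(' ')
                    self._add(word, int(count) if count else 1)
            self.unk = self._ids.get('<unk>', -1)

    def _add(self, word, count):
        self._ids[word] = len(self.words)
        self.words.append(word)
        self.frequencies.append(count)

    def size(self):
        return len(self.words)

    def word_to_id(self, word):
        return self._ids.get(word, self.unk)

    def id_to_word(self, id_):
        return self.words[id_]

    def word_freq(self, word):
        id_ = self.word_to_id(word)
        return self.frequencies[id_] if id_ != self.unk else 0

    @classmethod
    def build(cls, text, top_k=None):
        # `text` is either an iterable of words or a {word: count} mapping.
        counts = text if isinstance(text, dict) else collections.Counter(text)
        vocab = cls()
        ranked = sorted(counts.items(), key=lambda kv: -kv[1])
        for word, count in ranked[:top_k]:
            vocab._add(word, count)
        return vocab

    def save(self, path):
        with open(path, 'w', encoding='utf-8') as f:
            for word, count in zip(self.words, self.frequencies):
                f.write("{} {}\n".format(word, count))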
Example #2
import argparse
import logging

import numpy as np

# Vocabulary is the project-specific class sketched after Example #1.

def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(description="Builds a dictionary")
    parser.add_argument("--target_coverage_text",
                        type=float,
                        help="Target coverage of text")
    parser.add_argument("--target_coverage_def",
                        type=float,
                        help="Target coverage of def")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of def")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, default="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # The greedy solution is optimal here. It is approximated slightly by
    # adding words one by one in frequency order, which is fine because the
    # vocabularies are big.
    target_coverage_text = np.sum(
        vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(
        vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set()

    # Binary search would find the cutoff faster, but a linear scan will do.
    for block in range(vocab_def.size() // args.step_size):
        for offset in range(args.step_size):
            current_vocab.add(
                vocab_def.id_to_word(block * args.step_size + offset))

        # Work on a copy: the text-word top-up below is kept only if the
        # definition-coverage target is met.
        current_vocab_mod = set(current_vocab)

        current_coverage_def = 0.0
        current_coverage_text = 0.0

        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[
                vocab_text.word_to_id(w)]

        # Top up with the most frequent text words until the text-coverage
        # target is reached.
        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise ValueError(
                        "Ran out of text words; try a lower target coverage")

            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        # Once the definition-coverage target is also met, keep this
        # candidate vocabulary and stop.
        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

        print(
            "After adding {} words I covered {} of def and {} of text occurrences"
            .format(
                len(current_vocab_mod),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe, recheck that the shortlist reaches the targets.
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[vocab_text.word_to_id(
            w)]

    print(
        "Sanity check: after adding {} words I covered {} of def and {} of text occurrences"
        .format(len(current_vocab),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word)
         for word in current_vocab})
    vocab_result.save(args.target)
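
A hypothetical invocation of Example #2 (the script and file names are illustrative, not from the source). It grows the shortlist until it covers 90% of definition-token occurrences and 95% of text-token occurrences, then saves it:

python build_coverage_vocab.py --target_coverage_text 0.95 --target_coverage_def 0.9 --vocab_text vocab_text.txt --vocab_def vocab_def.txt --target shortlist_vocab.txt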
Example #3
import argparse
import collections
import json
import logging

import h5py

# Vocabulary is the project-specific class sketched after Example #1.

def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(description="Builds a dictionary")
    parser.add_argument("--top-k",
                        type=int,
                        help="Top most frequent words to leave")
    parser.add_argument(
        "--vocab-text",
        default=None,
        help="Vocabulary of the main text; used when text is a dictionary.")
    parser.add_argument(
        "--weight-dict-entries",
        action='store_true',
        help="Weight dict entries according to the freqs from a vocab.")
    parser.add_argument(
        "--exclude-top-k",
        type=int,
        help="Ignore definitions of a number of most frequent words")
    parser.add_argument(
        "text",
        help="The text to use. Can be a plain text file, an .h5 file, or a "
        ".json dictionary, in which case --vocab-text must be given as well.")
    parser.add_argument("vocab", help="Destination")
    args = parser.parse_args()

    text = []
    if args.vocab_text:
        # When building from a .json dictionary, counts are accumulated in a
        # mapping rather than a flat word list.
        text = collections.defaultdict(int)
        vocab_text = Vocabulary(args.vocab_text)
    for f_name in args.text.split(","):
        logging.info("Processing " + f_name)
        if f_name.endswith('.h5'):
            with h5py.File(f_name, 'r') as h5_file:
                if 'text' not in h5_file:
                    logging.error("Missing text field from " + f_name)
                    continue
                text.extend(h5_file['text'][:])
        elif f_name.endswith('.json'):
            logging.info(
                "Will build the vocabulary from definitions in a dictionary")
            with open(f_name) as json_file:
                dict_ = json.load(json_file)
            for word, list_defs in dict_.items():
                text_vocab_id = vocab_text.word_to_id(word)

                # Skip definitions of the exclude-top-k most frequent words.
                if (args.exclude_top_k and text_vocab_id != vocab_text.unk
                        and text_vocab_id < args.exclude_top_k):
                    continue

                for def_ in list_defs:
                    for def_word in def_:
                        if args.weight_dict_entries:
                            text[def_word] += vocab_text.word_freq(word)
                        else:
                            text[def_word] += 1
        else:
            # Read bytes and decode word by word so a single malformed word
            # does not abort the whole file.
            with open(f_name, 'rb') as file_:

                def data():
                    for line in file_:
                        for word in line.strip().split():
                            try:
                                yield word.decode('utf-8')
                            except UnicodeDecodeError:
                                print("Skipped word " + repr(word))

                text.extend(data())
        logging.info("{} words".format(len(text)))

    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocab)
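
Hypothetical invocations of Example #3 (script and file names are illustrative, not from the source):

# Keep the 100000 most frequent words of a plain-text corpus:
python build_vocab.py --top-k 100000 corpus.txt vocab.txt

# Build from dictionary definitions instead, skipping definitions of the
# 10000 most frequent words and weighting entries by corpus frequency:
python build_vocab.py --top-k 100000 --vocab-text vocab.txt --exclude-top-k 10000 --weight-dict-entries dict.json dict_vocab.txt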