# Example 1
def show_unk(corpus: SquadCorpus,
             vec_name: str,
             context: bool = True,
             question: bool = True):
    """Print words from the training set that have no pruned word vector.

    Tallies two counters over the train docs: ``unk`` for surface forms
    missing from the vectors and ``lower_unk`` for lowercased forms still
    missing, then prints the lowercased misses with their counts.

    Args:
        corpus: source of the train docs and the pruned vectors.
        vec_name: name of the word-vector set to load.
        context: include paragraph context words.
        question: include question words.
    """
    vecs = corpus.get_pruned_word_vecs(vec_name)
    docs = corpus.get_train()

    lower_unk = Counter()
    unk = Counter()

    def _tally(words):
        # Shared counting logic for context sentences and question words.
        for word in words:
            if word not in vecs:
                unk[word] += 1
            word = word.lower()
            if word not in vecs:
                lower_unk[word] += 1

    for doc in docs:
        for para in doc.paragraphs:
            if context:
                for sent in para.text:
                    _tally(sent)
            if question:
                # BUG FIX: the original loop variable was named `question`,
                # shadowing (and clobbering) the boolean parameter; it only
                # worked because question objects are truthy.
                for q in para.questions:
                    _tally(q.words)

    print("\n".join("%s: %d" % (k, v) for k, v in lower_unk.most_common()))
# Example 2
def show_in_context_unks(corpus: SquadCorpus, vec_name):
    """Print a short context window around every out-of-vocabulary word.

    Walks train docs / paragraphs / sentences in random order; whenever a
    word's lowercased form is missing from the pruned vectors, prints up to
    ten tokens of surrounding context with the unknown word wrapped in
    ``{{{...}}}`` markers. Token lists are restored after printing.
    """
    train_docs = corpus.get_train()
    np.random.shuffle(train_docs)
    word_vecs = corpus.get_pruned_word_vecs(vec_name)

    for document in train_docs:
        paras = list(document.paragraphs)
        np.random.shuffle(paras)
        for paragraph in paras:
            token_lists = list(paragraph.text)
            token_lists += [q.words for q in paragraph.questions]
            np.random.shuffle(token_lists)
            for tokens in token_lists:
                for idx, token in enumerate(tokens):
                    if token.lower() in word_vecs:
                        continue
                    # Temporarily mark the unknown token in place, print the
                    # window, then put the original token back.
                    tokens[idx] = "{{{" + token + "}}}"
                    lo = max(0, idx - 10)
                    hi = min(len(tokens), idx + 10)
                    print(" ".join(tokens[lo:hi]))
                    tokens[idx] = token
def main():
  """Rank paragraphs for a SQuAD split with a TF-IDF ranker and report stats.

  All settings come from the module-level OPTS namespace:
  normalize_before_ranking, use_vec_dist, num_per_orig, split, out_file.
  Prints mean length statistics before/after preprocessing and optionally
  writes the preprocessed data to OPTS.out_file.
  """
  corpus = SquadCorpus()
  if OPTS.normalize_before_ranking:
      normalizer = WordNormalizer()
  else:
      normalizer = None
  if OPTS.use_vec_dist:
    # The vector-distance ranker additionally needs the pruned GloVe vectors.
    word_vecs = corpus.get_pruned_word_vecs('glove.840B.300d')
    prepro = SquadVectorTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True, word_vecs, word_normalizer=normalizer)
  else:
    prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True, word_normalizer=normalizer)
  orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
  # NOTE(review): this measures len(p.text[0]) — the FIRST sentence only —
  # once per question, whereas new_lens below uses len(p.text); presumably
  # the ranker flattens paragraph text so the two are comparable — confirm.
  orig_lens = [len(p.text[0]) for doc in orig_data for p in doc.paragraphs
               for q in p.questions] 
  new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)
  new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
  print('%d original, mean %.2f words' % (len(orig_lens), np.mean(orig_lens)))
  print('%d new, mean %.2f words'% (len(new_lens), np.mean(new_lens)))
  if OPTS.out_file:
    write_output(OPTS.split, new_data, OPTS.out_file)
# Example 4
def show_features(corpus: SquadCorpus, vec_name):
    """Show which BasicWordFeatures fire on out-of-vocabulary words.

    Samples up to 100 random train docs, and for every word whose lowercased
    form is missing from the pruned vectors, tallies which word features are
    active for it. Prints, per feature (except "Len"), the distinct-word and
    total counts plus the 30 most common triggering words.
    """
    print("Loading train docs")
    data = corpus.get_train()
    np.random.shuffle(data)
    data = data[:100]

    print("Loading vectors")
    vecs = corpus.get_pruned_word_vecs(vec_name)
    fe = BasicWordFeatures()

    # feature index -> Counter of words on which that feature was active
    grouped_by_features = defaultdict(Counter)

    print("start")

    for doc in data:
        paragraphs = list(doc.paragraphs)
        np.random.shuffle(paragraphs)
        for para in paragraphs:
            sentences = list(para.text) + [x.words for x in para.questions]
            np.random.shuffle(sentences)
            for words in sentences:
                # BUG FIX: the original bound `i` with enumerate(words) and then
                # immediately shadowed it with enumerate(x); the word index was
                # never used, so it is dropped and the feature index renamed.
                for word in words:
                    if word.lower() not in vecs:
                        x = fe.get_word_features(word)
                        for feature_ix, val in enumerate(x):
                            if val > 0:
                                grouped_by_features[feature_ix][word] += 1

    for feature_ix in sorted(grouped_by_features.keys()):
        name = BasicWordFeatures.features_names[feature_ix]
        if name in ["Len"]:
            continue
        vals = grouped_by_features[feature_ix]
        print()
        print("*" * 30)
        print("%s-%d %d (%d)" % (name, feature_ix, len(vals), sum(vals.values())))
        for k, v in vals.most_common(30):
            print("%s: %d" % (k, v))