Example #1
def cluster(self):
    clusters = super(ADMappingMixin, self).cluster()
    word = self.m['word']
    ad_descr = get_ad_word(word, self.m['ad_root'])
    ad_centers = get_ad_centers(word, ad_descr, self.m['ad_root'])
    self.mapping = {}
    # Map each cluster centre to the id of the highest-scoring AD sense centre.
    for ci, center in enumerate(self._c.centres):
        self.mapping[ci] = max(((int(mid), v_closeness(center, m_center))
                                for mid, m_center in ad_centers.items()),
                               key=itemgetter(1))[0]
    return clusters
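The inner max(...) above picks, for each cluster centre, the id of the AD sense whose centre scores highest under v_closeness. A standalone sketch of the same selection pattern, with plain cosine similarity standing in for the project-specific v_closeness and with made-up vectors:

import numpy as np
from operator import itemgetter

def cosine(a, b):
    # Simple cosine similarity; v_closeness in the example is project-specific.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

center = np.array([1.0, 0.0])                     # one cluster centre
ad_centers = {'1': np.array([0.9, 0.1]),          # hypothetical sense centres
              '2': np.array([0.0, 1.0])}
# Pick the sense id with the highest similarity, mirroring the pattern above.
best_sense = max(((int(mid), cosine(center, m_center))
                  for mid, m_center in ad_centers.items()),
                 key=itemgetter(1))[0]
print(best_sense)  # -> 1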
Example #2
def get(self, ctx_path, word):
    ctx = self.load(word, ctx_path)
    contexts = ctx['contexts']
    parsed = get_ad_word(word, self.ad_root)
    sense_by_id = {m['id']: m for m in parsed['meanings']}
    # Count how often each sense id was chosen for the contexts.
    counts = Counter(ans for _, ans in contexts)
    self.render('templates/word.html',
                word=parsed['word'],
                senses=sorted(
                    (sid, sense_by_id[sid], count / len(contexts))
                    for sid, count in counts.items()),
                contexts=contexts)
Example #3
def build_senses(word, ad_root, out=None):
    """ Build sense vectors for one word and save them in ``out``.
    """
    ad_word_data = get_ad_word(word, ad_root)
    weights = load_weights(word, root=ad_root)
    train_data = get_ad_train_data(word, ad_word_data)
    senses = {s['id']: {'name': s['name'], 'meaning': s['meaning']}
              for s in ad_word_data['meanings']}
    model = SphericalModel(train_data, weights=weights, senses=senses)
    # Not needed after training
    del model.context_vectors
    del model.train_data
    model.save(word, folder=out)
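A minimal usage sketch for build_senses; the word and directory paths below are placeholders, only the parameters come from the signature above:

# Hypothetical call; the word and paths are placeholders for illustration.
build_senses('слово', ad_root='data/ad/', out='models/')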
Example #4
def cluster(self):
    word = self.m['word']
    ad_descr = get_ad_word(word)
    ad_centers = get_ad_centers(word, ad_descr)
    # Cluster i is seeded with the centre of the i-th dictionary sense.
    self.mapping = {
        i: int(meaning['id'])
        for i, meaning in enumerate(ad_descr['meanings'])
    }
    # note that the clusters can drift to quite different positions
    centers = np.array([ad_centers[m['id']] for m in ad_descr['meanings']])
    self._c = kmeans.KMeans(self.features,
                            centres=centers,
                            metric='cosine',
                            verbose=0)
    return self._cluster()
Example #5
def run_on_word(ctx_filename, ctx_dir, ad_root, **params):
    max_contexts = params.get('max_contexts')
    min_contexts = params.get('min_contexts')
    word = ctx_filename.split('.')[0]
    if word[-1].isdigit():
        return
    result_filename = os.path.join(ctx_dir, word + '.json')
    if os.path.exists(result_filename):
        print(result_filename, "already exists, skipping", file=sys.stderr)
        return True
    with open(os.path.join(ctx_dir, ctx_filename), 'r') as f:
        contexts = [line.split('\t') for line in f]
    if max_contexts and len(contexts) > max_contexts:
        contexts = random.sample(contexts, max_contexts)
    elif not contexts or (min_contexts and len(contexts) < min_contexts):
        return
    ad_word_data = get_ad_word(word, ad_root)
    if ad_word_data is None:
        return
    train_data = get_ad_train_data(word, ad_word_data)
    model = train_model(word, train_data, ad_root, **params)
    if model is None:
        return
    result = []
    confidences = []
    for x in contexts:
        model_ans, confidence = model(x, with_confidence=True)
        result.append((x, model_ans))
        confidences.append(confidence)
    with open(result_filename, 'w') as f:
        json.dump(
            {
                'word': word,
                'contexts': result,
                'estimate': get_accuracy_estimate(
                    confidences, model.confidence_threshold),
            },
            f,
            ensure_ascii=False)
    return True
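From the json.dump call above, each per-word result file should have roughly this shape; the values below are illustrative placeholders, only the keys and nesting come from the code:

# Illustrative shape of <word>.json written by run_on_word (values invented):
{
    'word': 'слово',
    'contexts': [
        # one entry per context: the tab-separated context fields,
        # followed by the sense id chosen by the model
        [['left part', 'слово', 'right part'], '1'],
    ],
    'estimate': 0.85,
}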
Example #6
def get(self, pos):
    name_re = re.compile(r'(\w.*?)\d?\.json', re.U)
    only = self.get_argument('only', None)
    if only:
        words = only.split(',')
    else:
        words = {
            m.groups()[0]
            for m in (name_re.match(filename)
                      for filename in os.listdir(
                          os.path.join(self.ad_root, 'ad')))
            if m is not None
        }
    words_info = []
    only_pos = {'ГЛАГ': 'v', 'СУЩ': 's'}[pos]
    ipm = load_ipm(self.ad_root, only_pos=only_pos)
    for w in sorted(words):
        w_info = get_ad_word(w, self.ad_root, with_contexts=False)
        if w_info is not None and w_info['pos'] == pos \
                and 2 <= len(w_info['meanings']) <= 10:
            w_info['ipm'] = ipm.get(w_info['word'].lower())
            words_info.append(w_info)
    self.render('templates/pos_list.html', pos=pos, words_info=words_info)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('ad_root')
    parser.add_argument('contexts_root')
    parser.add_argument('word')
    parser.add_argument('--n', type=int, default=100)
    args = parser.parse_args()

    w_info = get_ad_word(args.word, args.ad_root, with_contexts=False)
    with open(os.path.join(args.contexts_root, args.word + '.txt'), 'r') as f:
        contexts = list(f)
    random.seed(1)
    random.shuffle(contexts)
    contexts = contexts[:args.n]
    for m in w_info['meanings'] + [
            dict(name='Другое', id=len(w_info['meanings']) + 1),
            dict(name='Не могу определить', id=0)
    ]:
        print('\t\t%s: %s\t\t%s' %
              (m['name'], m.get('meaning', ''), m['id']))
    for ctx in contexts:
        print(ctx, end='')
Example #8
def summary(ad_root, ctx_dir):
    all_freqs = {}
    word_ipm = load_ipm(ad_root)
    for filename in os.listdir(ctx_dir):
        if not filename.endswith('.json') or filename == 'summary.json':
            continue
        with open(os.path.join(ctx_dir, filename), 'r') as f:
            result = json.load(f)
            word = result['word']
            w_meta = get_ad_word(word, ad_root)
            meaning_by_id = {m['id']: m['meaning'] for m in w_meta['meanings']}
            counts = Counter(ans for _, ans in result['contexts'])
            all_freqs[word] = {
                'senses': {
                    ans: dict(meaning=meaning_by_id[ans],
                              freq=cnt / len(result['contexts']))
                    for ans, cnt in counts.items()
                },
                'estimate': result.get('estimate'),
                'is_homonym': w_meta.get('is_homonym', False),
                'ipm': word_ipm.get(word, 0.0),
            }
    with open(os.path.join(ctx_dir, 'summary.json'), 'w') as f:
        json.dump(all_freqs, f, ensure_ascii=False)
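The summary.json written above maps each word to its sense frequency estimates; an illustrative sketch of that structure (keys come from the dict literal in the loop, values are invented):

# Illustrative shape of summary.json (values invented):
{
    'слово': {
        'senses': {
            '1': {'meaning': '...', 'freq': 0.7},
            '2': {'meaning': '...', 'freq': 0.3},
        },
        'estimate': 0.85,
        'is_homonym': False,
        'ipm': 12.4,
    },
}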
Example #9
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('ad_root')
    arg('contexts_root', type=Path)
    arg('words', type=Path)
    arg('output')
    arg('--limit', type=int, default=100)
    args = parser.parse_args()

    wb = Workbook()
    right_align = Alignment(horizontal='right')
    center_align = Alignment(horizontal='center')

    words = [
        l.strip() for l in args.words.read_text(encoding='utf8').split('\n')
        if l.strip()
    ]
    for i, word in enumerate(words):
        contexts_path = args.contexts_root / '{}.txt'.format(word)
        if not contexts_path.exists():
            print('Contexts for word "{}" not found, skipping'.format(word))
            continue
        contexts = [
            l.split('\t')
            for l in contexts_path.read_text(encoding='utf8').split('\n')
        ]
        contexts = [ctx for ctx in contexts if len(ctx) == 3]
        if not contexts:
            print('No contexts for word "{}", skipping'.format(word))
            continue
        if len(contexts) > args.limit:
            random.seed(42)
            contexts = random.sample(contexts, args.limit)
        else:
            print('Warning: only {} contexts for word "{}"'.format(
                len(contexts), word))

        ad_word = get_ad_word(word, args.ad_root, with_contexts=False)
        if not ad_word:
            print('Word "{}" not found in AD'.format(ad_word))
            continue

        if i == 0:
            ws = wb.active
            ws.title = word
        else:
            ws = wb.create_sheet(word)

        for row, m in enumerate(ad_word['meanings'], 1):
            ws.cell(row=row, column=3, value='{name}: {meaning}'.format(**m))
            ws.cell(row=row, column=4, value=row)
        n_senses = len(ad_word['meanings'])
        ws.cell(row=n_senses + 1, column=3, value='Другое:')
        ws.cell(row=n_senses + 1, column=4, value=n_senses + 1)
        ws.cell(row=n_senses + 2, column=3, value='Не могу определить:')
        ws.cell(row=n_senses + 2, column=4, value=0)

        for row, (left, center, right) in enumerate(contexts, n_senses + 3):
            ws.cell(row=row, column=1, value=left).alignment = right_align
            ws.cell(row=row, column=2, value=center).alignment = center_align
            ws.cell(row=row, column=3, value=right)
            ws.cell(row=row, column=4, value='-').alignment = right_align
        ws.column_dimensions['A'].width = 80
        ws.column_dimensions['B'].width = \
            2 + max(len(center) for _, center, _ in contexts)
        ws.column_dimensions['C'].width = 80

    wb.save(args.output)
Example #10
def evaluate_word(word,
                  ad_root,
                  labeled_root,
                  print_errors=False,
                  tsne=False,
                  coarse=False,
                  alt_root=None,
                  alt_senses=False,
                  **model_params):
    word_path = labeled_root.joinpath(word + '.json')
    if not word_path.exists():
        word_path = labeled_root.joinpath(word + '.txt')
    senses, test_data = get_labeled_ctx(str(word_path))
    ad_word_data = get_ad_word(word, ad_root)
    if not ad_word_data:
        print(word, 'no AD data', sep='\t')
        return
    ad_senses = {
        str(i): m['name']
        for i, m in enumerate(ad_word_data['meanings'], 1)
    }
    if set(ad_senses) != set(senses):
        print(word, 'AD/labeled sense mismatch', sep='\t')
        return
    train_data = get_ad_train_data(word, ad_word_data)
    if alt_root:
        senses, test_data, train_data = get_alt_senses_test_train_data(
            alt_root=alt_root,
            word=word,
            test_data=test_data,
            train_data=train_data,
            alt_senses=alt_senses)
    if coarse:
        sense_mapping = get_coarse_sense_mapping(ad_senses)
        inverse_mapping = defaultdict(list)
        for old_id, new_id in sense_mapping.items():
            inverse_mapping[new_id].append(old_id)
        senses = {
            new_id: '; '.join(senses[old_id] for old_id in old_ids)
            for new_id, old_ids in inverse_mapping.items()
        }
        train_data = [(ctx, sense_mapping[old_id])
                      for ctx, old_id in train_data]
        test_data = [(ctx, sense_mapping[old_id]) for ctx, old_id in test_data]
    mfs_baseline = get_mfs_baseline(train_data, test_data)
    fs_baseline = get_fs_baseline(test_data)
    random_baseline = 1 / len(senses)
    model = train_model(word, train_data, ad_root, **model_params)
    if not model:
        print(word, 'no model', sep='\t')
        return
    test_accuracy, max_freq_error, js_div, estimate, answers = \
        evaluate(model, test_data)
    if tsne:
        show_tsne(model, answers, senses, word)
        # train_data = get_ad_train_data(word, ad_word_data)
        # show_tsne(model, [(x, ans, ans) for x, ans in train_data], senses, word)
    if print_errors:
        _print_errors(test_accuracy, answers, ad_word_data, senses)
    examples_per_sense = len(train_data) / len(senses)
    words_per_sense = sum(
        len(tokenize_s(left) + tokenize_s(right))
        for (left, _, right), _ in train_data) / len(senses)
    return (len(senses), mfs_baseline, fs_baseline, random_baseline,
            model.get_train_accuracy(verbose=False), test_accuracy,
            max_freq_error, examples_per_sense, words_per_sense, js_div,
            estimate)
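Across these examples, get_ad_word(word, ad_root, with_contexts=True) appears to return None for unknown words and otherwise a dict along the following lines; this is a reconstruction from the call sites above, not the actual implementation, and the values are placeholders:

# Reconstructed from the call sites above; keys only, values are placeholders.
{
    'word': 'слово',                  # surface form (Examples #2, #6, #8)
    'pos': 'СУЩ',                     # part of speech (Example #6)
    'is_homonym': False,              # optional flag (Example #8)
    'meanings': [                     # one dict per dictionary sense
        {'id': '1', 'name': '...', 'meaning': '...'},
        {'id': '2', 'name': '...', 'meaning': '...'},
    ],
}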