Code example #1
def wordnetFirstSenseBaseline(mentions, mention_map, predsf):
    predictions, correct = [], 0
    for m in mentions:
        # m.candidates is the (ranked) candidate list returned by WordNet.
        # NOTE: it is unclear how Raganato et al. derived their candidate
        # list; this re-implementation substantially underperforms their
        # FirstSense baseline, so treat it as a rough approximation only.
        (_, _, lemma) = mention_map[m.ID]
        synsets = wn.synsets(lemma)
        if lemma == 'peculiar':
            print(synsets)
        found_it = False
        for j in range(len(synsets)):
            this_lemma = synsets[j].lemmas()[0].name()
            if lemma == 'peculiar':
                print(j, this_lemma)
            if this_lemma == lemma:
                found_it = True
                break
        if not found_it:
            j = 0
        guess = synsets[j].lemmas()[0].key()
        if lemma == 'peculiar':
            print(j, guess)
        #guess = wn.synsets(lemma)[0].lemmas()[0].key()
        #guess = m.candidates[0]
        predictions.append((m.ID, guess))
        if guess == m.CUI:
            correct += 1

    writeWSDFrameworkPredictions(predictions, mention_map, predsf)
    log.writeln('-- WordNet first sense baseline --')
    log.writeln('Accuracy: {0:.4f} ({1:,}/{2:,})\n'.format(
        float(correct) / len(predictions), correct, len(predictions)))
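
The heart of this baseline is the first-sense lookup itself: pick the first WordNet synset whose own first lemma matches the target lemma, and fall back to the overall first synset otherwise. A minimal standalone sketch of that lookup using NLTK's WordNet interface (the helper name firstSenseKey is ours, not part of the original code):

from nltk.corpus import wordnet as wn

def firstSenseKey(lemma):
    # Return the sense key of the first WordNet sense for `lemma`,
    # preferring a synset whose own first lemma is `lemma`.
    synsets = wn.synsets(lemma)
    if not synsets:
        return None
    for synset in synsets:
        if synset.lemmas()[0].name() == lemma:
            return synset.lemmas()[0].key()
    # no synset led with this lemma; fall back to the overall first sense
    return synsets[0].lemmas()[0].key()

# e.g. firstSenseKey('bank') returns a WordNet sense-key string (of the form 'bank%...')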
Code example #2
    def __init__(self, datadir=None, verbose=False):
        self._ambig_sets = {}

        if not datadir: datadir = _datadir
        for f in glob.glob(os.path.join(datadir, '*_pmids_tagged.arff')):
            if verbose: log.writeln('  >> Parsing %s' % f)
            ambig_set = parser.parseFile(f)
            concept = os.path.basename(f).split('_')[0]
            self._ambig_sets[concept] = ambig_set
Code example #3
def getAllMentions(dataset, window_size, word_filter, concept_filter, log=log):
    samples = []
    log.track(message='  >> Extracted features from {0}/%d documents...' %
              len(dataset),
              writeInterval=1)
    for ambig in dataset:
        for instance in ambig.instances:
            if concept_filter(instance.CUI):
                samples.append(
                    getSingleMention(instance, window_size, word_filter,
                                     ambig.labels))
        log.tick()
    log.writeln()
    return samples
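
A sketch of how this might be invoked; `dataset` and getSingleMention come from the surrounding project, and the two filter arguments are plain predicates (the lambdas below are only illustrative):

# hypothetical call, assuming `dataset` is the iterable of ambiguity sets used above
samples = getAllMentions(
    dataset,
    window_size=5,
    word_filter=lambda w: w.isalpha(),    # keep alphabetic context words only
    concept_filter=lambda cui: True,      # keep every concept
)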
Code example #4
File: learnmap.py  Project: drgriffis/NeuralVecmap
def train(model, src_embs, trg_embs, train_keys, dev_keys, batch_size=5):

    train_keys = list(train_keys)
    dev_keys = list(dev_keys)

    training = True
    batch_start, iter_losses = 0, []
    prev_dev_loss = None
    cur_iter, new_iter = 0, True
    while training:

        if new_iter:
            if cur_iter > 0:
                # run on dev set
                dev_loss = evalOnDev(model,
                                     src_embs,
                                     trg_embs,
                                     dev_keys,
                                     batch_size=batch_size)
                log.writeln("    Iteration %d -- Dev MSE: %f" %
                            (cur_iter, dev_loss))
                if cur_iter > 1 and dev_loss > prev_dev_loss:
                    training = False
                    log.writeln('    >> Reached dev-based convergence <<')
                else:
                    prev_dev_loss = dev_loss
                    # save checkpoint
                    model.checkpoint(cur_iter)

            # set up for next training batch
            random.shuffle(train_keys)
            cur_iter += 1
            batch_start = 0
            iter_losses = []
            new_iter = False

        if training:
            batch_keys = train_keys[batch_start:batch_start + batch_size]
            batch_src = np.array([src_embs[k] for k in batch_keys])
            batch_trg = np.array([trg_embs[k] for k in batch_keys])
            loss = model.train_batch(batch_src, batch_trg)
            iter_losses.append(loss)

            batch_start += batch_size
            if batch_start >= len(train_keys):
                new_iter = True

    model.rollback()
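
A minimal sketch of driving this loop, assuming src_embs and trg_embs behave like dicts of equal-dimension vectors keyed by the same terms, and that `model` exposes the train_batch/checkpoint/rollback interface used above (crossfoldTrain in example #26 shows how the project itself calls train with a ManifoldMapper):

import random

pivot_keys = sorted(set(src_embs.keys()) & set(trg_embs.keys()))
random.shuffle(pivot_keys)
n_dev = max(1, len(pivot_keys) // 10)            # hold out ~10% for dev-based stopping
dev_keys, train_keys = pivot_keys[:n_dev], pivot_keys[n_dev:]
train(model, src_embs, trg_embs, train_keys, dev_keys, batch_size=32)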
Code example #5
File: experiment.py  Project: OSU-slatelab/JET
def twoModelEvaluate(dataset, ent_emb_wrapper, str_emb_wrapper, sim_metric, log_predictions=False, use_cross=False, cross_only=False, use_mean=False, skips_f=None):
    log.writeln('\n\n  Using cross: %s\n  Using cross only: %s\n  Using mean: %s\n' % (str(use_cross), str(cross_only), str(use_mean)))

    # check to see how many dataset items are comparable
    comparable, full_comparable = [], []
    prepared = prepare(dataset.full_data)

    if skips_f:
        skips = readSkips(skips_f)
        skips = skips.get(dataset.name, set())
    else:
        skips = set()

    for i in range(len(prepared)):
        if i in skips: continue
        full_datum = prepared[i]
        (e_1, str_1, e_2, str_2, _) = full_datum
        if ent_emb_wrapper.knows(e_1) and ent_emb_wrapper.knows(e_2) \
                and str_emb_wrapper.knows(str_1) and str_emb_wrapper.knows(str_2):
            comparable.append(full_datum)
            full_comparable.append(dataset.full_data[i])
        else:
            log.writeln('SKIPPING %d' % i)

    gold, pred = [], []
    for (e_1, str_1, e_2, str_2, gold_metric) in comparable:
        gold.append(gold_metric)
        scores = [
            sim_metric(ent_emb_wrapper[e_1], ent_emb_wrapper[e_2]),
            sim_metric(str_emb_wrapper[str_1], str_emb_wrapper[str_2])
        ]
        if use_cross:
            if cross_only:
                scores = []
            scores.append(sim_metric(ent_emb_wrapper[e_1], str_emb_wrapper[str_2]))
            scores.append(sim_metric(str_emb_wrapper[str_1], ent_emb_wrapper[e_2]))
        if use_mean:
            pred.append(np.mean(scores))
        else:
            pred.append(np.sum(scores))

    if log_predictions:
        logPredictions(full_comparable, pred, gold, dataset.name, log=log)

    (rho, _) = spearmanr(gold, pred)
    return rho, len(comparable), len(dataset.data)
Code example #6
def calculateNearestNeighbors(embeds, outf, top_k=100, batch_size=100, threads=1):
    log.writeln('Calculating nearest neighbors')

    keys = tuple(embeds.keys())
    emb_list = [embeds[k] for k in keys]

    all_ixes = range(len(keys))
    thread_chunks = util.prepareForParallel(all_ixes, threads, data_only=True)

    nn_q = mp.Queue()

    calc_threads = [
        mp.Process(target=_threadedNearestNeighbors, args=(thread_chunks[i], batch_size, top_k, emb_list, nn_q))
            for i in range(threads)
    ]
    collator = mp.Process(target=_collate, args=(keys, (len(keys)//batch_size)+1, nn_q, outf))

    collator.start()
    util.parallelExecute(calc_threads)
    nn_q.put(_SIGNALS.HALT)
    collator.join()
Code example #7
def getAllMentions(datasets, log=log, mention_map_file=None):
    ds_map = {}

    # pre-generate the vocabulary of all datasets
    all_sentences = []
    for ds in datasets:
        all_sentences.extend(ds.sentences_words)
    prepVocabulary(all_sentences,
                   datasets[0].config['Experiment']['TotalVocab'])

    params = ELMoParams(
        options_file=datasets[0].config['ELMo']['Options'],
        weights_file=datasets[0].config['ELMo']['Weights'],
        vocab_file=datasets[0].config['Experiment']['TotalVocab'],
        max_char_len=int(datasets[0].config['ELMo']['MaxCharLen']),
    )
    elmo_batch_size = int(datasets[0].config['ELMo']['BatchSize'])

    sess = tf.Session()
    elmo = ELMoRunner(sess, params)

    samples = []
    for ds in datasets:
        log.writeln('\nProcessing dataset %s...' % ds.name)
        _getELMoMentions(ds.sentences_words,
                         ds.sentences_instances,
                         ds.labels,
                         ds.name,
                         samples,
                         ds_map,
                         elmo,
                         batch_size=elmo_batch_size)

    if mention_map_file:
        with open(mention_map_file, 'w') as stream:
            for (mention_ID, (ds_name, instance_ID, lemma)) in ds_map.items():
                stream.write('%d\t%s\t%s\t%s\n' %
                             (mention_ID, ds_name, instance_ID, lemma))

    return samples
Code example #8
def enumerateWordNetPairs(vocab, outf, write_lemma=False):
    data = []
    in_vocab = lambda synset: synset.lemmas()[0].name() in vocab
    for pos in ['n', 'v', 'a', 'r']:
        n_pairs = 0
        log.writeln('Processing POS "%s"' % pos)
        log.track(message='  >> Processed {0:,} source synsets ({1:,} pairs)', writeInterval=100)
        for synset in wn.all_synsets(pos):
            if in_vocab(synset):
                for (getter, lbl) in [
                    (synset.hyponyms, dataset.Hyponym),
                    (synset.hypernyms, dataset.Hypernym),
                    (synset.member_holonyms, dataset.Holonym),
                    (synset.substance_holonyms, dataset.Holonym),
                    (synset.part_holonyms, dataset.Holonym),
                    (synset.member_meronyms, dataset.Meronym),
                    (synset.substance_meronyms, dataset.Meronym),
                    (synset.part_meronyms, dataset.Meronym),
                ]:
                    for sink in getter():
                        if in_vocab(sink):
                            if write_lemma:
                                src = synset.lemmas()[0].name()
                                snk = sink.lemmas()[0].name()
                            else:
                                src = synset.name()
                                snk = sink.name()

                            data.append((
                                len(data),
                                src,
                                snk,
                                lbl
                            ))
                            n_pairs += 1
            log.tick(n_pairs)
        log.flushTracker(n_pairs)
        log.writeln('')

    dataset.write(data, outf)
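
A sketch of calling this with NLTK's WordNet; the vocabulary below is a small illustrative set and the output path is a placeholder (the `dataset` writer and relation labels are assumed from the surrounding project):

vocab = {'dog', 'cat', 'animal', 'tail', 'pack'}
enumerateWordNetPairs(vocab, 'wordnet_pairs.tsv', write_lemma=True)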
Code example #9
def getELMoRepresentations(sentences_words, sentences_instances, semcor_labels,
        unique_sense_IDs, bilm_params):

    sense_embeddings = {}
    for sense_ID in unique_sense_IDs:
        sense_embeddings[sense_ID] = []

    with tf.Session() as sess:
        log.writeln('  (1) Setting up ELMo')
        elmo = ELMoRunner(sess, bilm_params)

        # batch up the data
        sentence_ids = elmo.preprocess(sentences_words)

        batch_size = 25
        num_batches = math.ceil(sentence_ids.shape[0] / batch_size)
        batch_start = 0
        log.writeln('  (2) Extracting sense embeddings from sentences')
        log.track(message='    >> Processed {0}/{1:,} batches'.format('{0:,}',num_batches), writeInterval=5)
        while batch_start < sentence_ids.shape[0]:
            batch_sentence_ids = sentence_ids[batch_start:batch_start + batch_size]
            elmo_sentence_input_ = elmo(batch_sentence_ids)

            for i in range(elmo_sentence_input_.shape[0]):
                sentence_indices = sentences_instances[batch_start+i]
                for (instance_ID, ix) in sentence_indices:
                    senses = semcor_labels[instance_ID]
                    for sense in senses:
                        sense_embeddings[sense].append(
                            elmo_sentence_input_[i][ix]
                        )

            log.tick()
            batch_start += batch_size
    log.flushTracker()

    log.writeln('  (3) Calculating mean per-sense embeddings')
    mean_sense_embeddings = pyemblib.Embeddings()
    for (sense_ID, embedding_list) in sense_embeddings.items():
        if len(embedding_list) > 0:
            mean_sense_embeddings[sense_ID] = np.mean(embedding_list, axis=0)
        else:
            log.writeln('[WARNING] Sense ID "%s" found no embeddings' % sense_ID)
    
    return mean_sense_embeddings
Code example #10
File: nn_saver.py  Project: drgriffis/knn-embedding
def KNearestNeighbors(emb_arr,
                      node_IDs,
                      top_k,
                      neighbor_file,
                      threads=2,
                      batch_size=5,
                      completed_neighbors=None):
    '''Compute the top_k nearest neighbors of each row of emb_arr by cosine
    similarity, streaming results to neighbor_file as they are produced.

    node_IDs maps row indices in emb_arr back to node/vocabulary identifiers;
    completed_neighbors, if given, is a set of row indices whose neighbors
    were already computed (used to resume a partial run).
    '''
    # set up threads
    log.writeln('1 | Thread initialization')
    all_indices = list(range(len(emb_arr)))
    if completed_neighbors:
        # drop rows whose neighbors were already computed in a previous run
        filtered_indices = [ix for ix in all_indices if ix not in completed_neighbors]
        all_indices = filtered_indices
        log.writeln('  >> Filtered out {0:,} completed indices'.format(
            len(emb_arr) - len(filtered_indices)))
        log.writeln('  >> Filtered set size: {0:,}'.format(len(all_indices)))
    #index_subsets = util.prepareForParallel(list(range(len(emb_arr))), threads-1, data_only=True)
    index_subsets = util.prepareForParallel(all_indices,
                                            threads - 1,
                                            data_only=True)
    nn_q = mp.Queue()
    nn_writer = mp.Process(target=_nn_writer,
                           args=(neighbor_file, node_IDs, nn_q))
    computers = [
        mp.Process(target=_threadedNeighbors,
                   args=(index_subsets[i], emb_arr, batch_size, top_k, nn_q))
        for i in range(threads - 1)
    ]
    nn_writer.start()
    log.writeln('2 | Neighbor computation')
    util.parallelExecute(computers)
    nn_q.put(_SIGNALS.HALT)
    nn_writer.join()
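
A sketch of one call, assuming emb_arr is an (N, d) NumPy array and node_IDs maps its row indices back to node identifiers, as prepared in the driver excerpt of example #28; the output file name is a placeholder:

KNearestNeighbors(
    emb_arr,                        # (N, d) array of embedding vectors
    node_IDs,                       # row index -> node ID, same order as emb_arr
    top_k=25,
    neighbor_file='neighbors.tsv',
    threads=4,
    batch_size=64,
)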
Code example #11
def buildGraph(neighbor_files, k):
    log.writeln('Building neighborhood graph...')
    graph = {}

    # construct frequency-weighted edges
    log.track(message='  >> Loaded {0}/%d neighborhood files' % len(neighbor_files), writeInterval=1)
    for neighbor_file in neighbor_files:
        neighborhoods = readNeighbors(neighbor_file, k)
        for (source, neighbors) in neighborhoods.items():
            if graph.get(source, None) is None:
                graph[source] = {}
            for nbr in neighbors:
                graph[source][nbr] = graph[source].get(nbr, 0) + 1
        log.tick()
    log.flushTracker()

    log.writeln('  >> Normalizing edge weights...')
    max_count = float(len(neighbor_files))
    for (source, neighborhood) in graph.items():
        for (nbr, freq) in neighborhood.items():
            graph[source][nbr] = freq/max_count

    log.writeln('Graph complete!')
    return graph
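
A sketch of building and inspecting such a graph; the file names and node key below are placeholders:

graph = buildGraph(['nn_run1.tsv', 'nn_run2.tsv', 'nn_run3.tsv'], k=10)
# each edge weight is the fraction of neighborhood files in which that neighbor appeared
for nbr, weight in sorted(graph.get('some_node', {}).items(), key=lambda kv: -kv[1])[:5]:
    print(nbr, weight)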
Code example #12
                help='number of threads to use for parallel calculation (default: %default)',
                type='int', default=1)
        parser.add_option('--batch-size', dest='batch_size',
                help='number of samples to process in each batch (default: %default)',
                type='int', default=25)
        parser.add_option('--keys', dest='keysf',
                help='file listing keys to restrict NN analysis to')
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        embf, outf = args
        return embf, options.mode, options.keysf, outf, options.top_k, options.batch_size, options.threads, options.logfile
    embf, embf_mode, keysf, outf, top_k, batch_size, threads, logfile = _cli()

    if keysf:
        keys = readKeys(keysf)
        print("Read %d keys to restrict to" % len(keys))
    else:
        keys = None

    t = log.startTimer('Reading embeddings...', newline=False)
    embeds = pyemblib.read(embf, mode=embf_mode, filter_to=keys, lower_keys=True)
    log.stopTimer(t, message='Done! Read %d embeddings ({0:.2f}s)' % len(embeds))

    nearest_neighbors = calculateNearestNeighbors(embeds, outf, top_k=top_k, batch_size=batch_size, threads=threads)
    log.writeln('Wrote nearest neighbors to %s.' % outf)
Code example #13
    def _build(self):
        self._input = tf.placeholder(dtype=tf.float32,
                                     shape=[None, 2, self.p.embedding_dim],
                                     name='embedding_pair_input')
        self._labels = tf.placeholder(dtype=tf.int32,
                                      shape=[None],
                                      name='labels')
        if self._debug:
            log.writeln(str(self._input))
            log.writeln(str(self._labels))
            _input = tf.Print(self._input, [self._input],
                              summarize=_SUMMARIZE,
                              message='Embedding input')
            labels = tf.Print(self._labels, [self._labels],
                              summarize=_SUMMARIZE,
                              message='Labels')
        else:
            _input = self._input
            labels = self._labels

        conv_filters = tf.Variable(
            tf.truncated_normal(
                [
                    2,  # filter height is always 2
                    self.p.filter_width,
                    1,
                    self.p.num_filters
                ],
                #[2, 2, 1, self.p.num_filters],
                #[2, 2, 1, 1],
                stddev=0.5))
        if self._debug:
            log.writeln(str(conv_filters))
            conv_filters = tf.Print(conv_filters, [conv_filters],
                                    summarize=_SUMMARIZE,
                                    message='Convolutional filters')

        cnn = tf.nn.conv2d(
            input=tf.reshape(self._input, [-1, 2, self.p.embedding_dim, 1]),
            filter=conv_filters,
            #strides=[1, 1, 1, 1],
            #strides=[1, 1, 2, 1],
            strides=[1, self.p.filter_vstride, self.p.filter_hstride, 1],
            padding="SAME",
            name='CNN_op')
        if self._debug:
            log.writeln(str(cnn))
            cnn = tf.Print(cnn, [cnn],
                           summarize=_SUMMARIZE,
                           message='CNN output')

        pooled = tf.nn.max_pool(
            value=cnn,
            #ksize=[1, 2, self.p.embedding_dim, 1],
            #strides=[1, 2, self.p.embedding_dim, 1],
            #ksize=[1, 2, 2, 1],
            ksize=[
                1,
                (self.p.filter_vstride % 2) + 1,  # pool height is determined
                # by filter vstride
                self.p.pool_width,
                1
            ],
            #strides=[1, 2, 2, 1],
            strides=[
                1,
                (self.p.filter_vstride % 2) + 1,  # always reduces to 1,
                self.p.pool_hstride,
                1
            ],
            padding='SAME',
            name='max_pooled_CNN')
        if self._debug:
            log.writeln(str(pooled))
            pooled = tf.Print(pooled, [pooled],
                              summarize=_SUMMARIZE,
                              message='Max pooled CNN output')

        #pooled = tf.squeeze(
        #    pooled,
        #    #axis=[1,2]
        #    axis=[1,3]
        #)
        pooled = tf.reshape(
            pooled,
            shape=[
                -1,  # batch_size
                ((self.p.embedding_dim // self.p.pool_hstride) *
                 self.p.num_filters)
            ])
        if self._debug:
            log.writeln(str(pooled))
            pooled = tf.Print(pooled, [pooled],
                              summarize=_SUMMARIZE,
                              message='Squeezed pooled')

        pooled = tf.nn.dropout(pooled, 0.6, name='pooled_with_dropout')
        if self._debug:
            log.writeln(str(pooled))
            pooled = tf.Print(pooled, [pooled],
                              summarize=_SUMMARIZE,
                              message='Dropout pooled')

        full = tf.contrib.layers.fully_connected(pooled,
                                                 self.p.fully_connected_dim,
                                                 activation_fn=tf.nn.relu)
        if self._debug:
            log.writeln(str(full))
            full = tf.Print(full, [full],
                            summarize=_SUMMARIZE,
                            message='Fully connected output')

        output_layer = tf.contrib.layers.fully_connected(
            full,
            self.p.num_classes,
            #activation_fn=tf.nn.relu
            activation_fn=None)
        if self._debug:
            log.writeln(str(output_layer))
            output_layer = tf.Print(output_layer, [output_layer],
                                    summarize=_SUMMARIZE,
                                    message='Output layer')

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self._labels,
            logits=output_layer,
        )
        self._batch_loss = tf.reduce_sum(loss)

        self._scores = tf.nn.softmax(output_layer)
        self._predictions = tf.argmax(self._scores, axis=1)

        optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
                                           beta1=0.9,
                                           beta2=0.999,
                                           epsilon=1e-08)
        self._train_step = optimizer.minimize(loss)
Code example #14
File: significance.py  Project: OSU-slatelab/JET
def getStatistics(f1, f2):
    preds1 = readPredictions(f1)
    preds2 = readPredictions(f2)

    for ds in preds1.keys():
        log.writeln(('\n\n{0}\n### %s\n{0}\n\n'.format('#' * 80)) % ds)
        (lbl_scores_1, gold_1) = preds1[ds]
        (lbl_scores_2, gold_2) = preds2[ds]

        (ab, ab_size) = correlation(lbl_scores_1, lbl_scores_2,
                                    '%s -- A vs B' % ds)
        (at, at_size) = correlation(lbl_scores_1, gold_1,
                                    '%s -- A vs GOLD' % ds)
        (bt, bt_size) = correlation(lbl_scores_2, gold_2,
                                    '%s -- B vs GOLD' % ds)

        log.writeln("\n -- %s Agreement summary --" % ds)
        log.writeln("  |r_bt - r_at| = %f" % abs(at - bt))
        log.writeln("  r_ab = %f (%d)" % (ab, ab_size))
        log.writeln("  r_at = %f (%d)" % (at, at_size))
        log.writeln("  r_bt = %f (%d)" % (bt, bt_size))
Code example #15
File: significance.py  Project: OSU-slatelab/JET
                                    '%s -- A vs B' % ds)
        (at, at_size) = correlation(lbl_scores_1, gold_1,
                                    '%s -- A vs GOLD' % ds)
        (bt, bt_size) = correlation(lbl_scores_2, gold_2,
                                    '%s -- B vs GOLD' % ds)

        log.writeln("\n -- %s Agreement summary --" % ds)
        log.writeln("  |r_bt - r_at| = %f" % abs(at - bt))
        log.writeln("  r_ab = %f (%d)" % (ab, ab_size))
        log.writeln("  r_at = %f (%d)" % (at, at_size))
        log.writeln("  r_bt = %f (%d)" % (bt, bt_size))


if __name__ == '__main__':

    def _cli():
        import optparse
        parser = optparse.OptionParser(usage='Usage: %prog LOG1 LOG2')
        parser.add_option('-l', '--logfile', dest='logfile')
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        return args, options.logfile

    (f1, f2), logfile = _cli()
    log.start(logfile=logfile, stdout_also=True)
    log.writeln('A: %s' % f1)
    log.writeln('B: %s' % f2)
    getStatistics(f1, f2)
Code example #16
def crossValidationSplits(dataset, n_folds, dev_size, persistent_path=None, random_seed=1, log=log):
    if persistent_path and os.path.isfile('%s.fold-0.train' % persistent_path):
        log.writeln('Reading pre-existing cross validation splits from %s.' % persistent_path)
        splits = readSplits(persistent_path, n_folds, id_cast=int)
    else:
        log.writeln('Generating cross-validation splits...')
        np.random.seed(random_seed)

        ids_by_class, classes = stratifyByClass(dataset)

        total_size = 0
        for (lbl, ids) in ids_by_class.items():
            total_size += len(ids)
        log.writeln('  Dataset size: {0:,}'.format(total_size))
        log.writeln('  Number of classes: {0:,}'.format(len(classes)))

        # shuffle it
        for _class in classes:
            np.random.shuffle(ids_by_class[_class])

        # figure out how many points of each class per fold
        fold_size_by_class, dev_size_by_class = getFoldAndDevSizeByClass(
            ids_by_class, n_folds, dev_size
        )

        labeled_splits, id_splits = [], []
        for i in range(n_folds):
            train_by_class = {}
            for _class in classes:
                train_by_class[_class] = []

            for j in range(n_folds):
                fold_by_class = {}
                for _class in classes:
                    fold_size = fold_size_by_class[_class]
                    if j < (n_folds - 1):
                        fold_by_class[_class] = ids_by_class[_class][j*fold_size:(j+1)*fold_size]
                    else:
                        fold_by_class[_class] = ids_by_class[_class][j*fold_size:]

                if j == i:
                    test_by_class = fold_by_class.copy()
                else:
                    for (_class, subset) in fold_by_class.items():
                        train_by_class[_class].extend(subset)

            # sample out dev data
            train_by_class, dev_by_class = subsampleDevByClass(
                train_by_class, dev_size_by_class
            )

            # collapse train, dev, test to flat ID lists
            lbl_train, id_train = collapseFromByClass(train_by_class)
            lbl_dev, id_dev = collapseFromByClass(dev_by_class)
            lbl_test, id_test = collapseFromByClass(test_by_class)
            
            labeled_splits.append((lbl_train, lbl_dev, lbl_test))
            id_splits.append((id_train, id_dev, id_test))

            log.writeln('  Fold {0} -- Train: {1:,}  Dev: {2:,}  Test: {3:,}'.format(
                i+1, len(id_train), len(id_dev), len(id_test)
            ))

        if persistent_path:
            log.writeln('Writing cross validation splits to %s.' % persistent_path)
            writeSplits(labeled_splits, persistent_path)

        splits = id_splits
    log.writeln()

    return splits
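
The returned splits are one (train, dev, test) triple of instance IDs per fold; a sketch of consuming them (the dataset object, the interpretation of dev_size, and the file path are taken as given by the helpers stratifyByClass and getFoldAndDevSizeByClass, which are not shown here):

splits = crossValidationSplits(dataset, n_folds=5, dev_size=0.1,
                               persistent_path='splits/my_experiment')
for fold, (train_ids, dev_ids, test_ids) in enumerate(splits, start=1):
    # train/evaluate a model restricted to these instance IDs (project-specific)
    print('Fold %d: %d train / %d dev / %d test' %
          (fold, len(train_ids), len(dev_ids), len(test_ids)))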
Code example #17
def runModel(mentions,
             entity_embeds,
             ctx_embeds,
             minibatch_size,
             preds_file,
             debug=False,
             secondary_entity_embeds=None,
             entity_combo_method=None,
             using_mention=False,
             preds_file_detailed=None,
             preferred_strings=None,
             preds_file_polysemy=None,
             polysemy=None):
    entity_vocab, entity_arr = entity_embeds.toarray()
    ctx_vocab, ctx_arr = ctx_embeds.toarray()
    if secondary_entity_embeds:
        secondary_entity_vocab, secondary_entity_arr = secondary_entity_embeds.toarray(
        )
        secondary_entity_arr_2 = []
        for v in secondary_entity_vocab:
            secondary_entity_arr_2.append(np.array(secondary_entity_embeds[v]))
        secondary_entity_arr_2 = np.array(secondary_entity_arr_2)
    else:
        secondary_entity_vocab, secondary_entity_arr = None, None

    ent_ixer = Indexer(entity_vocab)
    ctx_ixer = Indexer(ctx_vocab)
    if secondary_entity_embeds:
        secondary_ent_ixer = Indexer(secondary_entity_vocab)
    else:
        secondary_ent_ixer = None

    max_num_entities = 0
    for m in mentions:
        if len(m.candidates) > max_num_entities:
            max_num_entities = len(m.candidates)

    max_mention_size = 0
    for m in mentions:
        n_tokens = len(m.mention_text.split())
        if n_tokens > max_mention_size:
            max_mention_size = n_tokens

    window_size = 5
    params = LLParams(
        ctx_vocab_size=len(ctx_vocab),
        ctx_dim=ctx_embeds.size,
        entity_vocab_size=len(entity_vocab),
        entity_dim=entity_embeds.size,
        secondary_entity_vocab_size=(0 if not secondary_entity_embeds else
                                     len(secondary_entity_vocab)),
        secondary_entity_dim=(0 if not secondary_entity_embeds else
                              secondary_entity_embeds.size),
        window_size=window_size,
        max_num_entities=max_num_entities,
        max_mention_size=max_mention_size,
        entity_combo_method=entity_combo_method,
        using_mention=using_mention)

    session = tf.Session()
    lll = LinearSabbirLinkerC(
        session,
        np.array(ctx_arr),
        np.array(entity_arr),
        params,
        debug=debug,
        # only wrap the secondary array when secondary embeddings were provided
        secondary_entity_embed_arr=(None if secondary_entity_arr is None
                                    else np.array(secondary_entity_arr)))

    log.track(message='   >>> Processed {0} batches', writeInterval=10)

    if secondary_entity_embeds:
        ent_vs_sec = McNemars()
        ent_vs_joint = McNemars()
        sec_vs_joint = McNemars()
        joint_vs_oracle = McNemars()

    correct, total = 0., 0
    batch_start = 0
    oracle = {}
    while (batch_start < len(mentions)):
        next_batch_mentions = mentions[batch_start:batch_start +
                                       minibatch_size]
        next_batch = [
            prepSample(mention,
                       ent_ixer,
                       ctx_ixer,
                       window_size,
                       max_mention_size,
                       max_num_entities,
                       secondary_ent_ixer=secondary_ent_ixer)
            for mention in next_batch_mentions
        ]

        batch_ctx_window_ixes = [
            next_batch[i][0] for i in range(len(next_batch))
        ]
        batch_ctx_window_masks = [
            next_batch[i][1] for i in range(len(next_batch))
        ]
        batch_mention_ixes = [next_batch[i][2] for i in range(len(next_batch))]
        batch_mention_masks = [
            next_batch[i][3] for i in range(len(next_batch))
        ]
        batch_entity_ixes = [next_batch[i][4] for i in range(len(next_batch))]
        batch_entity_masks = [next_batch[i][5] for i in range(len(next_batch))]
        if secondary_entity_embeds:
            batch_secondary_entity_ixes = [
                next_batch[i][6] for i in range(len(next_batch))
            ]
        else:
            batch_secondary_entity_ixes = None

        results = lll.getPredictions(
            batch_ctx_window_ixes,
            batch_ctx_window_masks,
            batch_entity_ixes,
            batch_entity_masks,
            batch_secondary_entity_ixes=batch_secondary_entity_ixes,
            batch_mention_ixes=batch_mention_ixes,
            batch_mention_masks=batch_mention_masks,
            oracle=True)
        if secondary_entity_embeds:
            (preds, probs, ent_preds, secondary_ent_preds) = results
        else:
            (preds, probs, ent_preds) = results
        for i in range(len(next_batch)):
            (_, _, _, _, ent_ixes, _, _, correct_candidate,
             mention) = next_batch[i]
            # base accuracy eval
            predicted_ix = ent_ixes[preds[i]]
            if predicted_ix == correct_candidate:
                correct += 1
            total += 1

            # oracle eval
            joint_correct, entity_correct, secondary_correct, oracle_correct = False, False, False, False
            if ent_ixes[ent_preds[i]] == correct_candidate:
                entity_correct = True
                oracle['entity_correct'] = oracle.get('entity_correct', 0) + 1
            if secondary_entity_embeds and ent_ixes[
                    preds[i]] == correct_candidate:
                joint_correct = True
                oracle['joint_correct'] = oracle.get('joint_correct', 0) + 1
            if secondary_entity_embeds and ent_ixes[
                    secondary_ent_preds[i]] == correct_candidate:
                secondary_correct = True
                oracle['secondary_correct'] = oracle.get(
                    'secondary_correct', 0) + 1
            if entity_correct or secondary_correct:
                oracle_correct = True
                oracle['oracle_correct'] = oracle.get('oracle_correct', 0) + 1

            # significance tracking
            if secondary_entity_embeds:
                # entity vs secondary
                if entity_correct and secondary_correct:
                    ent_vs_sec.a += 1
                elif entity_correct and (not secondary_correct):
                    ent_vs_sec.b += 1
                elif (not entity_correct) and secondary_correct:
                    ent_vs_sec.c += 1
                else:
                    ent_vs_sec.d += 1
                # entity vs joint
                if entity_correct and joint_correct:
                    ent_vs_joint.a += 1
                elif entity_correct and (not joint_correct):
                    ent_vs_joint.b += 1
                elif (not entity_correct) and joint_correct:
                    ent_vs_joint.c += 1
                else:
                    ent_vs_joint.d += 1
                # secondary vs joint
                if secondary_correct and joint_correct:
                    sec_vs_joint.a += 1
                elif secondary_correct and (not joint_correct):
                    sec_vs_joint.b += 1
                elif (not secondary_correct) and joint_correct:
                    sec_vs_joint.c += 1
                else:
                    sec_vs_joint.d += 1
                # joint vs oracle
                if joint_correct and oracle_correct:
                    joint_vs_oracle.a += 1
                elif joint_correct and (not oracle_correct):
                    joint_vs_oracle.b += 1
                elif (not joint_correct) and oracle_correct:
                    joint_vs_oracle.c += 1
                else:
                    joint_vs_oracle.d += 1

            # predictions + scores
            if preds_file:
                preds_file.write('Probs: [ %s ]  Pred: %d -> %d  Gold: %d\n' %
                                 (' '.join([str(p)
                                            for p in probs[i]]), preds[i],
                                  ent_ixes[preds[i]], correct_candidate))

            # predictions + corpus polysemy of correct entity
            if preds_file_polysemy:
                try:
                    line = '%d\t%f\n' % (
                        (1 if predicted_ix == correct_candidate else 0),
                        polysemy[ent_ixer[predicted_ix]])
                    preds_file_polysemy.write(line)
                except KeyError:
                    pass

            # predictions, in detail
            if preds_file_detailed:
                keys = ['all']
                if secondary_entity_embeds:
                    pred_ixes = [('Pred (Joint)', ent_ixes[preds[i]]),
                                 ('Pred (Ent)', ent_ixes[ent_preds[i]]),
                                 ('Pred (Defn)',
                                  ent_ixes[secondary_ent_preds[i]])]
                    if entity_correct and secondary_correct:
                        comp_stream_key = 'both_correct'
                    elif entity_correct and (not secondary_correct):
                        comp_stream_key = 'entity_only_correct'
                    elif (not entity_correct) and secondary_correct:
                        comp_stream_key = 'secondary_only_correct'
                    else:
                        comp_stream_key = 'both_wrong'
                    keys.append(comp_stream_key)
                    #if entity_correct and secondary_correct and joint_correct:
                    #    joint_stream_key = None
                    #if entity_correct and secondary_correct and (not joint_correct):
                    #    joint_stream_key = 'ent_sec_no-joint'
                    #elif entity_correct and joint_correct and (not secondary_correct):
                    #    joint_stream_key = 'ent_and_joint'
                    #elif (not entity_correct) and joint_correct and secondary_correct:
                    #    joint_stream_key = 'sec_and_joint'
                    #elif joint_correct and (not entity_correct) and (not secondary_correct):
                    #    joint_stream_key = 'joint_only'
                    #elif entity_correct and (not joint_correct) and (not secondary_correct):
                    #    joint_stream_key = 'ent_no-joint'
                    #elif (not entity_correct) and (not joint_correct) and secondary_correct:
                    #    joint_stream_key = 'sec_no-joint'
                    #elif (not entity_correct) and (not joint_correct) and (not secondary_correct):
                    #    joint_stream_key = None
                    #keys.append(joint_stream_key)
                    if (not entity_correct) and joint_correct:
                        keys.append('ent_joint_help')
                    elif entity_correct and (not joint_correct):
                        keys.append('ent_joint_hurt')
                    if (not secondary_correct) and joint_correct:
                        keys.append('sec_joint_help')
                    if secondary_correct and (not joint_correct):
                        keys.append('sec_joint_hurt')
                else:
                    pred_ixes = [('Pred', predicted_ix)]
                    if entity_correct:
                        stream_key = 'entity_correct'
                    else:
                        stream_key = 'entity_wrong'
                    keys.append(stream_key)
                for k in keys:
                    _writeDetailedOutcome(preds_file_detailed[k], mention,
                                          probs, batch_entity_ixes,
                                          batch_entity_masks, ent_ixer,
                                          preferred_strings, correct_candidate,
                                          pred_ixes, i)

        batch_start += minibatch_size
        log.tick()
    log.flushTracker()

    # the McNemar's tables only exist when a secondary embedding set was given
    if secondary_entity_embeds:
        for (msg, mcn) in [('Entity vs Defn', ent_vs_sec),
                           ('Entity vs Joint', ent_vs_joint),
                           ('Defn vs Joint', sec_vs_joint),
                           ('Joint vs Oracle', joint_vs_oracle)]:
            chi2, pval = mcn.run()
            log.writeln('\n%s\n'
                        '    | a = %5d | b = %5d |\n'
                        '    | c = %5d | d = %5d |\n'
                        '  Chi^2 = %f  P-value = %f\n' %
                        (msg, mcn.a, mcn.b, mcn.c, mcn.d, chi2, pval))

    return correct, total, oracle
Code example #18
File: experiment.py  Project: OSU-slatelab/JET
    log.start(logfile=options.logfile, stdout_also=True)

    configlogger.writeConfig(output=log, settings=[
        ('Dataset', options.mode),
        ('Using skip indices', ('None' if not options.skips_f else options.skips_f)),
        ('Embedding settings', em.logCLIOptions(options)),
        ('Scoring settings', OrderedDict([
            ('Combination of entity and string', options.use_combo),
            ('Cross comparison of entity/string', options.use_cross),
            ('Cross comparison only', options.cross_only),
            ('Using mean of scores instead of sum', options.use_mean)
        ])),
    ], title='Similarity/Relatedness experiment')

    if not options.use_combo:
        log.writeln('\nMode: %s   Method: %s\n' % (options.mode, em.name(options.repr_method)))
        separator = '\t' if options.tab_sep else ' '
        emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)
    else:
        log.writeln('\nMode: %s   Method: COMBO\n' % options.mode)
        ent_embf, word_embf = options.ent_embf, options.word_embf
        separator = '\t' if options.tab_sep else ' '

        options.repr_method = em.ENTITY
        options.word_embf = None
        ent_emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)

        options.repr_method = em.WORD
        options.ent_embf = None
        options.word_embf = word_embf
        str_emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)
Code example #19
    configlogger.writeConfig(log, [
        ('SemCor', [
            ('XML', config['SemCor']['XML']),
            ('Labels', config['SemCor']['Labels']),
        ]),
        ('Output file', config['SemCor']['Lemmas']),
    ])

    t_sub = log.startTimer('Pre-processing SemCor text from %s...' %
                           config['SemCor']['XML'])
    (sentences_words, sentences_instances) = wsd_parser.processSentences(
        config['SemCor']['XML'], get_lemmas=True)
    log.stopTimer(t_sub,
                  message='Read {0:,} sentences in {1}s.\n'.format(
                      len(sentences_words), '{0:.2f}'))

    log.writeln('Collecting set of SemCor lemmas...')
    lemmas = set()
    for sentence_instances in sentences_instances:
        for (instance_ID, ix, lemma) in sentence_instances:
            lemmas.add(lemma)
    log.writeln('Found {0:,} distinct lemmas.\n'.format(len(lemmas)))

    log.writeln('Writing list of lemmas to %s...' % config['SemCor']['Lemmas'])
    with codecs.open(config['SemCor']['Lemmas'], 'w', 'utf-8') as stream:
        for lemma in lemmas:
            stream.write('%s\n' % lemma)
    log.writeln('Done.\n')

    log.stop()
Code example #20
def ELMoBaseline(mentions, mention_map, backoff_preds, training_lemmas,
                 semcor_embeddings, output_predsf):
    log.writeln('Running ELMo baseline\n')

    # pre-norm the semcor embeddings
    log.writeln('Norming SemCor embeddings...')
    normed_semcor_embeddings = pyemblib.Embeddings()
    for (k, v) in semcor_embeddings.items():
        normed_semcor_embeddings[k] = (v / np.linalg.norm(v))
    #semcor_embeddings = normed_semcor_embeddings
    ordered_vocab, semcor_embeddings = normed_semcor_embeddings.toarray()
    semcor_embeddings = np.transpose(semcor_embeddings)
    log.writeln('Done.\n')

    predictions, correct = [], 0
    num_elmo, num_backoff = 0, 0

    log.track(
        message='  >> Processed {0:,}/%s samples ({1:,} ELMo, {2:,} backoff)' %
        ('{0:,}'.format(len(mentions))),
        writeInterval=5)
    for m in mentions:
        (ds, instance_ID, lemma) = mention_map[m.ID]
        if lemma in training_lemmas:
            #prediction = getNearestNeighborKey(m.context_repr, semcor_embeddings)
            prediction = getNearestNeighborKey2(m.context_repr,
                                                semcor_embeddings,
                                                ordered_vocab)
            num_elmo += 1
        else:
            prediction = backoff_preds[predictionID(ds, instance_ID)]
            num_backoff += 1
        predictions.append((m.ID, prediction))
        if prediction == m.CUI:
            correct += 1
        log.tick(num_elmo, num_backoff)
    log.flushTracker(num_elmo, num_backoff)

    writeWSDFrameworkPredictions(predictions, mention_map, output_predsf)
    log.writeln('\n-- ELMo baseline --')
    log.writeln('Accuracy: {0:.4f} ({1:,}/{2:,})\n'.format(
        float(correct) / len(predictions), correct, len(predictions)))
    log.writeln('# ELMo: {0:,}\n# backoff: {1:,}\n'.format(
        num_elmo, num_backoff))
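
getNearestNeighborKey2 is not shown in this excerpt. Given that the SemCor sense embeddings are unit-normed and then transposed above, one plausible implementation is a cosine-similarity argmax over the sense matrix; the following is a sketch under that assumption, not the project's actual code:

import numpy as np

def getNearestNeighborKey2(context_repr, emb_matrix_T, ordered_vocab):
    # emb_matrix_T is (dim, n_senses); each column is a unit-normed sense embedding
    query = context_repr / np.linalg.norm(context_repr)
    sims = np.matmul(query, emb_matrix_T)    # cosine similarity to every sense
    return ordered_vocab[int(np.argmax(sims))]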
Code example #21
        if (not options.inputf) or (not options.outputf):
            parser.print_help()
            exit()
        return options

    options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input file', options.inputf),
        ('Output file', options.outputf),
        ('# samples per class', options.size),
        ('Random seed', options.random_seed),
    ], 'WordNet dataset subsampling')

    log.writeln('Reading dataset from %s...' % options.inputf)
    ds = dataset.load(options.inputf)
    log.writeln('Read {0:,} samples.\n'.format(len(ds)))

    log.writeln('Collating by class...')
    collated = collateByClass(ds)
    classes = list(collated.keys())
    classes.sort()
    for c in classes:
        log.writeln('  {0} --> {1:,}'.format(c, len(collated[c])))
        if len(collated[c]) < options.size:
            log.writeln(
                '[WARNING] subsample size is too large for class "{0}"'.format(
                    c))

    log.writeln('\nSubsampling...')
Code example #22
            ('Output predictions file',
             options.elmo_baseline_eval_predictions),
            ('SemCor embeddings', options.semcor_embf),
            ('Training lemmas file', options.training_lemmasf),
            ('Pre-calculated WN first sense backoff predictions',
             options.wordnet_baseline_input_predictions),
        ]),
    ],
                             title="ELMo WSD baselines replication")

    t_sub = log.startTimer('Reading mentions from %s...' % mentionf,
                           newline=False)
    mentions = mention_file.read(mentionf)
    log.stopTimer(t_sub, message='Read %d mentions ({0:.2f}s)' % len(mentions))

    log.writeln('Reading mention dataset data from %s...' %
                options.mention_mapf)
    mention_map = dataset_map_utils.readDatasetMap(options.mention_mapf,
                                                   get_IDs=True,
                                                   get_lemmas=True)
    log.writeln('Mapped dataset info for {0:,} mentions.\n'.format(
        len(mention_map)))

    if options.wordnet_baseline_eval_predictions:
        wordnetFirstSenseBaseline(mentions, mention_map,
                                  options.wordnet_baseline_eval_predictions)
    if options.elmo_baseline_eval_predictions:
        log.writeln('Reading set of training lemmas from %s...' %
                    options.training_lemmasf)
        training_lemmas = readTrainingLemmas(options.training_lemmasf)
        log.writeln('Read {0:,} lemmas.\n'.format(len(training_lemmas)))
Code example #23
        parser.add_option('-k', dest='k',
                help='number of neighbors to use for edge construction (default: %default)',
                type='int', default=10)
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) == 0:
            parser.print_help()
            exit()
        neighbor_files = args
        return neighbor_files, options
    neighbor_files, options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        *[
            ('Neighborhood sample file %d' % (i+1), neighbor_files[i])
                for i in range(len(neighbor_files))
        ],
        ('Output file', options.outputf),
        ('Number of neighbors to include in edge construction', options.k),
    ], 'Nearest neighborhood graph generation')

    graph = buildGraph(neighbor_files, options.k)

    log.write('Writing graph to %s...' % options.outputf)
    writeGraph(graph, options.outputf)
    log.writeln('Done!')

    log.stop()
Code example #24
File: error_analysis.py  Project: OSU-slatelab/JET
            '-l',
            '--logfile',
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if len(args) != 3:
            parser.print_help()
            exit()
        if not options.logfile: options.logfile = '%s.analysis.log' % args[2]
        return args, options.logfile

    (resultsf, polysemyf, outf), logfile = _cli()
    log.start(logfile=logfile, stdout_also=True)

    log.writeln('Running sim/rel error analysis')
    log.writeln('  Results file: %s' % resultsf)
    log.writeln('  Polysemy file: %s' % polysemyf)
    log.writeln('  Output files: %s' % outf)

    results = readResults(resultsf)
    polysemy = readPolysemy(polysemyf)
    addPolysemy(results, polysemy)

    for (dataset, dataset_res) in results.items():
        log.writeln('\nDataset: %s' % dataset)
        outfile = '%s.%s.tsv' % (outf, dataset)
        writePolyErrors(dataset_res, outfile)
        log.writeln('  Wrote errors w/ polysemy to: %s' % outfile)

        (coefs, intercept, r_sq) = runRegression(dataset_res)
Code example #25
        return options

    options = _cli()

    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    t = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(
        t, message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}
    log.writeln('Writing remapped embeddings to %s...' % options.outputf)
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
    pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
    log.writeln('Done!')

    log.stop()
Code example #26
File: learnmap.py  Project: drgriffis/NeuralVecmap
def crossfoldTrain(src_embs,
                   trg_embs,
                   pivot_keys,
                   nfold,
                   activation,
                   num_layers,
                   batch_size=5,
                   checkpoint_file='checkpoint',
                   random_seed=None):

    project_batch_size = batch_size * 10

    pivot_keys = list(pivot_keys)
    if random_seed:
        random.seed(random_seed)
    random.shuffle(pivot_keys)

    fold_size = int(np.ceil(len(pivot_keys) / nfold))

    mapped_embs = {}
    src_keys = list(src_embs.keys())
    for k in src_keys:
        mapped_embs[k] = np.zeros([trg_embs.size])

    session = tf.Session()
    params = MapperParams(src_dim=src_embs.size,
                          trg_dim=trg_embs.size,
                          map_dim=trg_embs.size,
                          activation=activation,
                          num_layers=num_layers,
                          checkpoint_file=checkpoint_file)

    for i in range(nfold):
        log.writeln('  Starting fold %d/%d' % (i + 1, nfold))
        if random_seed:
            this_random = random_seed + i
        else:
            this_random = None
        model = ManifoldMapper(session, params, random_seed=this_random)

        fold_start, fold_end = (i * fold_size), ((i + 1) * fold_size)
        train_keys = pivot_keys[:fold_start]
        dev_keys = pivot_keys[fold_start:fold_end]
        train_keys.extend(pivot_keys[fold_end:])

        train(model,
              src_embs,
              trg_embs,
              train_keys,
              dev_keys,
              batch_size=batch_size)

        # get projections from this fold
        log.writeln('  Getting trained projections for fold %d' % (i + 1))
        log.track(message='    >> Projected {0}/%d keys' % len(src_keys),
                  writeInterval=10000)
        batch_start = 0
        while batch_start < len(src_keys):
            batch_keys = src_keys[batch_start:batch_start + project_batch_size]
            batch_src = np.array([src_embs[k] for k in batch_keys])
            batch_mapped = model.project_batch(batch_src)

            for j in range(batch_mapped.shape[0]):
                key = batch_keys[j]
                mapped_embs[key] += batch_mapped[j]
                log.tick()

            batch_start += project_batch_size
        log.flushTracker()

    # mean projections
    for k in src_keys:
        mapped_embs[k] /= nfold

    # get final MSE over full pivot set
    final_errors = []
    for k in pivot_keys:
        diff = mapped_embs[k] - trg_embs[k]
        final_errors.append(np.sum(diff**2) / 2)
    log.writeln('\nPivot error in final projections: %f' %
                np.mean(final_errors))

    return mapped_embs
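
A sketch of invoking this cross-fold training with the pivot terms validated in example #30; the activation value is assumed to be a TensorFlow activation callable accepted by MapperParams:

import tensorflow as tf

mapped_embs = crossfoldTrain(
    src_embs,
    trg_embs,
    validated_pivots,            # pivot keys present in both embedding sets
    nfold=5,
    activation=tf.nn.tanh,       # assumption: a TF activation function
    num_layers=2,
    batch_size=32,
    checkpoint_file='mapper.ckpt',
    random_seed=1,
)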
Code example #27
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' %
                      len(word_embeds))

        t_sub = log.startTimer('Reading entity definitions from %s...' %
                               options.entity_defnf,
                               newline=False)
        definitions = readDefinitions(options.entity_defnf)
        log.stopTimer(t_sub,
                      message='Read %d definitions ({0:.2f}s)' %
                      len(definitions))

        log.write('Constructing entity definition representations...')
        entity_defn_embeds = embedDefinitions(definitions, word_embeds)
        #del(word_embeds)
        log.writeln('Embedded %d entity definitions.' %
                    len(entity_defn_embeds))

        if options.entity_dualf:
            dual_embeds = pyemblib.Embeddings()
            for (k, v) in entity_defn_embeds.items():
                if k in entity_embeds:
                    dual_embeds[k] = np.concatenate([entity_embeds[k], v])
            log.writeln('Writing both versions of entity embeddings to %s...' %
                        options.entity_dualf)
            pyemblib.write(dual_embeds, options.entity_dualf)
            log.writeln('Wrote %d dual embeddings.' % len(dual_embeds))
    else:
        entity_defn_embeds = None

    if options.stringsf:
        t_sub = log.startTimer('Reading preferred strings from %s...' %
Code example #28
File: nn_saver.py  Project: drgriffis/knn-embedding
            ('Ordered vocabulary file', options.vocabf),
            ('Number of nearest neighbors', options.k),
            ('Batch size', options.batch_size),
            ('Number of threads', options.threads),
            ('Partial nearest neighbors file for resuming',
             options.partial_neighbors_file),
        ], 'k Nearest Neighbor calculation with cosine similarity')

    t_sub = log.startTimer('Reading embeddings from %s...' % embf)
    emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings in {1}s.\n'.format(
                      len(emb), '{0:.2f}'))

    if not os.path.isfile(options.vocabf):
        log.writeln('Writing node ID <-> vocab map to %s...\n' %
                    options.vocabf)
        writeNodeMap(emb, options.vocabf)
    else:
        log.writeln('Reading node ID <-> vocab map from %s...\n' %
                    options.vocabf)
    node_map = readNodeMap(options.vocabf)

    # get the vocabulary in node ID order, and map index in emb_arr
    # to node IDs
    node_IDs = list(node_map.keys())
    node_IDs.sort()
    ordered_vocab = [node_map[node_ID] for node_ID in node_IDs]

    emb_arr = np.array([emb[v] for v in ordered_vocab])

    if options.partial_neighbors_file:
Code example #29
    log.start(logfile=options.logfile)

    config = configparser.ConfigParser()
    config.read(options.config)

    analogy_file = datasets.getpath(options.dataset, config, eval_mode.ALL_INFO)

    configlogger.writeConfig(log, settings=[
        ('Config file', options.config),
        ('Dataset', options.dataset),
        ('Path to dataset', analogy_file),
        ('Lowercasing analogies', options.to_lower),
        ('Output vocab file', vocabf),
    ], title='Vocabulary extraction from analogy dataset')

    log.writeln('Reading %s analogies from %s...' % (options.dataset, analogy_file))
    analogies = parsers.parse(
        analogy_file,
        options.dataset,
        eval_mode.ALL_INFO,
        data_mode.String,
        to_lower=options.to_lower
    )
    log.writeln('Read {0:,} analogies in {1:,} relations.\n'.format(
        sum([len(anlg_set) for anlg_set in analogies.values()]),
        len(analogies)
    ))

    log.writeln('Extracting vocabulary...')
    vocab = set()
    for (_, anlg_set) in analogies.items():
Code example #30
File: learnmap.py  Project: drgriffis/NeuralVecmap
                             mode=options.src_embf_mode,
                             lower_keys=True)
    log.stopTimer(t_sub,
                  message='Read %d embeddings in {0:.2f}s' % len(src_embs))

    t_sub = log.startTimer('Reading target embeddings from %s...' %
                           options.trg_embf,
                           newline=False)
    trg_embs = pyemblib.read(options.trg_embf,
                             mode=options.trg_embf_mode,
                             lower_keys=True)
    log.stopTimer(t_sub,
                  message='Read %d embeddings in {0:.2f}s' % len(trg_embs))

    pivots = readPivotsFile(options.pivotf, tolower=True)
    log.writeln('Loaded %d pivot terms.' % len(pivots))

    # double check that pivots are present in both embedding files
    validated_pivots = set()
    for pivot in pivots:
        if (not pivot in src_embs) or (not pivot in trg_embs):
            log.writeln(
                '[WARNING] Pivot term "%s" not found in at least one embedding set'
                % pivot)
        else:
            validated_pivots.add(pivot)

    # write the experimental configuration
    configlogger.writeConfig('%s.config' % options.checkpointf,
                             title='DNN embedding mapping experiment',
                             settings=[