Exemple #1
0
def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file, vectorfile, upto=-1):
    """Build the entity-vs-type cosine ('tc') feature matrix and save it as HDF5.

    One row is produced per mention, in train, dev, test order. The entity
    embedding of each mention is looked up through ``men.entityId``; mentions
    whose id is missing from the embedding vocabulary keep an all-zero row.
    The rows are then compared against the type matrix returned by
    ``buildtypevecmatrix`` using ``buildcosinematrix`` (presumably a cosine
    similarity -- confirm against that helper).

    Args:
        trnMentions: Train mentions; each must expose ``entityId``.
        devMentions: Dev mentions; each must expose ``entityId``.
        tstMentions: Test mentions; each must expose ``entityId``.
        t2idx: Forwarded to ``buildtypevecmatrix`` (presumably a type-to-index
            mapping -- verify against that helper).
        hdf5_file: Output path prefix; ``'_tc.h5py'`` is appended to it.
        vectorfile: Embedding file handed to ``read_embeddings``.
        upto: Forwarded unchanged to ``read_embeddings``.

    Returns:
        None. The result is written to ``hdf5_file + '_tc.h5py'`` in a dataset
        named ``'tc'``, with train/dev/test row ranges stored in the file-level
        ``'split'`` attribute.
    """
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    # The matrix starts out as zeros, so only mentions with a known entity
    # embedding need to be written; the others keep their zero row.
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        if mye in voc2idx:
            input_entvec[i] = embeddings[voc2idx[mye]]

    # Original author's note on the type matrix: "a matrix with size: 102 * dim".
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)

    # Contiguous [start, stop) row ranges, in the same order the rows were built.
    split_dict = {
        'train': {'tc': (0, nsamples_train)},
        'dev': {'tc': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'tc': (nsamples_train + nsamples_dev, totals)}}

    hdf5_file += '_tc.h5py'
    # The context manager closes the file even if one of the writes raises,
    # so a failure no longer leaks the open handle.
    with h5py.File(hdf5_file, mode='w') as f:
        features = f.create_dataset('tc', ent_types_cosin_matrix.shape, dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
        features[...] = ent_types_cosin_matrix
        features.dims[0].label = 'types_ent_cosine'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
    logger.info('Building types-ent cosine (tc) dataset finished. It saved in: %s', hdf5_file)
def build_typecosine_ds(trnMentions,
                        devMentions,
                        tstMentions,
                        t2idx,
                        hdf5_file,
                        vectorfile,
                        upto=-1):
    """Build the entity-vs-type cosine ('tc') feature matrix and save it as HDF5.

    One row is produced per mention, in train, dev, test order. The entity
    embedding of each mention is looked up through ``men.entityId``; mentions
    whose id is missing from the embedding vocabulary keep an all-zero row.
    The rows are then compared against the type matrix returned by
    ``buildtypevecmatrix`` using ``buildcosinematrix`` (presumably a cosine
    similarity -- confirm against that helper).

    Args:
        trnMentions: Train mentions; each must expose ``entityId``.
        devMentions: Dev mentions; each must expose ``entityId``.
        tstMentions: Test mentions; each must expose ``entityId``.
        t2idx: Forwarded to ``buildtypevecmatrix`` (presumably a type-to-index
            mapping -- verify against that helper).
        hdf5_file: Output path prefix; ``'_tc.h5py'`` is appended to it.
        vectorfile: Embedding file handed to ``read_embeddings``.
        upto: Forwarded unchanged to ``read_embeddings``.

    Returns:
        None. The result is written to ``hdf5_file + '_tc.h5py'`` in a dataset
        named ``'tc'``, with train/dev/test row ranges stored in the file-level
        ``'split'`` attribute.
    """
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    # The matrix starts out as zeros, so only mentions with a known entity
    # embedding need to be written; the others keep their zero row.
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        if mye in voc2idx:
            input_entvec[i] = embeddings[voc2idx[mye]]

    # Original author's note on the type matrix: "a matrix with size: 102 * dim".
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize,
                                       voc2idx)
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)

    # Contiguous [start, stop) row ranges, in the same order the rows were built.
    split_dict = {
        'train': {
            'tc': (0, nsamples_train)
        },
        'dev': {
            'tc': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'tc': (nsamples_train + nsamples_dev, totals)
        }
    }

    hdf5_file += '_tc.h5py'
    # The context manager closes the file even if one of the writes raises,
    # so a failure no longer leaks the open handle.
    with h5py.File(hdf5_file, mode='w') as f:
        features = f.create_dataset('tc',
                                    ent_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
        features[...] = ent_types_cosin_matrix
        features.dims[0].label = 'types_ent_cosine'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
    logger.info(
        'Building types-ent cosine (tc) dataset finished. It saved in: %s',
        hdf5_file)
Exemple #3
0
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, upto=-1, max_num_words=4):
    """Build the name-words-vs-type cosine ('tcwords') matrix and save it as HDF5.

    One row is produced per mention, in train, dev, test order. Each mention
    name is split on whitespace, mapped to word indices by ``get_ngram_seq``,
    and represented by the mean of the corresponding word embeddings. The rows
    are then compared against the type matrix returned by
    ``buildtypevecmatrix`` using ``buildcosinematrix`` (presumably a cosine
    similarity -- confirm against that helper).

    Args:
        trnMentions: Train mentions; each must expose ``name``.
        devMentions: Dev mentions; each must expose ``name``.
        tstMentions: Test mentions; each must expose ``name``.
        t2idx: Forwarded to ``buildtypevecmatrix`` (presumably a type-to-index
            mapping -- verify against that helper).
        dsdir: Output path prefix; ``'_tcwords.h5py'`` is appended to it.
        vectorfile: Embedding file, read once for the word vocabulary and once
            in full for the type vectors.
        upto: Forwarded to both embedding readers.
        max_num_words: Passed to ``get_ngram_seq`` as ``max_len``.

    Returns:
        None. The result is written to ``dsdir + '_tcwords.h5py'`` in a dataset
        named ``'tcwords'``, with train/dev/test row ranges stored in the
        file-level ``'split'`` attribute.
    """
    # Only the word -> index half of the vocabulary is used in this function.
    word_to_idx, _ = build_word_vocab(trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, num=upto)

    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        seq_words = get_ngram_seq(word_to_idx, men.name.split(), max_len=max_num_words)
        num_words = len(seq_words)
        if not num_words:
            # Nothing to average: keep the zero row rather than dividing by
            # zero, which would fill the row with NaNs.
            continue
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= num_words
        input_avg[i] = avgvec

    # Second pass over the embedding file for the type vectors; note that this
    # rebinds ``vectorsize``.
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    # Original author's note on the type matrix: "a matrix with size: 102 * dim".
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)

    # Contiguous [start, stop) row ranges, in the same order the rows were built.
    split_dict = {
        'train': {'tcwords': (0, nsamples_train)},
        'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}

    dsdir += '_tcwords.h5py'
    # The context manager closes the file even if one of the writes raises,
    # so a failure no longer leaks the open handle.
    with h5py.File(dsdir, mode='w') as f:
        features = f.create_dataset('tcwords', words_types_cosin_matrix.shape, dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
        features[...] = words_types_cosin_matrix
        features.dims[0].label = 'words_types_cosine'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
    logger.info('Building types-words cosine (tcwords) dataset finished. It saved in: %s', dsdir)
def build_type_words_cosine_ds(trnMentions,
                               devMentions,
                               tstMentions,
                               t2idx,
                               dsdir,
                               vectorfile,
                               upto=-1,
                               max_num_words=4):
    """Build the name-words-vs-type cosine ('tcwords') matrix and save it as HDF5.

    One row is produced per mention, in train, dev, test order. Each mention
    name is split on whitespace, mapped to word indices by ``get_ngram_seq``,
    and represented by the mean of the corresponding word embeddings. The rows
    are then compared against the type matrix returned by
    ``buildtypevecmatrix`` using ``buildcosinematrix`` (presumably a cosine
    similarity -- confirm against that helper).

    Args:
        trnMentions: Train mentions; each must expose ``name``.
        devMentions: Dev mentions; each must expose ``name``.
        tstMentions: Test mentions; each must expose ``name``.
        t2idx: Forwarded to ``buildtypevecmatrix`` (presumably a type-to-index
            mapping -- verify against that helper).
        dsdir: Output path prefix; ``'_tcwords.h5py'`` is appended to it.
        vectorfile: Embedding file, read once for the word vocabulary and once
            in full for the type vectors.
        upto: Forwarded to both embedding readers.
        max_num_words: Passed to ``get_ngram_seq`` as ``max_len``.

    Returns:
        None. The result is written to ``dsdir + '_tcwords.h5py'`` in a dataset
        named ``'tcwords'``, with train/dev/test row ranges stored in the
        file-level ``'split'`` attribute.
    """
    # Only the word -> index half of the vocabulary is used in this function.
    word_to_idx, _ = build_word_vocab(trnMentions + devMentions +
                                      tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                       vocab=word_to_idx,
                                                       num=upto)

    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        seq_words = get_ngram_seq(word_to_idx,
                                  men.name.split(),
                                  max_len=max_num_words)
        num_words = len(seq_words)
        if not num_words:
            # Nothing to average: keep the zero row rather than dividing by
            # zero, which would fill the row with NaNs.
            continue
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= num_words
        input_avg[i] = avgvec

    # Second pass over the embedding file for the type vectors; note that this
    # rebinds ``vectorsize``.
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    # Original author's note on the type matrix: "a matrix with size: 102 * dim".
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize,
                                       voc2idx)
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)

    # Contiguous [start, stop) row ranges, in the same order the rows were built.
    split_dict = {
        'train': {
            'tcwords': (0, nsamples_train)
        },
        'dev': {
            'tcwords': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'tcwords': (nsamples_train + nsamples_dev, totals)
        }
    }

    dsdir += '_tcwords.h5py'
    # The context manager closes the file even if one of the writes raises,
    # so a failure no longer leaks the open handle.
    with h5py.File(dsdir, mode='w') as f:
        features = f.create_dataset('tcwords',
                                    words_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
        features[...] = words_types_cosin_matrix
        features.dims[0].label = 'words_types_cosine'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
    logger.info(
        'Building types-words cosine (tcwords) dataset finished. It saved in: %s',
        dsdir)