Example #1
def build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, use_lowercase=False, max_num_words=10, upto=None):
    if vectorfile is None:
        return
    word_to_idx, idx_to_word = build_word_vocab(trnMentions+devMentions+tstMentions)  # build the word vocabulary from all mention names
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_words = numpy.zeros(shape=(totals, max_num_words), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        input_words[i] = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
    logger.info('shape of subwords dataset: %s', input_words.shape)
    hdf5_file = dsdir + '_subwords.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('subwords', input_words.shape, dtype='int32')  # @UndefinedVariable
    
    features.attrs['voc2idx'] = yaml.dump(word_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_words
    features.dims[0].label = 'words'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'subwords': (0, nsamples_train)},
        'dev': {'subwords': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'subwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building subwords dataset finished. It is saved in: %s', hdf5_file)
    logger.info('writing subword embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    with h5py.File(dsdir + "_subwords_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
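
This builder (and the ones below) delegates padding and truncation to get_ngram_seq, which is defined elsewhere in the repo. A minimal sketch of the assumed contract (look up each token, reserve index 0 for padding and unknowns, clip or pad to max_len); the real helper may handle OOV tokens differently:

def get_ngram_seq(voc2idx, tokens, max_len):
    # Map each token to its vocabulary index; unseen tokens fall
    # back to 0, which doubles as the padding index.
    seq = [voc2idx.get(tok, 0) for tok in tokens[:max_len]]
    return seq + [0] * (max_len - len(seq))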
Example #2
def build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile=None, max_len_name=30):
    char_to_idx, idx_to_char = build_char_vocab(trnMentions)  # characters come from entity names only, so the training mentions suffice
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_letters = numpy.zeros(shape=(totals, max_len_name), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        input_letters[i] = get_ngram_seq(char_to_idx, name, max_len_name)
    logger.info('shape of letters dataset: %s', input_letters.shape)
    fuelfile = dsdir +'_letters.h5py'
    f = h5py.File(fuelfile, mode='w')
    features = f.create_dataset('letters', input_letters.shape, dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(char_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_char, default_flow_style=False)
    features.attrs['vocabsize'] = len(char_to_idx)
    features[...] = input_letters
    features.dims[0].label = 'letters'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'letters': (0, nsamples_train)},
        'dev': {'letters': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'letters': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building letters dataset finished. It is saved in: %s', fuelfile)
    if vectorfile is None: 
        return
    embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=char_to_idx, num=-1)
    logger.info('size of embedding matrix to save is: (%d, %d)', embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_letters_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
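
The split attribute written above is what lets Fuel serve each partition. A minimal read-back sketch using the standard Fuel H5PYDataset API (the path is whatever dsdir + '_letters.h5py' resolved to above):

from fuel.datasets import H5PYDataset

train_set = H5PYDataset(dsdir + '_letters.h5py', which_sets=('train',))
handle = train_set.open()
(letters,) = train_set.get_data(handle, slice(0, 10))  # first 10 letter sequences
train_set.close(handle)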
Example #3
def build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path, t2idx, dsdir, vectorfile, use_lowercase=True, upto=None):
    if ent2tfidf_features_path is None:
        logger.warning('ignoring tfidf features building...')
        return
    ent2features = load_ent2features(ent2tfidf_features_path)
    word_to_idx, idx_to_word = build_voc_from_features(ent2features)
    logger.info('tfidf desc features vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_features = numpy.zeros(shape=(totals, len(next(iter(ent2features.values())))), dtype='int32')
    ent_no_emb = 0
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        if men.entityId not in ent2features:
            ent_no_emb += 1
            continue
        features = ent2features[men.entityId]
        input_features[i] = get_ngram_seq(word_to_idx, features, max_len=input_features.shape[1])
    logger.info('shape of tfidf input dataset: %s', input_features.shape)
    logger.info('number of entities without embeddings: %d', ent_no_emb)
    hdf5_file = dsdir + '_desc_features.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('desc_features', input_features.shape, dtype='int32')  # @UndefinedVariable
    
    features.attrs['voc2idx'] = yaml.dump(word_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_features
    features.dims[0].label = 'description_features'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'desc_features': (0, nsamples_train)},
        'dev': {'desc_features': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'desc_features': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    
    logger.info('Building desc_features dataset finished. It is saved in: %s', hdf5_file)
    logger.info('writing word embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    print "embeddings shape: ", idx2embeddings.shape
    with h5py.File(dsdir + "_desc_features_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
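
The companion *_embeddings.h5py file aligns row i of 'vectors' with vocabulary index i from 'voc2idx'. A minimal sketch of loading it back with plain h5py (no Fuel needed, since the file carries no split metadata):

import h5py

with h5py.File(dsdir + '_desc_features_embeddings.h5py', mode='r') as fp:
    embeddings = fp['vectors'][...]  # shape: (vocabsize, vectorsize)
    vectorsize = fp['vectors'].attrs['vectorsize']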
Example #4
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, upto=-1, max_num_words=4):
    word_to_idx, idx_to_word = build_word_vocab(trnMentions+devMentions+tstMentions)  # build the word vocabulary from all mention names
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, num=upto)
    
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        avgvec = numpy.zeros(shape=(vectorsize,))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec
    
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)  # one row per type: shape (num_types, dim), here 102 x dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info('shape of words-types cosine matrix: %s', words_types_cosin_matrix.shape)
     
    dsdir += '_tcwords.h5py'
    f = h5py.File(dsdir, mode='w')
    features = f.create_dataset('tcwords', words_types_cosin_matrix.shape, dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'tcwords': (0, nsamples_train)},
        'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}    
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-words cosine (tcwords) dataset finished. It is saved in: %s', dsdir)
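
buildtypevecmatrix and buildcosinematrix are repo helpers whose bodies are not shown here. A hypothetical sketch of the cosine step, assuming it computes pairwise cosine similarity between every averaged name vector and every type vector (row-normalize both matrices, then a single matrix product):

import numpy

def buildcosinematrix(vectors, typevecmatrix, eps=1e-8):
    # Normalize each row to unit length; eps guards all-zero rows.
    a = vectors / (numpy.linalg.norm(vectors, axis=1, keepdims=True) + eps)
    b = typevecmatrix / (numpy.linalg.norm(typevecmatrix, axis=1, keepdims=True) + eps)
    # Dot products of unit vectors are exactly their cosine similarities.
    return numpy.dot(a, b.T).astype('float32')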
Example #5
def build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, ngram, max_num_ngrams=98, upto=-1):
    ngram_to_idx, idx_to_word, name2ngrams = build_ngram_vocab(trnMentions+devMentions+tstMentions, ngram=ngram, MIN_FREQ=5)  # build the n-gram vocabulary from all mention names
    logger.info('ngram%d vocab size: %d', ngram, len(ngram_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_words = numpy.zeros(shape=(totals, max_num_ngrams), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        ngrams = name2ngrams[name]
        input_words[i] = get_ngram_seq(ngram_to_idx, ngrams, max_len=max_num_ngrams)
    logger.info('shape of ngram dataset: %s', input_words.shape)
    ngram_label = 'ngrams' + str(ngram)
    hdf5_file = dsdir + '_ngrams'+str(ngram)+'.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset(ngram_label, input_words.shape, dtype='int32')  # @UndefinedVariable
    
    features.attrs['voc2idx'] = yaml.dump(ngram_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(ngram_to_idx)
    features[...] = input_words
    features.dims[0].label = ngram_label
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {ngram_label: (0, nsamples_train)},
        'dev': {ngram_label: (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {ngram_label: (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building ngram%d dataset finished. It is saved in: %s', ngram, hdf5_file)
    if vectorfile is None or vectorfile == '': 
        return
    logger.info('Now, writing ngram embeddings')
    embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=ngram_to_idx, num=upto)
    logger.info('size of embedding matrix to save is: (%d, %d)', embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_" + ngram_label + "_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize        
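
build_ngram_vocab is also defined elsewhere; the call site suggests it extracts character n-grams from each mention name, keeps those seen at least MIN_FREQ times, and caches each name's n-gram list. A hypothetical reconstruction under those assumptions (the real helper may add boundary markers or index differently):

from collections import Counter

def build_ngram_vocab(mentions, ngram, MIN_FREQ=5):
    counts = Counter()
    name2ngrams = {}
    for men in mentions:
        # sliding-window character n-grams, e.g. 'cat' with ngram=2 -> ['ca', 'at']
        grams = [men.name[i:i + ngram] for i in range(len(men.name) - ngram + 1)]
        name2ngrams[men.name] = grams
        counts.update(grams)
    kept = [g for g, c in counts.items() if c >= MIN_FREQ]
    # index 0 stays free for padding/unknown n-grams
    ngram_to_idx = {g: i + 1 for i, g in enumerate(kept)}
    idx_to_ngram = {i: g for g, i in ngram_to_idx.items()}
    return ngram_to_idx, idx_to_ngram, name2ngrams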