def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    """Write the type targets of all mentions to an HDF5 file.

    One row is produced per mention, in the order train, dev, test. Row
    ``i`` is the vector returned by ``cmn.convertTargetsToBinVec`` for the
    indices (looked up in ``t2idx``) of the mention's ``alltypes``.

    The file is written to ``dsdir + '_targets.h5py'`` and contains:

    * an int32 dataset ``'targets'`` of shape
      ``(number of mentions, len(t2idx))`` whose first dimension is
      labelled ``'all_types'``;
    * a dataset attribute ``'type_to_ix'`` holding ``yaml.dump(t2idx)``;
    * a file attribute ``'split'`` built by
      ``H5PYDataset.create_split_array`` with the row ranges of the
      ``'train'``, ``'dev'`` and ``'test'`` subsets.

    Args:
        trnMentions: training mentions; each must expose ``alltypes``.
        devMentions: development mentions.
        tstMentions: test mentions.
        t2idx: mapping from type name to column index.
        dsdir: path prefix of the output file.

    Note:
        The three mention collections are concatenated with ``+``, so they
        must be of the same sequence type (presumably lists -- confirm
        against callers).
    """
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)
    num_types = len(t2idx)

    # Build the full target matrix in memory before touching the disk.
    targets_m = numpy.zeros(shape=(totals, num_types), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, num_types)

    # Half-open row ranges of each subset inside the 'targets' dataset.
    split_dict = {
        'train': {'targets': (0, nsamples_train)},
        'dev': {'targets': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'targets': (nsamples_train + nsamples_dev, totals)},
    }

    hdf5_file = dsdir + '_targets.h5py'
    # The context manager closes the file even if one of the writes below
    # raises, so the handle is never leaked.
    with h5py.File(hdf5_file, mode='w') as f:
        targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
        targets.attrs['type_to_ix'] = yaml.dump(t2idx)
        targets[...] = targets_m
        targets.dims[0].label = 'all_types'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    """Write the type targets of all mentions to an HDF5 file.

    One row is produced per mention, in the order train, dev, test. Row
    ``i`` is the vector returned by ``cmn.convertTargetsToBinVec`` for the
    indices (looked up in ``t2idx``) of the mention's ``alltypes``.

    The file is written to ``dsdir + '_targets.h5py'`` and contains:

    * an int32 dataset ``'targets'`` of shape
      ``(number of mentions, len(t2idx))`` whose first dimension is
      labelled ``'all_types'``;
    * a dataset attribute ``'type_to_ix'`` holding ``yaml.dump(t2idx)``;
    * a file attribute ``'split'`` built by
      ``H5PYDataset.create_split_array`` with the row ranges of the
      ``'train'``, ``'dev'`` and ``'test'`` subsets.

    Args:
        trnMentions: training mentions; each must expose ``alltypes``.
        devMentions: development mentions.
        tstMentions: test mentions.
        t2idx: mapping from type name to column index.
        dsdir: path prefix of the output file.

    Note:
        The three mention collections are concatenated with ``+``, so they
        must be of the same sequence type (presumably lists -- confirm
        against callers).
    """
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)
    num_types = len(t2idx)

    # Build the full target matrix in memory before touching the disk.
    targets_m = numpy.zeros(shape=(totals, num_types), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, num_types)

    # Half-open row ranges of each subset inside the 'targets' dataset.
    split_dict = {
        'train': {'targets': (0, nsamples_train)},
        'dev': {'targets': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'targets': (nsamples_train + nsamples_dev, totals)},
    }

    hdf5_file = dsdir + '_targets.h5py'
    # The context manager closes the file even if one of the writes below
    # raises, so the handle is never leaked.
    with h5py.File(hdf5_file, mode='w') as f:
        targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
        targets.attrs['type_to_ix'] = yaml.dump(t2idx)
        targets[...] = targets_m
        targets.dims[0].label = 'all_types'
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
def build_type_patterns(trnMentions, t2idx, dsdir, vectorfile, upto=-1):
    """Save a matrix of the distinct type patterns seen in training.

    For every training mention, the indices of its ``alltypes`` are turned
    into a vector by ``cmn.convertTargetsToBinVec``. Identical vectors are
    counted, and the distinct vectors are ordered by ascending frequency.
    Each one becomes a float32 row of the output matrix, multiplied by
    ``sqrt(6 / (len(row) + len(t2idx)))``.

    The matrix is saved with ``numpy.save`` to
    ``dsdir + '_typeCooccurrMatrix.npy'`` and its number of rows is
    printed.

    Args:
        trnMentions: training mentions; each must expose ``alltypes``.
        t2idx: mapping from type name to column index.
        dsdir: path prefix of the output file.
        vectorfile: unused; kept for interface compatibility.
        upto: unused; kept for interface compatibility.
    """
    out_path = dsdir + '_typeCooccurrMatrix.npy'
    num_types = len(t2idx)

    # Key each pattern by its space-joined vector so it is hashable.
    pattern2freq = defaultdict(int)
    for men in trnMentions:
        type_ids = [t2idx[t] for t in men.alltypes]
        bin_vec = cmn.convertTargetsToBinVec(type_ids, num_types)
        pattern2freq[' '.join([str(v) for v in bin_vec])] += 1

    # Least frequent patterns first; the counts are used only for ordering.
    sorted_p2f = sorted(pattern2freq.items(), key=operator.itemgetter(1))

    label_cooccur_matrix = numpy.zeros((len(sorted_p2f), num_types),
                                       dtype='float32')
    for row, (key, _freq) in enumerate(sorted_p2f):
        vec = numpy.asarray([int(p) for p in key.split(' ')]).astype('float32')
        # NOTE(review): this factor resembles the Glorot/Xavier uniform
        # bound sqrt(6 / (fan_in + fan_out)) -- confirm the intent.
        vec *= numpy.sqrt(6. / (len(vec) + num_types))
        label_cooccur_matrix[row] = vec

    # Call form prints the same text on Python 2 and also parses on Python 3.
    print(len(label_cooccur_matrix))
    numpy.save(out_path, label_cooccur_matrix)
def build_type_patterns(trnMentions, t2idx, dsdir, vectorfile, upto=-1):
    """Save a matrix of the distinct type patterns seen in training.

    For every training mention, the indices of its ``alltypes`` are turned
    into a vector by ``cmn.convertTargetsToBinVec``. Identical vectors are
    counted, and the distinct vectors are ordered by ascending frequency.
    Each one becomes a float32 row of the output matrix, multiplied by
    ``sqrt(6 / (len(row) + len(t2idx)))``.

    The matrix is saved with ``numpy.save`` to
    ``dsdir + '_typeCooccurrMatrix.npy'`` and its number of rows is
    printed.

    Args:
        trnMentions: training mentions; each must expose ``alltypes``.
        t2idx: mapping from type name to column index.
        dsdir: path prefix of the output file.
        vectorfile: unused; kept for interface compatibility.
        upto: unused; kept for interface compatibility.
    """
    out_path = dsdir + '_typeCooccurrMatrix.npy'
    num_types = len(t2idx)

    # Key each pattern by its space-joined vector so it is hashable.
    pattern2freq = defaultdict(int)
    for men in trnMentions:
        type_ids = [t2idx[t] for t in men.alltypes]
        bin_vec = cmn.convertTargetsToBinVec(type_ids, num_types)
        pattern2freq[' '.join([str(v) for v in bin_vec])] += 1

    # Least frequent patterns first; the counts are used only for ordering.
    sorted_p2f = sorted(pattern2freq.items(), key=operator.itemgetter(1))

    label_cooccur_matrix = numpy.zeros((len(sorted_p2f), num_types),
                                       dtype='float32')
    for row, (key, _freq) in enumerate(sorted_p2f):
        vec = numpy.asarray([int(p) for p in key.split(' ')]).astype('float32')
        # NOTE(review): this factor resembles the Glorot/Xavier uniform
        # bound sqrt(6 / (fan_in + fan_out)) -- confirm the intent.
        vec *= numpy.sqrt(6. / (len(vec) + num_types))
        label_cooccur_matrix[row] = vec

    # Call form prints the same text on Python 2 and also parses on Python 3.
    print(len(label_cooccur_matrix))
    numpy.save(out_path, label_cooccur_matrix)