def integerize(data):
    """Integerize a dataset.

    Returns a triple (label alphabet, feature alphabet, integerized
    dataset), where each instance becomes ``(label_id, feature_id_array)``.
    """
    F = Alphabet()
    L = Alphabet()
    I = []
    for label, features in data:
        y = L[label]
        x = fromiter(F.map(features), dtype=int32)
        I.append((y, x))
    return (L, F, I)
class Dataset(object):
    """Container for a train/dev/test split plus frequency indexes
    built from the training fold."""

    def __init__(self, train, dev, test):
        self.train = train
        self.dev = dev
        self.test = test
        # Indexes below are populated by `_index`.
        self.Y = Alphabet()              # tag set
        self.V = Alphabet()              # vocabulary
        self.V_freq = Counter()          # token unigram counts
        self.V2Y = defaultdict(set)      # tag dictionary
        self.prefixes = Counter()
        self.suffixes = Counter()
        self._index(self.train)

    def _index(self, data):
        "frequency tables, etc."
        for sentence in data:
            for tag, token in sentence:
                self.Y.add(tag)
                self.V.add(token)
                self.V2Y[token].add(tag)
                self.V_freq[token] += 1
                for p in prefixes(token):
                    self.prefixes[p] += 1
                for s in suffixes(token):
                    self.suffixes[s] += 1

    def make_instances(self, fold, cls):
        "Convert tuples in data `fold` to instances of `cls`."
        instances = []
        fold_data = getattr(self, fold)
        for sentence in iterview(fold_data, msg='Features (%s)' % fold):
            tags, tokens = zip(*sentence)
            instances.append(cls(tokens, self.Y.map(tags), self))
        return instances

    def tag_ngram_counts(self, n):
        "Returns tag ngram count for subsequences of length n."

        def tag_sequences():
            # Yield each training sentence's tag sequence as stored
            # (i.e. the raw tags, not integerized).
            for sentence in self.train:
                tags, _ = zip(*sentence)
                yield tags

        return ngram_counts(tag_sequences(), n)
def integerize(data):
    """Integerize a dataset.

    Returns a triple (label alphabet, feature alphabet, integerized
    dataset).  If the module-level flag `do_label_count` is set, instead
    prints label frequencies (most frequent first) and exits.
    """
    if do_label_count:
        # NOTE(review): `do_label_count` is presumably a module-level
        # debug flag defined elsewhere in this file -- confirm.
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        print('label count')
        # Use sorted() rather than list.sort() on .items(): on Python 3,
        # dict.items() returns a view with no .sort method, so the
        # original in-place sort would crash there.
        for k, v in sorted(label_count.items(), key=lambda kv: -kv[1]):
            print('%20s => %s' % (k, v))
        sys.exit(0)
    F = Alphabet()   # feature alphabet
    L = Alphabet()   # label alphabet
    I = [(L[label], fromiter(F.map(features), dtype=int32))
         for label, features in data]
    return (L, F, I)
def integerize(data):
    """Integerize a dataset.

    Returns a triple (label alphabet, feature alphabet, integerized
    dataset).  If the module-level flag `do_label_count` is set, instead
    prints label frequencies (most frequent first) and exits.
    """
    if do_label_count:
        # NOTE(review): `do_label_count` is presumably a module-level
        # debug flag defined elsewhere in this file -- confirm.
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        print('label count')
        # Use sorted() rather than list.sort() on .items(): on Python 3,
        # dict.items() returns a view with no .sort method, so the
        # original in-place sort would crash there.
        for k, v in sorted(label_count.items(), key=lambda kv: -kv[1]):
            print('%20s => %s' % (k, v))
        sys.exit(0)
    F = Alphabet()   # feature alphabet
    L = Alphabet()   # label alphabet
    I = [(L[label], fromiter(F.map(features), dtype=int32))
         for label, features in data]
    return (L, F, I)
from arsenal.iterview import progress
from arsenal.terminal import colors
from collections import Counter, defaultdict
from grafl.test import make_model_func
from grafl.dataset.edge_dataset import BWD_dataset

np.set_printoptions(precision=4)

# Relation-label id -> human-readable name.
L = {
    0: 'coordinate',
    1: 'hypernym',
    2: 'hyponym',
}

# Build the synset alphabet from the relation map file, skipping the
# first three lines (header rows, presumably -- the original also
# skipped i <= 2).
A = Alphabet()
rel_map = file('res/bowman_wordnet_longer_shuffled_synset_relations.map')
A.map([line.strip().split()[1]
       for i, line in enumerate(rel_map)
       if i > 2])

tst = BWD_dataset('test').data
trn = BWD_dataset('train').data
trn_x = trn[0]
trn_y = trn[1]

# Set of synset ids observed anywhere in training pairs.
seen = set(trn_x.flatten()) | set(trn_y.flatten())

X, Y, _ = tst
X = list(A.lookup_many(X.flatten()))
Y = list(A.lookup_many(Y.flatten()))
#D = np.array([X,Y,L.flatten()]).T

model_file = 'res/experiments/BWD-projection-Softmax_best.pkl'
#model_file = '/home/timv/Downloads/BWD-projection-identity_sub_glue-Softmax.pkl'
from collections import Counter, defaultdict
from grafl.test import make_model_func
from grafl.dataset.edge_dataset import BWD_dataset

np.set_printoptions(precision=4)

# Relation-label id -> human-readable name.
L = {
    0: 'coordinate',
    1: 'hypernym',
    2: 'hyponym',
}

# Build the synset alphabet from the relation map file, skipping the
# first three lines (header rows, presumably -- the original also
# skipped i <= 2).
A = Alphabet()
rel_map = file('res/bowman_wordnet_longer_shuffled_synset_relations.map')
A.map([line.strip().split()[1]
       for i, line in enumerate(rel_map)
       if i > 2])

tst = BWD_dataset('test').data
trn = BWD_dataset('train').data
trn_x = trn[0]
trn_y = trn[1]

# Set of synset ids observed anywhere in training pairs.
seen = set(trn_x.flatten()) | set(trn_y.flatten())

X, Y, _ = tst
X = list(A.lookup_many(X.flatten()))
Y = list(A.lookup_many(Y.flatten()))
#D = np.array([X,Y,L.flatten()]).T