def __init__(self, filename):
    """Load citations from an SGML file and split them into train/dev sets.

    Every record read from ``filename`` via ``fromSGML`` is materialized
    into a list and shuffled in place. The first fifth of the shuffled
    list becomes the dev split, the remainder becomes the train split,
    and the test split is left empty. After the base class is
    initialized, ``self.train`` and ``self.dev`` are overwritten with
    the result of ``self.make_instances`` for the matching split name.

    Args:
        filename: Passed straight through to ``fromSGML``.
    """
    # NOTE(review): presumably the label alphabet used while building
    # instances -- confirm against make_instances.
    self.Y = Alphabet()

    records = list(
        fromSGML(filename, linegrouper="<NEW.*?>", bioencoding=False))
    # Uses NumPy's global RNG, so the split changes between runs unless
    # the caller seeds np.random beforehand.
    np.random.shuffle(records)

    n_dev = len(records) // 5
    super(CoraCitations, self).__init__(train=records[n_dev:],
                                        dev=records[:n_dev],
                                        test=[])

    # Replace the raw splits with the instances built from them.
    for split in ('train', 'dev'):
        setattr(self, split, self.make_instances(split, Instance))
def get_data(f):
    """Yield one ``Instance`` per group read from the SGML source ``f``.

    Each group produced by ``fromSGML`` is iterated as a sequence of
    two-item pairs. The second item of every pair is wrapped in a
    ``Token``; the first item is collected unchanged and handed to
    ``Instance`` as ``truth``. ``preprocessing`` is called on the tuple
    of tokens before the instance is built.

    Args:
        f: Passed straight through to ``fromSGML``.

    Yields:
        ``Instance(tokens, truth=labels)`` where both arguments are tuples
        of equal length.

    Raises:
        ValueError: If a group is empty, because unpacking ``zip(*[])``
            into two names fails.
    """
    for group in fromSGML(f, linegrouper="<NEW.*?>", bioencoding=False):
        # Walk the group exactly once, in case it is a one-shot iterator.
        pairs = [(Token(word), label) for label, word in group]
        tokens, labels = zip(*pairs)
        # Return value is ignored -- presumably this annotates the tokens
        # in place; verify against the definition of preprocessing.
        preprocessing(tokens)
        yield Instance(tokens, truth=labels)