def __getitem__(self, idx):
    """Return batch number ``idx`` as a ``(data, labels)`` pair.

    The batch covers rows ``idx * batch_size`` up to, but not including,
    ``min(size, (idx + 1) * batch_size)`` of ``self.df``, so the last batch
    may hold fewer than ``batch_size`` rows.

    Args:
        idx: Zero-based batch index.

    Returns:
        tuple: ``(data, labels)`` where

        * ``data`` is a float32 array of shape
          ``(rows, len(self.gos_dict) + 53)``: one column per term in
          ``self.gos_dict`` followed by the 53 expression values of the row;
        * ``labels`` is an int32 array of shape
          ``(rows, len(self.terms_dict))`` with a 1 for every entry of
          ``hp_annotations`` that is a key of ``self.terms_dict``.
    """
    batch_index = np.arange(
        idx * self.batch_size,
        min(self.size, (idx + 1) * self.batch_size))
    df = self.df.iloc[batch_index]
    n_rows = len(df)

    # Only term scores and expression values are returned, so no per-row
    # sequence encoding is built here.
    data_gos = np.zeros((n_rows, len(self.gos_dict)), dtype=np.float32)
    data_exp = np.zeros((n_rows, 53), dtype=np.float32)
    labels = np.zeros((n_rows, len(self.terms_dict)), dtype=np.int32)

    for i, row in enumerate(df.itertuples()):
        data_exp[i, :] = row.expressions

        # Entries look like "<term id>|<score>"; the score is stored first...
        for item in row.deepgo_annotations:
            t_id, score = item.split('|')
            if t_id in self.gos_dict:
                data_gos[i, self.gos_dict[t_id]] = float(score)

        # ...and is then overwritten with 1 for any term that also appears
        # in the IEA or GO annotation lists.
        for annotations in (row.iea_annotations, row.go_annotations):
            for t_id in annotations:
                if t_id in self.gos_dict:
                    data_gos[i, self.gos_dict[t_id]] = 1

        for t_id in row.hp_annotations:
            if t_id in self.terms_dict:
                labels[i, self.terms_dict[t_id]] = 1

    data = np.concatenate([data_gos, data_exp], axis=1)
    return (data, labels)
def __getitem__(self, idx):
    """Return batch number ``idx`` as ``(data_onehot, labels)``.

    The batch covers rows ``idx * batch_size`` up to, but not including,
    ``min(size, (idx + 1) * batch_size)`` of ``self.df``, so the last batch
    may hold fewer than ``batch_size`` rows.

    Args:
        idx: Zero-based batch index.

    Returns:
        tuple: ``(data_onehot, labels)`` where

        * ``data_onehot`` is a float32 array of shape ``(rows, MAXLEN, 21)``
          filled with ``to_onehot(row.sequences)`` for each row;
        * ``labels`` is an int32 array of shape ``(rows, self.nb_classes)``
          with a 1 for every entry of ``prop_annotations`` that is a key of
          ``self.terms_dict``.
    """
    batch_index = np.arange(
        idx * self.batch_size,
        min(self.size, (idx + 1) * self.batch_size))
    df = self.df.iloc[batch_index]
    n_rows = len(df)

    data_onehot = np.zeros((n_rows, MAXLEN, 21), dtype=np.float32)
    labels = np.zeros((n_rows, self.nb_classes), dtype=np.int32)

    for i, row in enumerate(df.itertuples()):
        data_onehot[i, :, :] = to_onehot(row.sequences)
        for t_id in row.prop_annotations:
            if t_id in self.terms_dict:
                labels[i, self.terms_dict[t_id]] = 1

    # NOTE(review): the batch is selected from ``idx`` alone, so this cursor
    # update does not affect the result; it is kept because other code may
    # still read ``self.start`` - confirm before removing.
    self.start += self.batch_size
    return (data_onehot, labels)
def next(self):
    """Return the next ``(data_onehot, labels)`` batch, wrapping at the end.

    Batches are read starting at ``self.start``; once ``self.start`` has
    reached ``self.size`` the generator calls ``self.reset()`` and serves
    from wherever that leaves the cursor. The rewind is a single,
    non-recursive step, so an empty data set no longer recurses without
    bound.

    Returns:
        tuple: ``(data_onehot, labels)`` where

        * ``data_onehot`` is an int32 array of shape ``(rows, MAXLEN, 21)``
          filled with ``to_onehot(row.sequences)`` for each row;
        * ``labels`` is an int32 array of shape ``(rows, self.nb_classes)``
          with a 1 for every entry of ``prop_annotations`` that is a key of
          ``self.terms_dict``.

    Raises:
        StopIteration: If there is still nothing to serve after
            ``self.reset()`` (for example when ``self.size`` is 0).
    """
    if self.start >= self.size:
        self.reset()
        if self.start >= self.size:
            # Nothing can ever be produced; fail fast instead of looping.
            raise StopIteration

    batch_index = np.arange(
        self.start, min(self.size, self.start + self.batch_size))
    df = self.df.iloc[batch_index]
    n_rows = len(df)

    # NOTE(review): int32 here, while the other one-hot batch builders in
    # this file allocate float32; left unchanged to keep the returned dtype.
    data_onehot = np.zeros((n_rows, MAXLEN, 21), dtype=np.int32)
    labels = np.zeros((n_rows, self.nb_classes), dtype=np.int32)

    for i, row in enumerate(df.itertuples()):
        data_onehot[i, :, :] = to_onehot(row.sequences)
        for t_id in row.prop_annotations:
            if t_id in self.terms_dict:
                labels[i, self.terms_dict[t_id]] = 1

    self.start += self.batch_size
    return (data_onehot, labels)
def get_data(sequences):
    """Cut sequences into windows of at most MAXLEN and one-hot encode them.

    A sequence no longer than ``MAXLEN`` is kept whole. A longer one is cut
    into windows of up to ``MAXLEN`` characters whose start positions advance
    by ``MAXLEN - 128``, so consecutive windows overlap by 128 characters.

    Args:
        sequences: Iterable of sequences supporting ``len()`` and slicing.

    Returns:
        tuple: ``(ids, data)`` where ``ids[k]`` is the position in
        ``sequences`` of the sequence that window ``k`` was taken from, and
        ``data`` is a float32 array of shape ``(len(ids), MAXLEN, 21)``
        holding ``to_onehot`` of each window.
    """
    windows = []
    owners = []
    for seq_pos, sequence in enumerate(sequences):
        if len(sequence) <= MAXLEN:
            windows.append(sequence)
            owners.append(seq_pos)
            continue

        offset = 0
        while offset < len(sequence):
            windows.append(sequence[offset: offset + MAXLEN])
            owners.append(seq_pos)
            offset += MAXLEN - 128

    data = np.zeros((len(windows), MAXLEN, 21), dtype=np.float32)
    for row, window in enumerate(windows):
        data[row, :, :] = to_onehot(window)
    return owners, data