Ejemplo n.º 1
0
def violin_plot(dataset):
    """Draw and save a violin plot of per-gene normalised localization values.

    Every gene's ``dist`` vector is divided by its own sum, one violin is drawn
    per localization column, and the figure is written to
    ``Graph/violin_<dataset>.png``.

    Args:
        dataset: Dataset name, either ``"cefra-seq"`` or ``"apex-rip"``.

    Raises:
        RuntimeError: If ``dataset`` is not one of the supported names.
    """
    import os

    # Validate the dataset name up front so an unsupported value fails before
    # any data is loaded or any figure is created.
    if dataset == "cefra-seq":
        locations = ["cytosol", "insoluble", "membrane", "nucleus"]
    elif dataset == "apex-rip":
        locations = ['KDEL', 'Mito', 'NES', 'NLS']
    else:
        raise RuntimeError('No such dataset')

    data = Gene_Wrapper.seq_data_loader(False, dataset)
    # Normalise each distribution so that its entries sum to one.
    y = np.array([np.array(gene.dist) / sum(gene.dist) for gene in data])

    def set_axis_style(ax, labels):
        """Label the x axis with one tick per violin and name the y axis."""
        ax.get_xaxis().set_tick_params(direction='out')
        ax.xaxis.set_ticks_position('bottom')
        # Violins are drawn at positions 1..len(labels).
        ax.set_xticks(np.arange(1, len(labels) + 1))
        ax.set_xticklabels(labels)
        ax.set_xlim(0.25, len(labels) + 0.75)
        ax.set_ylabel('Localization values')

    fig = plt.figure(figsize=(12, 12))
    plt.violinplot([y[:, i] for i in range(len(locations))], showmeans=True)
    ax = fig.axes[0]
    set_axis_style(ax, locations)
    plt.xticks(rotation=-20)

    # savefig does not create missing directories.
    os.makedirs('Graph', exist_ok=True)
    plt.savefig('Graph/violin_{}.png'.format(dataset))
Ejemplo n.º 2
0
def preprocess_data(dataset):
    """Load a dataset and turn it into padded index arrays plus CV folds.

    Genes are loaded with the annotation flag set, bounds ``0`` and ``4000``
    and ``permute=False``.  Sequence and annotation characters are upper-cased
    and replaced by their position in ``seq_encoding_keys`` /
    ``annotation_encoding_keys``; both are padded to length 4000 with the
    index of ``'UNK'``.

    Args:
        dataset: Dataset identifier forwarded to the loader.

    Returns:
        Tuple ``(X_seq, X_ann, y, folds)`` where ``folds`` is the result of
        ``KFold(n_splits=10, shuffle=True, random_state=1234).split(X_seq, y)``.
    """
    max_length = 4000
    genes = Gene_Wrapper.seq_data_loader(True,
                                         dataset,
                                         0,
                                         max_length,
                                         permute=False)

    def to_indices(symbols, vocabulary):
        # Position of each upper-cased symbol inside the vocabulary list.
        return [vocabulary.index(symbol.upper()) for symbol in symbols]

    sequence_rows = [to_indices(gene.seq, seq_encoding_keys) for gene in genes]
    X_seq = pad_sequences(sequence_rows,
                          maxlen=max_length,
                          dtype=np.int8,
                          value=seq_encoding_keys.index('UNK'))

    annotation_rows = [
        to_indices(gene.ann, annotation_encoding_keys) for gene in genes
    ]
    X_ann = pad_sequences(annotation_rows,
                          maxlen=max_length,
                          dtype=np.int8,
                          value=annotation_encoding_keys.index('UNK'))

    y = np.array([label_dist(gene.dist) for gene in genes])

    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=10, shuffle=True, random_state=1234)
    return X_seq, X_ann, y, splitter.split(X_seq, y)
Ejemplo n.º 3
0
def preprocess_data(left, right, dataset):
    """Encode the left/right sequence parts of every gene and build a mask.

    The left part is padded after the sequence up to ``left`` positions and
    the right part is padded before the sequence up to ``right`` positions;
    both use the index of ``'UNK'`` as the padding value.  The two parts are
    concatenated along the last axis, and a float32 mask with ones on real
    positions and zeros on padding is built in the same layout.

    Args:
        left: Target length of the left part.
        right: Target length of the right part.
        dataset: Dataset identifier forwarded to ``Gene_Wrapper.load_sequence``.

    Returns:
        Tuple ``(X, y, mask_label, encoding_keys, encoding_vectors)``.
    """
    genes = Gene_Wrapper.load_sequence(dataset, left, right)

    print('padding and indexing data')
    encoding_keys = seq_encoding_keys
    encoding_vectors = seq_encoding_vectors
    pad_index = encoding_keys.index('UNK')

    def to_indices(symbols):
        # Position of every symbol inside the encoding key list.
        return [encoding_keys.index(symbol) for symbol in symbols]

    # Left part: real symbols first, padding appended afterwards.
    X_left = pad_sequences([to_indices(gene.seqleft) for gene in genes],
                           maxlen=left,
                           dtype=np.int8,
                           value=pad_index,
                           padding='post')

    # Right part: padding first, real symbols at the end.
    X_right = pad_sequences([to_indices(gene.seqright) for gene in genes],
                            maxlen=right,
                            dtype=np.int8,
                            value=pad_index,
                            padding='pre')

    print(f"X_left shape is {X_left.shape!s}")
    print(f"X_right shape is {X_right.shape!s}")
    X = np.concatenate([X_left, X_right], axis=-1)
    print(f"X shape is {X.shape!s}")
    y = np.array([label_dist(gene.dist) for gene in genes])

    def left_mask(gene):
        # Ones over the real symbols, zeros over the trailing padding.
        used = len(gene.seqleft)
        return np.concatenate([np.ones(used), np.zeros(left - used)])

    def right_mask(gene):
        # Zeros over the leading padding, ones over the real symbols.
        used = len(gene.seqright)
        return np.concatenate([np.zeros(right - used), np.ones(used)])

    mask_label_left = np.array([left_mask(gene) for gene in genes],
                               dtype='float32')
    mask_label_right = np.array([right_mask(gene) for gene in genes],
                                dtype='float32')
    mask_label = np.concatenate([mask_label_left, mask_label_right], axis=-1)

    print(f"training shapes{X.shape!s} {y.shape!s}")
    print(f"Example y is {y[0, :]!s}")
    return X, y, mask_label, encoding_keys, encoding_vectors
Ejemplo n.º 4
0
    return np.array(dist) / np.sum(dist)


# Nucleotide alphabet: each symbol maps to a 4-dimensional vector.  'UNK' is
# all zeros and 'N' spreads its weight evenly over the four bases.
encoding_seq = OrderedDict([
    ('UNK', [0, 0, 0, 0]),
    ('A', [1, 0, 0, 0]),
    ('C', [0, 1, 0, 0]),
    ('G', [0, 0, 1, 0]),
    ('T', [0, 0, 0, 1]),
    ('N', [0.25, 0.25, 0.25, 0.25]),  # A or C or G or T
])
# Symbol list (a symbol's position is its integer index) and the matching
# matrix of encoding vectors, in the same order.
encoding_keys = list(encoding_seq.keys())
encoding_vectors = np.array(list(encoding_seq.values()))
# Identity mapping for A/C/G; 'T' is mapped to 'U'.
reverse_mapping = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'U'}

# Upper bound of np.inf: sequences are loaded without a length cap.
gene_data = Gene_Wrapper.seq_data_loader(False, 'cefra-seq', 0, np.inf)

# The rows have different lengths because nothing is clipped or padded.
# NumPy >= 1.24 refuses to build such a ragged array implicitly, so the
# object dtype is requested explicitly (one index array per gene).
X = np.array(
    [
        np.array([encoding_keys.index(c) for c in gene.seq])
        for gene in gene_data
    ],
    dtype=object,
)
Y = np.array([label_dist(gene.dist) for gene in gene_data])


def cnn_bilstm_model(pooling_size=3,
                     nb_filters=32,
                     filters_length=10,
                     lstm_units=32,
                     attention_size=50):
    '''build model'''
    input = Input(shape=(None, ), dtype='int8')
    embedding_layer = Embedding(len(encoding_vectors),
Ejemplo n.º 5
0
# Structure-annotation alphabet: one 6-dimensional one-hot vector per symbol,
# with an all-zero vector reserved for padding.
encoding_annotation = OrderedDict()
encoding_annotation['UNK'] = [0, 0, 0, 0, 0, 0]  # padding
encoding_annotation['f'] = [1, 0, 0, 0, 0, 0]  # dangling start
encoding_annotation['t'] = [0, 1, 0, 0, 0, 0]  # dangling end
encoding_annotation['i'] = [0, 0, 1, 0, 0, 0]  # internal loop
encoding_annotation['h'] = [0, 0, 0, 1, 0, 0]  # hairpin loop
encoding_annotation['m'] = [0, 0, 0, 0, 1, 0]  # multi loop
encoding_annotation['s'] = [0, 0, 0, 0, 0, 1]  # stem

# Key lists (a symbol's position is its integer index) and the matching
# encoding matrices for both alphabets.
seq_encoding_keys = [*encoding_seq.keys()]
seq_encoding_vectors = np.array([*encoding_seq.values()])
annotation_encoding_keys = [*encoding_annotation.keys()]
annotation_encoding_vectors = np.array([*encoding_annotation.values()])

gene_data = Gene_Wrapper.seq_data_loader(False, False, 0, 4000)
encoding_keys = seq_encoding_keys
encoding_vectors = seq_encoding_vectors
# Replace every symbol by its index and pad each row to 4000 positions with
# the index of 'UNK'.
X = pad_sequences(
    [list(map(encoding_keys.index, gene.seq)) for gene in gene_data],
    maxlen=4000,
    dtype=np.int8,
    value=encoding_keys.index('UNK'),
)
y = np.array([label_dist(gene.dist) for gene in gene_data])
ids = np.array([gene.id for gene in gene_data])
# Sequence lengths before padding.
true_length = np.array([len(gene.seq) for gene in gene_data])

if args.saved_expr == "":
    print('New experiment')
    OUTPATH = os.path.join(
        basedir, 'Results', 'cefra-seq', 'SGDModel-10foldcv',
Ejemplo n.º 6
0
def preprocess_data(lower_bound,
                    upper_bound,
                    use_annotations,
                    dataset,
                    max_len,
                    randomization_test=False):
    """Load genes, index-encode them, pad to ``max_len`` and create CV folds.

    Without annotations every sequence symbol is replaced by its position in
    ``seq_encoding_keys``.  With annotations a combined vocabulary of
    (base, annotation) pairs is built and every position is encoded by the
    index of its upper-cased pair.  Padding uses the index of ``'UNK'``.

    As a side effect the module-level ``gene_ids`` array is (re)assigned.

    Args:
        lower_bound: Forwarded to ``Gene_Wrapper.seq_data_loader``.
        upper_bound: Forwarded to ``Gene_Wrapper.seq_data_loader``.
        use_annotations: Selects the combined sequence/annotation encoding.
        dataset: Dataset identifier forwarded to the loader.
        max_len: Length every encoded row is padded to.
        randomization_test: Forwarded to the loader as ``permute``.

    Returns:
        Tuple ``(X, y, folds, encoding_keys, encoding_vectors)`` where
        ``folds`` is the result of
        ``KFold(n_splits=10, shuffle=True, random_state=1234).split(X, y)``.
    """
    genes = Gene_Wrapper.seq_data_loader(use_annotations,
                                         dataset,
                                         lower_bound,
                                         upper_bound,
                                         permute=randomization_test)

    print('padding and indexing data')
    if not use_annotations:
        encoding_keys = seq_encoding_keys
        encoding_vectors = seq_encoding_vectors
        token_rows = [[encoding_keys.index(base) for base in gene.seq]
                      for gene in genes]
    else:
        print(
            'Using unified one-hot encoding for both sequence and annotation features'
        )
        # Unified encoding scheme: one slot per (base, annotation) pair.
        bases = ['A', 'C', 'G', 'T']
        annotations = ['F', 'T', 'I', 'H', 'M', 'S']
        width = 24  # dim([a,c,g,t]) * dim([f,t,i,h,m,s])

        combined_encoding = OrderedDict()
        combined_encoding['UNK'] = [0] * width

        # Plain one-hot vectors for the concrete bases, in product order.
        slot = 0
        for base in bases:
            for annotation in annotations:
                one_hot = [0] * width
                one_hot[slot] = 1
                combined_encoding[base + annotation] = one_hot
                slot += 1

        # 'N' spreads 0.25 over the slots of the four concrete bases that
        # share the same annotation.
        for annotation in annotations:
            blended = [0] * width
            for base in bases:
                blended[combined_encoding[base + annotation].index(1)] = 0.25
            combined_encoding['N' + annotation] = blended

        encoding_keys = list(combined_encoding.keys())
        encoding_vectors = np.array(list(combined_encoding.values()))

        print('padding and indexing data')
        token_rows = [[
            encoding_keys.index(base.upper() + annotation.upper())
            for base, annotation in zip(gene.seq, gene.ann)
        ] for gene in genes]

    X = pad_sequences(token_rows,
                      maxlen=max_len,
                      dtype=np.int8,
                      value=encoding_keys.index('UNK'))
    y = np.array([label_dist(gene.dist) for gene in genes])

    global gene_ids
    gene_ids = np.array([gene.id for gene in genes])

    from sklearn.model_selection import KFold

    # Shuffled 10-fold split with a fixed seed.
    splitter = KFold(n_splits=10, shuffle=True, random_state=1234)
    folds = splitter.split(X, y)
    return X, y, folds, encoding_keys, encoding_vectors
Ejemplo n.º 7
0
                    type=str,
                    default='cefra-seq',
                    choices=['cefra-seq', 'apex-rip'],
                    help='choose from cefra-seq and apex-rip')
parser.add_argument('--model',
                    type=str,
                    default='cnn_bilstm',
                    choices=['cnn', 'cnn_bilstm', 'resnet'],
                    help='')
parser.add_argument('--message', type=str, default="", help='')
parser.add_argument('--epochs', type=int, default=100, help='')
args = parser.parse_args()

# no clipping, no padding
gene_data = Gene_Wrapper.seq_data_loader(False,
                                         args.dataset,
                                         lower_bound=0,
                                         upper_bound=np.inf)

# Without clipping or padding the rows have different lengths.  NumPy >= 1.24
# refuses to build such a ragged array implicitly, so the object dtype is
# requested explicitly (one list of indices per gene).
X = np.array([[encoding_keys.index(c) for c in gene.seq]
              for gene in gene_data],
             dtype=object)
y = np.array([label_dist(gene.dist) for gene in gene_data])
# Shuffled 10-fold split with a fixed seed.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
folds = kf.split(X, y)

# CeFra-seq is labelled by cell fraction and APEX-RIP by targeting signal;
# the two label lists were previously attached to the wrong dataset.
if args.dataset == "cefra-seq":
    locations = ["cytoplasm", "insoluble", "membrane", "nucleus"]
elif args.dataset == "apex-rip":
    locations = ['KDEL', 'Mito', 'NES', 'NLS']
else:
    raise RuntimeError('No such dataset')
'''prepare extract path'''