Ejemplo n.º 1
0
def export(args):
    """Run ``raw.run`` with the given arguments.

    Thin wrapper: forwards ``args`` unchanged to ``raw.run`` and discards
    whatever that call returns.

    Args:
        args: Passed straight through to ``raw.run``; the attributes it
            must carry are not visible from this block.
    """
    raw.run(args)
Ejemplo n.º 2
0
def test_chiron_input():
    """Round-trip synthetic fast5 reads through the chiron input pipeline.

    Steps:
      1. Write ``file_num`` dummy fast5 files into ``./Dummy_data/fast5s``.
         Each read is a random sequence of events; an event for base ``b``
         contributes one sample at ``base_signal[b]`` followed by
         ``step - 1`` samples at ``base_signal[b] - 1``.
      2. Run ``chiron.utils.raw.run`` on that directory with
         ``tffile='train.tfrecords'``.
      3. Read the records back with ``read_tfrecord`` and, for 100 batches
         of 10, assert that the per-segment signal levels recovered from
         the input correlate perfectly with the label values.

    The generated files are left on disk after the test.

    Raises:
        AssertionError: if a label index does not point at its batch row,
            or the recovered signal levels do not correlate with the label
            values (|corr - 1| >= 1e-6).
    """
    DATA_FORMAT = np.dtype([('start', '<i4'),
                            ('length', '<i4'),
                            ('base', 'S1')])

    ### Generate dummy dataset and check input ###
    dummy_dir = './Dummy_data/'
    dummy_fast5 = os.path.join(dummy_dir, 'fast5s')
    # makedirs creates the intermediate dummy_dir as well, so a single
    # check on the leaf directory covers both.
    if not os.path.isdir(dummy_fast5):
        os.makedirs(dummy_fast5)

    file_num = 10
    base_signal = {'A': 100, 'C': 200, 'G': 300, 'T': 400}
    bases = ['A', 'C', 'G', 'T']
    for i in range(file_num):
        file_n = os.path.join(dummy_fast5, 'dummy_' + str(i) + '.fast5')
        length = np.random.randint(40000, 50000)
        start = 0
        start_list = []
        length_list = []
        base_list = []
        raw_signal = []
        # The loop stops at length - 1, so the read holds length - 1 samples.
        while start < length - 1:
            start_list.append(start)
            # Clamp the last event so it does not run past the read.
            step = min(length - start - 1, np.random.randint(5, 150))
            length_list.append(step)
            start = start + step
            base = bases[np.random.randint(len(bases))]
            base_list.append(base)
            # Grow the signal in place; rebuilding the list with `+` on
            # every event would copy the whole signal each time.
            level = base_signal[base]
            raw_signal.append(level)
            raw_signal.extend([level - 1] * (step - 1))
        event_matrix = np.asarray(
            list(zip(start_list, length_list, base_list)),
            dtype=DATA_FORMAT)

        # Mode 'w' truncates any existing file, so the groups created
        # below can never already exist.
        with h5py.File(file_n, 'w') as root:
            raw_h = root.create_dataset(
                '/Raw/Reads/Read_' + str(i) + '/Signal',
                shape=(len(raw_signal),),
                dtype=np.int16)
            channel_h = root.create_dataset('/UniqueGlobalKey/channel_id/',
                                            shape=[],
                                            dtype=np.int16)
            channel_h.attrs['offset'] = 0
            channel_h.attrs['range'] = 1
            channel_h.attrs['digitisation'] = 1
            # The signal is stored reversed.
            # NOTE(review): presumably because the pipeline flips reads
            # when mode == 'rna' -- confirm against chiron.utils.raw.
            raw_h[...] = raw_signal[::-1]
            event_h = root.create_dataset(
                '/Analyses/Corrected_000/BaseCalled_template/Events',
                shape=(len(event_matrix),),
                maxshape=(None,),
                dtype=DATA_FORMAT)
            event_h[...] = event_matrix
            event_h.attrs['read_start_rel_to_raw'] = 0

    class Args(object):
        # Minimal stand-in for the argument object handed to raw.run.
        def __init__(self):
            self.input = dummy_fast5
            self.output = dummy_dir
            self.basecall_group = 'Corrected_000'
            self.mode = 'rna'
            self.tffile = 'train.tfrecords'
            self.basecall_subgroup = 'BaseCalled_template'
            self.unit = True

    from chiron.utils import raw
    args = Args()
    raw.run(args)
    train = read_tfrecord(dummy_dir,
                          "train.tfrecords",
                          seq_length=1000,
                          h5py_file_path=os.path.join(dummy_dir, 'cache.fast5'))

    for _ in range(100):
        inputX, sequence_length, label = train.next_batch(10, shuffle=False)
        # Offset of the current row's entries inside the flat label arrays.
        accum_len = 0
        for idx, row in enumerate(inputX):
            x = row[:sequence_length[idx]]
            # Keep the first sample and every sample that starts a new
            # segment: one that jumps by more than 2 from its predecessor,
            # or that rises at all.
            y = []
            for x_idx, signal in enumerate(x):
                if x_idx == 0:
                    y.append(signal)
                    continue
                previous = x[x_idx - 1]
                if abs(signal - previous) > 2 or signal > previous:
                    y.append(signal)
            row_slice = slice(accum_len, accum_len + len(y))
            corr = np.corrcoef(y, label[1][row_slice])[0, 1]
            # Every label index in this slice must belong to this row.
            for loc in label[0][row_slice]:
                assert(loc[0] == idx)
            accum_len += len(y)
            assert abs(corr - 1) < 1e-6
    print("Input pipeline dummy data test passed!")