def main():
    """Merge the reads of several mapped-signal files into one output file.

    The alphabet of the first input file is used for the output file. Every
    input file must carry an alphabet that ``equals`` it, otherwise an
    exception is raised. A read id is written only the first time it is
    encountered; later occurrences are reported and skipped.
    """
    args = parser.parse_args()

    # Copy alphabet and modification information from first file
    first_in_fn = args.input[0]
    with MAPPED_SIGNAL_READER(first_in_fn) as first_reader:
        alph_info = alphabet.AlphabetInfo(
            *first_reader.get_alphabet_information())

    seen_read_ids = set()
    print("Writing reads to ", args.output)
    with MAPPED_SIGNAL_WRITER(args.output, alph_info) as hout:
        for infile in args.input:
            n_copied = 0
            with MAPPED_SIGNAL_READER(infile) as hin:
                check_version(hin, infile)
                in_alph_info = alphabet.AlphabetInfo(
                    *hin.get_alphabet_information())
                if not alph_info.equals(in_alph_info):
                    message = "Alphabet info in {} differs from that in {}"
                    raise Exception(message.format(infile, first_in_fn))

                for read_id in hin.get_read_ids():
                    if read_id in seen_read_ids:
                        print("* Read", read_id,
                              "already present: not copying from", infile)
                        continue
                    read_record = hin.get_read(read_id)
                    read_record['read_id'] = read_id
                    hout.write_read(read_record)
                    seen_read_ids.add(read_id)
                    n_copied += 1
            print("Copied", n_copied, "reads from", infile)
    print("Copied", len(seen_read_ids), "reads in total")
def _load_data(args, log):
    """Load training reads and the alphabet definition from ``args.input``.

    Args:
        args: parsed command-line arguments; this function reads
            ``input_strand_list``, ``limit`` and ``input``.
        log: object with a ``write`` method used for progress messages.

    Returns:
        Tuple ``(read_data, alphabet_info)`` where ``read_data`` is the value
        returned by ``get_multiple_reads`` and ``alphabet_info`` is an
        ``alphabet.AlphabetInfo`` built from the file's alphabet information.
    """
    if args.input_strand_list is not None:
        read_ids = list(set(helpers.get_read_ids(args.input_strand_list)))
        # The two literals are joined *before* formatting. Without the
        # parentheses ``.format`` binds to the second literal only and the
        # ``{}`` placeholder in the first one is written out verbatim.
        log.write(('* Will train from a subset of {} strands, determined '
                   'by read_ids in input strand list\n').format(len(read_ids)))
    else:
        log.write('* Will train from all strands\n')
        read_ids = 'all'

    if args.limit is not None:
        log.write('* Limiting number of strands to {}\n'.format(args.limit))

    with mapped_signal_files.HDF5Reader(args.input) as per_read_file:
        (bases_alphabet, collapse_alphabet,
         mod_long_names) = per_read_file.get_alphabet_information()
        read_data = per_read_file.get_multiple_reads(
            read_ids, max_reads=args.limit)
        # read_data now contains a list of reads
        # (each an instance of the Read class defined in
        # mapped_signal_files.py, based on dict)

    log.write('* Loaded {} reads.\n'.format(len(read_data)))

    alphabet_info = alphabet.AlphabetInfo(
        bases_alphabet, collapse_alphabet, mod_long_names, do_reorder=False)
    log.write('* Using alphabet definition: {}\n'.format(str(alphabet_info)))

    return read_data, alphabet_info
def test_check_HDF5_mapped_read_file(self):
    """A non-conforming read must be reported by both check() methods.

    The read's 'Reference' entry is replaced by a string, the read is written
    to file, and both the read-level and the file-level checks are expected
    to return something other than "pass".
    """
    print("Creating flawed Read object from test data")
    flawed_dict = construct_mapped_read()
    # Wrong type!
    flawed_dict['Reference'] = "I'm not a numpy array!"
    flawed_read = mapped_signal_files.Read(flawed_dict)

    print("Checking contents")
    read_report = flawed_read.check()
    print("Check result on read object: should fail")
    print(read_report)
    self.assertNotEqual(read_report, "pass")

    print("Writing to file")
    alphabet_info = alphabet.AlphabetInfo(DEFAULT_ALPHABET, DEFAULT_ALPHABET)
    with mapped_signal_files.HDF5Writer(
            self.testfilepath, alphabet_info) as writer:
        writer.write_read(flawed_read)

    print("Current dir = ", os.getcwd())
    print("File written to ", self.testfilepath)

    print("\nOpening file for reading")
    with mapped_signal_files.HDF5Reader(self.testfilepath) as reader:
        stored_ids = reader.get_read_ids()
        print("Read ids=", stored_ids[0])
        print("Version number = ", reader.version)
        self.assertEqual(stored_ids[0], flawed_dict['read_id'])

        file_report = reader.check()
        print("Test report (should fail):", file_report)
        self.assertNotEqual(file_report, "pass")
def get_alphabet_info(model_info):
    """Build an ``alphabet.AlphabetInfo`` from a model's output alphabet.

    The collapsed ("flat") alphabet is derived by walking the output
    alphabet: each symbol found in ``model_info.can_alphabet`` becomes the
    current canonical base, and every symbol is mapped to the canonical base
    current at its position.
    """
    symbols = model_info.output_alphabet

    current_can = symbols[0]
    flat_alphabet = symbols[0]
    for symbol in symbols[1:]:
        if symbol in model_info.can_alphabet:
            current_can = symbol
        flat_alphabet += current_can

    if len(model_info.mod_long_names) == 0:
        mod_long_names = []
    else:
        # Second element of each mod_long_names entry, across all entries.
        transposed = list(zip(*model_info.mod_long_names))
        mod_long_names = transposed[1]

    return alphabet.AlphabetInfo(
        model_info.output_alphabet, flat_alphabet, mod_long_names,
        do_reorder=True)
def main():
    """Remap every selected read with a flip-flop model and write the output.

    Parses the command line, refuses to clobber an existing output file
    unless ``--overwrite`` is given, builds the alphabet (canonical plus
    modified bases), then runs ``prepare_mapping_funcs.oneread_remap`` over
    the reads via ``imap_mp`` and hands the results to
    ``prepare_mapping_funcs.generate_output_from_results``.
    """
    args = parser.parse_args()
    print("Running prepare_mapping using flip-flop remapping")

    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    # Create alphabet and check for consistency
    modified_bases = [mod_spec[0] for mod_spec in args.mod]
    canonical_bases = [mod_spec[1] for mod_spec in args.mod]
    for mod_b in modified_bases:
        assert len(mod_b) == 1, (
            "Modified bases must be a single character, got {}".format(mod_b))
        assert mod_b not in args.alphabet, (
            "Modified base must not be a canonical base, got {}".format(
                mod_b))
    for can_b in canonical_bases:
        assert len(can_b) == 1, (
            "Canonical coding for modified bases must be a single character, got {}".format(
                can_b))
        assert can_b in args.alphabet, (
            "Canonical coding for modified base must be a canonical base, got {}".format(
                can_b))

    full_alphabet = args.alphabet + ''.join(modified_bases)
    flat_alphabet = args.alphabet + ''.join(canonical_bases)
    modification_names = [mod_spec[2] for mod_spec in args.mod]
    alphabet_info = alphabet.AlphabetInfo(
        full_alphabet, flat_alphabet, modification_names, do_reorder=True)
    print("Converting references to labels using {}".format(
        str(alphabet_info)))

    # Make an iterator that yields all the reads we're interested in.
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder, limit=args.limit,
        strand_list=args.input_strand_list, recursive=args.recursive)

    # Set up arguments (kwargs) for the worker function for each read
    kwargs = {
        'per_read_params_dict':
            prepare_mapping_funcs.get_per_read_params_dict_from_tsv(
                args.input_per_read_params),
        'model': helpers.load_model(args.model),
        'alphabet_info': alphabet_info,
        'max_read_length': args.max_read_length,
        'localpen': args.localpen,
    }

    # remaps a single read using flip-flip network
    workerFunction = prepare_mapping_funcs.oneread_remap

    def iter_jobs():
        # One job per read: (filename, read id, reference or None).
        references = bio.fasta_file_to_dict(
            args.references, alphabet=full_alphabet)
        for fn, read_id in fast5_reads:
            yield fn, read_id, references.get(read_id, None)

    if args.limit is None:
        chunksize = 50
    else:
        # Scale the chunk size with the read limit, bounded to [1, 50].
        per_job = args.limit // (2 * args.jobs)
        chunksize = int(np.clip(per_job, 1, 50))

    results = imap_mp(
        workerFunction, iter_jobs(), threads=args.jobs, fix_kwargs=kwargs,
        unordered=True, chunksize=chunksize)

    # results is an iterable of dicts
    # each dict is a set of return values from a single read
    prepare_mapping_funcs.generate_output_from_results(
        results, args.output, alphabet_info)
parser.add_argument('--max',type=int,default=None) parser.add_argument('--calibration', action="store_true" ) args = parser.parse_args() """ with open(args.training_file,"r") as f: training_file = json.loads("".join(f.readlines())) """ dataset = pd.read_csv(args.dataset,sep=";") #print("data",dataset) #print(dataset.columns) print(args.alphabet,args.collapsed_alphabet) alpha = alphabet.AlphabetInfo(args.alphabet, args.collapsed_alphabet,args.mod_long_names) with MAPPED_SIGNAL_WRITER(args.output, alpha) as hout: for index, row in dataset.iterrows(): print(row) new_alphabet = row["new_alphabet"] mod_long_names = row["mod_long_names"] canonical = row.get("canonical","T") threshold = row["threshold"] filter_section = row["filter_section"] lower_threshold = row.get("lower_threshold",False) #Under set to 0 higher_threshold = row.get("higher_threshold",False)#higher set to higvalue
def parse_sublayer(sublayer):
    """Construct a network layer from its dictionary description.

    Args:
        sublayer: mapping describing one layer. ``sublayer['type']`` selects
            the layer class; the other keys read depend on the type
            ('insize', 'size', 'winlen', 'stride', 'activation',
            'sublayers', 'output_alphabet', 'can_nmods',
            'modified_base_long_names', 'params').

    Returns:
        The constructed layer after ``set_params`` has been applied to it.

    Exits the process with status 1 (after writing a message to stderr) when
    the layer type, a reversed layer's inner type, or a convolution's
    activation function is not supported.
    """
    # TODO apply additional attributes (e.g. has_bias, convolutional padding)
    if sublayer['type'] == 'convolution':
        if sublayer['activation'] != 'tanh':
            # Report the offending activation, not the layer type (which is
            # always 'convolution' on this branch).
            sys.stderr.write((
                'Incompatible convolutional layer activation function ' +
                '({}) encountered.\n').format(sublayer['activation']))
            sys.exit(1)
        sys.stderr.write((
            'Loading convolutional layer with attributes:\n\tin size: {}\n' +
            '\tout size: {}\n\twinlen: {}\n\tstride: {}\n').format(
                sublayer['insize'], sublayer['size'], sublayer['winlen'],
                sublayer['stride']))
        layer = Convolution(
            sublayer['insize'], sublayer['size'], sublayer['winlen'],
            stride=sublayer['stride'], fun=tanh)
    elif sublayer['type'] == 'LSTM':
        sys.stderr.write((
            'Loading LSTM layer with attributes:\n\tin size: {}\n' +
            '\tout size: {}\n').format(
                sublayer['insize'], sublayer['size']))
        layer = Lstm(sublayer['insize'], sublayer['size'])
    elif sublayer['type'] == 'GruMod':
        sys.stderr.write((
            'Loading GRU layer with attributes:\n\tin size: {}\n' +
            '\tout size: {}\n').format(
                sublayer['insize'], sublayer['size']))
        layer = GruMod(sublayer['insize'], sublayer['size'])
    elif sublayer['type'] == 'reverse':
        # From here on ``sublayer`` refers to the wrapped inner layer, so the
        # final set_params call below uses the inner layer's params and type.
        sublayer = sublayer['sublayers']
        if sublayer['type'] == 'GruMod':
            sys.stderr.write((
                'Loading Reverse GRU layer with attributes:\n\tin size: {}\n' +
                '\tout size: {}\n').format(
                    sublayer['insize'], sublayer['size']))
            layer = Reverse(GruMod(sublayer['insize'], sublayer['size']))
        elif sublayer['type'] == 'LSTM':
            sys.stderr.write((
                'Loading Reverse LSTM layer with attributes:\n' +
                '\tin size: {}\n\tout size: {}\n').format(
                    sublayer['insize'], sublayer['size']))
            layer = Reverse(Lstm(sublayer['insize'], sublayer['size']))
        else:
            sys.stderr.write((
                'Invalid reversed-time layer type ({})\n').format(
                    sublayer['type']))
            sys.exit(1)
    elif sublayer['type'] == 'GlobalNormTwoState':
        nbase = nbase_flipflop(sublayer['size'])
        sys.stderr.write((
            'Loading flip-flop layer with attributes:\n\tin size: {}\n' +
            '\tnbases: {}\n').format(sublayer['insize'], nbase))
        layer = GlobalNormFlipFlop(sublayer['insize'], nbase)
    elif sublayer['type'] == 'GlobalNormTwoStateCatMod':
        output_alphabet = sublayer['output_alphabet']
        # Each canonical base is followed in output_alphabet by its
        # modifications; repeat the canonical symbol once per member of its
        # group to obtain the collapsed alphabet.
        curr_can_base = 0
        collapse_alphabet = ''
        for can_i_nmod in sublayer['can_nmods']:
            collapse_alphabet += output_alphabet[curr_can_base] * (
                can_i_nmod + 1)
            curr_can_base += can_i_nmod + 1
        alphabet_info = alphabet.AlphabetInfo(
            output_alphabet, collapse_alphabet,
            sublayer['modified_base_long_names'], do_reorder=False)
        sys.stderr.write((
            'Loading modified bases flip-flop layer with attributes:\n' +
            '\tin size: {}\n\tmod bases: {}\n').format(
                sublayer['insize'], alphabet_info.mod_long_names))
        layer = GlobalNormFlipFlopCatMod(sublayer['insize'], alphabet_info)
    else:
        sys.stderr.write('Encountered invalid layer type ({}).\n'.format(
            sublayer['type']))
        sys.exit(1)

    layer = set_params(layer, sublayer['params'], sublayer['type'])

    return layer
def get_alphabet_information(self):
    """Return an ``alphabet.AlphabetInfo`` built from the file attributes.

    Reads the 'mod_long_names', 'alphabet' and 'collapse_alphabet' entries
    of ``self.hdf5.attrs``; 'mod_long_names' is split into one name per line.
    """
    file_attrs = self.hdf5.attrs
    long_names = file_attrs['mod_long_names'].splitlines()
    full_alphabet = file_attrs['alphabet']
    collapsed_alphabet = file_attrs['collapse_alphabet']
    return alphabet.AlphabetInfo(full_alphabet, collapsed_alphabet, long_names)
def get_alphabet_info(output_alphabet, collapse_alphabet, mod_long_names):
    """Wrap the three alphabet descriptors in an ``alphabet.AlphabetInfo``.

    The arguments are forwarded unchanged, with ``do_reorder=True``.
    """
    alphabet_info = alphabet.AlphabetInfo(
        output_alphabet,
        collapse_alphabet,
        mod_long_names,
        do_reorder=True,
    )
    return alphabet_info
def test_HDF5_mapped_read_file(self):
    """Round-trip a mapped read through a mapped signal file.

    A read is built from the test data, checked, written to a temporary
    file and read back; the recovered read is then queried for a chunk and
    its mapping compared with the input. The plotting code at the end is
    disabled (``if False``) and kept for diagnostic use only.
    """
    print("Creating Read object from test data")
    read_dict = construct_mapped_read_dict()
    source_read = signal_mapping.SignalMapping(**read_dict)

    print("Checking contents")
    read_report = source_read.check()
    print("Check result on read object:")
    print(read_report)
    self.assertEqual(read_report, "pass")

    print("Writing to file")
    # Only the name of the temporary file is needed; delete=False keeps the
    # file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(
            delete=False, dir=self.testset_work_dir) as tmp_handle:
        testfilepath = tmp_handle.name
    alphabet_info = alphabet.AlphabetInfo(DEFAULT_ALPHABET, DEFAULT_ALPHABET)
    with mapped_signal_files.MappedSignalWriter(
            testfilepath, alphabet_info) as writer:
        writer.write_read(source_read.get_read_dictionary())

    print("Current dir = ", os.getcwd())
    print("File written to ", testfilepath)

    print("\nOpening file for reading")
    with mapped_signal_files.MappedSignalReader(testfilepath) as reader:
        stored_ids = reader.get_read_ids()
        print("Read ids=", stored_ids[0])
        print("Version number = ", reader.version)
        self.assertEqual(stored_ids[0], read_dict['read_id'])

        file_report = reader.check()
        print("Test report:", file_report)
        self.assertEqual(file_report, "pass")

        read_list = list(reader.reads())

    recovered_read = read_list[0]
    reflen = len(recovered_read.Reference)
    siglen = len(recovered_read.Dacs)

    # Get a chunk - note that chunkstart is relative to the start of
    # the mapped region, not relative to the start of the signal
    chunklen, chunkstart = 5, 3
    chunk = recovered_read.get_chunk_with_sample_length(
        chunklen, chunkstart)

    # Check that the extracted chunk is the right length
    self.assertEqual(chunk.sig_len, chunklen)

    # Check that the mapping data agrees with what we put in
    mapping_matches = np.all(
        recovered_read.Ref_to_signal == read_dict['Ref_to_signal'])
    self.assertTrue(mapping_matches)

    # Plot a picture showing ref_to_sig from the read object,
    # and the result of searches to find the inverse
    if False:
        plt.figure()
        plt.xlabel('Signal coord')
        plt.ylabel('Ref coord')
        ix = np.array([0, -1])
        plt.scatter(chunk.current[ix], chunk.sequence[ix],
                    s=50, label='chunk limits', marker='s', color='black')
        plt.scatter(recovered_read.Ref_to_signal, np.arange(reflen + 1),
                    label='reftosig (source data)',
                    color='none', edgecolor='blue', s=60)
        siglocs = np.arange(siglen, dtype=np.int32)
        sigtoref_fromsearch = recovered_read.get_reference_locations(
            siglocs)
        plt.scatter(siglocs, sigtoref_fromsearch,
                    label='from search', color='red', marker='x', s=50)
        plt.legend()
        plt.grid()
        plt.savefig(self.plotfilepath)
        print("Saved plot to", self.plotfilepath)
def test_check_HDF5_mapped_read_file(self):
    """Valid reads pass the checks; a read with a bad Reference does not.

    A valid and an invalid read (Reference one element shorter) are built
    from the test data. The invalid read must fail ``check()`` and must be
    rejected on write with ``TaiyakiSigMapError``; the valid read must be
    written successfully and the resulting file must pass its own check.
    """
    pass_str = signal_mapping.SignalMapping.pass_str

    print("Creating Read object from test data")
    valid_read_dict = construct_mapped_read_dict()
    valid_read = signal_mapping.SignalMapping(**valid_read_dict)
    print("Checking contents")
    valid_report = valid_read.check()
    print("Check result on valid read object: should pass")
    print(valid_report)
    self.assertEqual(valid_report, pass_str)

    print("Creating flawed Read object from test data")
    invalid_read_dict = construct_mapped_read_dict()
    # set reference to incorrect length
    short_len = len(invalid_read_dict['Reference']) - 1
    invalid_read_dict['Reference'] = np.zeros(short_len, dtype=np.int32)
    invalid_read = signal_mapping.SignalMapping(**invalid_read_dict)
    print("Checking contents")
    invalid_report = invalid_read.check()
    print("Check result on invalid read object: should fail")
    print(invalid_report)
    self.assertNotEqual(invalid_report, pass_str)

    print("Writing invalid read to file")
    alphabet_info = alphabet.AlphabetInfo(DEFAULT_ALPHABET, DEFAULT_ALPHABET)
    with tempfile.NamedTemporaryFile(
            delete=True, dir=self.testset_work_dir) as tmp_handle:
        testfilepath = tmp_handle.name
    with mapped_signal_files.MappedSignalWriter(
            testfilepath, alphabet_info) as writer:
        try:
            writer.write_read(invalid_read.get_read_dictionary())
        except signal_mapping.TaiyakiSigMapError:
            # Expected: the writer refuses the invalid read.
            pass
        else:
            self.assertTrue(False, 'Invalid read passed checks.')

    print("Writing valid read to file")
    with tempfile.NamedTemporaryFile(
            delete=False, dir=self.testset_work_dir) as tmp_handle:
        testfilepath = tmp_handle.name
    with mapped_signal_files.MappedSignalWriter(
            testfilepath, alphabet_info) as writer:
        try:
            writer.write_read(valid_read.get_read_dictionary())
        except signal_mapping.TaiyakiSigMapError:
            self.assertTrue(False, 'Valid read failed checks.')

    print("Current dir = ", os.getcwd())
    print("File written to ", testfilepath)

    print("\nOpening valid file for reading")
    with mapped_signal_files.MappedSignalReader(testfilepath) as reader:
        stored_ids = reader.get_read_ids()
        print("Read ids=", stored_ids[0])
        print("Version number = ", reader.version)
        self.assertEqual(stored_ids[0], valid_read_dict['read_id'])

        file_report = reader.check()
        print("Test report (should pass):", file_report)
        self.assertEqual(file_report, pass_str)
def test_HDF5_mapped_read_file(self):
    """Test that we can save a mapped read file, open it again and
    use some methods to get data from it.

    The plotting code at the end is disabled (``if False``) and kept for
    diagnostic use only.
    """
    print("Creating Read object from test data")
    read_dict = construct_mapped_read()
    read_object = mapped_signal_files.Read(read_dict)
    print("Checking contents")
    check_text = read_object.check()
    print("Check result on read object:")
    print(check_text)
    self.assertEqual(check_text, "pass")

    print("Writing to file")
    alphabet_info = alphabet.AlphabetInfo(DEFAULT_ALPHABET, DEFAULT_ALPHABET)
    with mapped_signal_files.HDF5Writer(
            self.testfilepath, alphabet_info) as f:
        f.write_read(read_object)

    print("Current dir = ", os.getcwd())
    print("File written to ", self.testfilepath)

    print("\nOpening file for reading")
    with mapped_signal_files.HDF5Reader(self.testfilepath) as f:
        ids = f.get_read_ids()
        print("Read ids=", ids[0])
        print("Version number = ", f.version)
        self.assertEqual(ids[0], read_dict['read_id'])

        file_test_report = f.check()
        print("Test report:", file_test_report)
        self.assertEqual(file_test_report, "pass")

        read_list = f.get_multiple_reads("all")

    recovered_read = read_list[0]
    reflen = len(recovered_read['Reference'])
    siglen = len(recovered_read['Dacs'])

    # Get a chunk - note that chunkstart is relative to the start of the mapped
    # region, not relative to the start of the signal
    chunklen, chunkstart = 5, 3
    chunkdict = recovered_read.get_chunk_with_sample_length(
        chunklen, chunkstart)

    # Check that the extracted chunk is the right length
    self.assertEqual(len(chunkdict['current']), chunklen)

    # Check that the mapping data agrees with what we put in
    self.assertTrue(
        np.all(recovered_read['Ref_to_signal'] == read_dict['Ref_to_signal']))

    # Plot a picture showing ref_to_sig from the read object,
    # and the result of searches to find the inverse
    # (a stray, never-called nested ``def setup():`` used to sit between
    # these comment lines; the disabled block belongs to the test body).
    if False:
        plt.figure()
        plt.xlabel('Signal coord')
        plt.ylabel('Ref coord')
        ix = np.array([0, -1])
        plt.scatter(chunkdict['current'][ix], chunkdict['sequence'][ix],
                    s=50, label='chunk limits', marker='s', color='black')
        plt.scatter(recovered_read['Ref_to_signal'], np.arange(reflen + 1),
                    label='reftosig (source data)',
                    color='none', edgecolor='blue', s=60)
        siglocs = np.arange(siglen, dtype=np.int32)
        sigtoref_fromsearch = recovered_read.get_reference_locations(siglocs)
        plt.scatter(siglocs, sigtoref_fromsearch,
                    label='from search', color='red', marker='x', s=50)
        plt.legend()
        plt.grid()
        plt.savefig(self.plotfilepath)
        print("Saved plot to", self.plotfilepath)
else: # Read sequences from .fa / .fasta file seq_dict = { int(seq.id): convert_seq(str(seq.seq), args.alphabet) for seq in SeqIO.parse(args.reference, "fasta") } log.write('* Loaded references from {}.\n'.format(args.reference)) # Write pickle for future pickle_name = os.path.splitext(args.reference)[0] + '.pkl' with open(pickle_name, 'wb') as fh: pickle.dump(seq_dict, fh) log.write(('* Written pickle of processed references to {} for ' + 'future use.\n').format(pickle_name)) log.write('* Reading network from {}\n'.format(args.model)) alphabet_info = alphabet.AlphabetInfo(args.alphabet, args.alphabet) model_kwargs = { 'size': args.size, 'stride': args.stride, 'winlen': args.winlen, # Number of input features to model e.g. was >1 for event-based models # (level, std, dwell) 'insize': 1, 'alphabet_info': alphabet_info } model_metadata = {'reverse': False, 'standardize': True} network = helpers.load_model(args.model, model_metadata=model_metadata, **model_kwargs).to(device) log.write('* Network has {} parameters.\n'.format(
def validate_and_merge_alphabets(in_fns):
    """Validate that all alphabets are compatible and merge them.

    Alphabets can be incompatible if:
      1) Same mod_base corresponds to different can_base
      2) Same mod_base has different mod_long_names
      3) Same mod_long_name has different mod_bases

    Also checks file versions so that a version problem doesn't short
    circuit a longer run.

    Args:
        in_fns: filenames of the mapped signal files to merge.

    Returns:
        The merged ``alphabet.AlphabetInfo`` object. When none of the files
        declares a modified base, the merged alphabet consists of the
        canonical bases only.

    Exits the process with status 1 (after writing a message to stderr) when
    the canonical alphabets differ or the modified bases are incompatible.
    """
    all_alphabets = []
    for in_fn in in_fns:
        with MappedSignalReader(in_fn) as msr:
            all_alphabets.append(msr.get_alphabet_information())
            check_version(msr, in_fn)

    can_bases = all_alphabets[0].can_bases
    if not all((file_alphabet.can_bases == can_bases
                for file_alphabet in all_alphabets)):
        sys.stderr.write("All canonical alphabets must be the same for " +
                         "--allow_mod_merge. Got: {}\n".format(', '.join(
                             set(fa.can_bases for fa in all_alphabets))))
        sys.exit(1)

    # all_mods:       mod_base      -> (can_base, mod_long_name)
    # mod_long_names: mod_long_name -> mod_base
    # mod_fns:        mod_base      -> file in which it was first seen
    all_mods, mod_long_names, mod_fns = {}, {}, {}
    for in_fn, file_alphabet in zip(in_fns, all_alphabets):
        for mod_base in file_alphabet.mod_bases:
            can_base = mod_base.translate(file_alphabet.translation_table)
            mod_long_name = file_alphabet.mod_name_conv[mod_base]
            if mod_base in all_mods:
                # if this mod base has been seen assert that all other
                # attributes agree
                if all_mods[mod_base] != (can_base, mod_long_name):
                    sys.stderr.write(
                        ('Incompatible modified bases encountered:\n\t' +
                         '{}={} (alt to {}) from {}\n\t' +
                         '{}={} (alt to {}) from {}\n').format(
                             mod_base, mod_long_name, can_base, in_fn,
                             mod_base, all_mods[mod_base][1],
                             all_mods[mod_base][0], mod_fns[mod_base]))
                    sys.exit(1)
            else:
                # if the mod_base has not been seen before, the long name must
                # also be unique
                if mod_long_name in mod_long_names:
                    sys.stderr.write(
                        ('Incompatible modified bases encountered:\n\t' +
                         '{}={} (alt to {}) from {}\n\t' +
                         '{}={} (alt to {}) from {}\n').format(
                             mod_base, mod_long_name, can_base, in_fn,
                             mod_long_names[mod_long_name], mod_long_name,
                             all_mods[mod_long_names[mod_long_name]][0],
                             mod_fns[mod_long_names[mod_long_name]]))
                    sys.exit(1)
                all_mods[mod_base] = (can_base, mod_long_name)
                mod_long_names[mod_long_name] = mod_base
                mod_fns[mod_base] = in_fn

    # Build the three parallel sequences directly from the dict. Indexing
    # list(zip(*...)) raised IndexError when no modified base was present.
    merge_alphabet = can_bases + ''.join(all_mods)
    merge_collapse_alphabet = can_bases + ''.join(
        can_b for can_b, _ in all_mods.values())
    merge_mod_long_names = tuple(mln for _, mln in all_mods.values())
    return alphabet.AlphabetInfo(merge_alphabet, merge_collapse_alphabet,
                                 merge_mod_long_names, do_reorder=True)