def testIsValidWithMetadata(self):
    # Create one invalid DSV file (one record is split across two lines)
    content = """@title: test
#CHROM.POS.ID.REF.ALT.QUAL.FILTER
20.14370.rs6054257.G.A.29.PASS
20.17330.
..A.3.q10 low
20.1110696.rs6040355.A.G,T.67.PASS
.1230237..T..47.
20.1234567.microsat1.GTC.G,GTCT.50.
......"""
    with open(self.tmp_out_dsv, "w") as FH_out:
        FH_out.write(content)
    # Check isValid
    self.assertTrue(not SVIO.isValid(self.tmp_out_dsv, ".", "@"))

    # Create one valid DSV file
    content = """@title: test
#CHROM.POS.ID.REF.ALT.QUAL.FILTER
20.14370.rs6054257.G.A.29.PASS
20.17330...A.3.q10 low
20.1110696.rs6040355.A.G,T.67.PASS
.1230237..T..47.
20.1234567.microsat1.GTC.G,GTCT.50.
......"""
    with open(self.tmp_out_dsv, "w") as FH_out:
        FH_out.write(content)
    # Check isValid
    self.assertTrue(SVIO.isValid(self.tmp_out_dsv, ".", "@"))
    # Check if file pointer is ok on reopen
    observed_rows = []
    with open(self.tmp_out_dsv) as FH_in:
        for row_idx, readed_row in enumerate(FH_in):
            observed_rows.append(readed_row)
    self.assertEqual(content, "".join(observed_rows))
def testIterTSV(self):
    with SVIO(self.tmp_in_tsv, title_starter="#") as FH_in:
        # Header
        self.assertEqual(FH_in.titles, self.data["titles"])
        # Records
        for record_idx, readed_records in enumerate(FH_in):
            self.assertEqual(self.data["rows"][record_idx], readed_records)
def loadFromDepthFile(in_path, samples):
    """
    Load depth classes and counts from a samtools depth output.

    :param in_path: Path to the samtools depth output.
    :type in_path: str
    :param samples: The list of sample names, in the order of the depth columns.
    :type samples: list
    :return: The list of depths and, by sample, the list of counts.
    :rtype: list, dict
    """
    encountered_depths = dict()
    count_by_spl = {curr_spl: dict() for curr_spl in samples}
    with SVIO(in_path, has_title=False) as FH_depths:
        for record in FH_depths:  # record = [chr, pos, depth_spl_1, ..., depth_spl_n]
            for spl_idx, curr_spl in enumerate(samples):
                depth = int(record[spl_idx + 2])
                encountered_depths[depth] = 1
                if depth in count_by_spl[curr_spl]:
                    count_by_spl[curr_spl][depth] += 1
                else:
                    count_by_spl[curr_spl][depth] = 1
    depths_list = sorted([key for key in encountered_depths])
    for spl in samples:
        spl_counts = list()
        for depth in depths_list:
            if depth in count_by_spl[spl]:
                spl_counts.append(count_by_spl[spl][depth])
            else:
                spl_counts.append(0)
        count_by_spl[spl] = spl_counts
    return depths_list, count_by_spl
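A minimal usage sketch for loadFromDepthFile (not part of the original source; the temporary file and the sample labels splA/splB are illustrative assumptions). It builds a tiny two-sample samtools depth output, then checks the returned sorted list of observed depths and the per-sample counts aligned on that list.

import os
import tempfile

# Illustrative samtools depth output: chrom, pos, depth for splA, depth for splB.
depth_content = "chr1\t10\t5\t8\nchr1\t11\t5\t9\nchr1\t12\t7\t8"
with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as tmp_depth:
    tmp_depth.write(depth_content)

depths, counts = loadFromDepthFile(tmp_depth.name, ["splA", "splB"])
assert depths == [5, 7, 8, 9]  # Every depth observed in at least one sample
assert counts == {
    "splA": [2, 1, 0, 0],  # splA: depth 5 at two positions, depth 7 at one
    "splB": [0, 0, 2, 1]   # splB: depth 8 at two positions, depth 9 at one
}
os.remove(tmp_depth.name)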
def testAppendDSV(self):
    split_limit = 3  # Number of rows in the first write
    # 1st write step
    with SVIO(self.tmp_out_dsv, "a", separator=".", title_starter="#") as FH_out:
        FH_out.titles = self.data["titles"]
        for row in self.data["rows"][:split_limit]:
            FH_out.write(row)
    # Assert result
    with open(self.tmp_out_dsv) as FH_in:
        nb_rows = -1
        for row_idx, readed_row in enumerate(FH_in):
            nb_rows += 1
            if row_idx == 0:
                expected_row = "#" + ".".join(self.data["titles"]) + "\n"
            else:
                expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
            self.assertEqual(expected_row, readed_row)
        self.assertEqual(split_limit, nb_rows)
    # 2nd write step
    with SVIO(self.tmp_out_dsv, "a", separator=".", title_starter="#") as FH_out:
        self.assertEqual(self.data["titles"], FH_out.titles)  # Assert titles retrieval
        self.assertEqual(split_limit + 1, FH_out.current_line_nb)
        for row in self.data["rows"][split_limit:]:
            FH_out.write(row)
    # Assert result
    with open(self.tmp_out_dsv) as FH_in:
        nb_rows = -1
        for row_idx, readed_row in enumerate(FH_in):
            nb_rows += 1
            if row_idx == 0:
                expected_row = "#" + ".".join(self.data["titles"]) + "\n"
            else:
                expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
            self.assertEqual(expected_row, readed_row)
        self.assertEqual(len(self.data["rows"]), nb_rows)
def testWriteDSV(self):
    # Write file
    with SVIO(self.tmp_out_dsv, "w", separator=".", title_starter="#") as FH_out:
        FH_out.titles = self.data["titles"]
        for row in self.data["rows"]:
            FH_out.write(row)
    # Assert result
    with open(self.tmp_out_dsv) as FH_in:
        for row_idx, readed_row in enumerate(FH_in):
            if row_idx == 0:
                expected_row = "#" + ".".join(self.data["titles"]) + "\n"
            else:
                expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
            self.assertEqual(expected_row, readed_row)
def testAppendCompressedEmptyDSV(self):
    # Create an empty file
    with gzip.open(self.tmp_out_dsv_gz, "wt") as FH_out:
        pass
    # Write file
    with SVIO(self.tmp_out_dsv_gz, "a", separator=".", title_starter="#") as FH_out:
        FH_out.titles = self.data["titles"]
        for row in self.data["rows"]:
            FH_out.write(row)
    # Assert result
    nb_rows = -1
    with gzip.open(self.tmp_out_dsv_gz, "rt") as FH_in:
        for row_idx, readed_row in enumerate(FH_in):
            nb_rows += 1
            if row_idx == 0:
                expected_row = "#" + ".".join(self.data["titles"]) + "\n"
            else:
                expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
            self.assertEqual(expected_row, readed_row)
    self.assertEqual(len(self.data["rows"]), nb_rows)
def testIsValidTrue(self):
    # Create one valid DSV file
    rows = deepcopy(self.data["rows"])
    rows.insert(0, self.data["titles"])
    rows.extend([
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
        ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""]
    ])
    with open(self.tmp_out_dsv, "w") as FH_out:
        for curr_row in rows:
            FH_out.write(".".join(curr_row) + "\n")
    # Check isValid
    self.assertEqual(True, SVIO.isValid(self.tmp_out_dsv, "."))
    # Check if file pointer is ok on reopen
    with open(self.tmp_out_dsv) as FH_in:
        for row_idx, readed_row in enumerate(FH_in):
            expected_row = ".".join(rows[row_idx]) + "\n"
            self.assertEqual(expected_row, readed_row)
        help='Path to saturation file (format: separated values).')
    args = parser.parse_args()

    # Logger
    logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s')
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Get samples
    samples = args.samples_names
    if args.samples_names is None:
        with SVIO(args.input_counts, has_title=True) as reader:
            samples = reader.titles[1:]

    # Process
    random.seed(args.random_seed)
    with SVIO(args.output_saturation, "w", separator=args.fields_separator) as writer:
        writer.titles = ["sample", "nb_sampled", "nb_unique"]
        for spl_idx, spl_name in enumerate(samples):
            log.info("Process sample {}".format(spl_name))
            with SVIO(args.input_counts, separator=args.fields_separator, has_title=(args.samples_names is None)) as reader:
                # Get counts
                total_count = 0
                curr_idx = 0
def testWriteHeader(self):
    # With titles and metadata
    with SVIO(self.tmp_out_tsv, "w") as writer:
        writer.titles = self.data["titles"]
        writer.metadata = self.data["metadata"]
        writer.writeHeader()
        writer.write(self.data["rows"][0])
    with SVIO(self.tmp_out_tsv, "r") as reader:
        self.assertEqual(reader.titles, self.data["titles"])
        self.assertEqual(reader.metadata, self.data["metadata"])
        self.assertEqual(reader.read(), [self.data["rows"][0]])

    # Without metadata
    with SVIO(self.tmp_out_tsv, "w") as writer:
        writer.titles = self.data["titles"]
        writer.writeHeader()
        writer.write(self.data["rows"][0])
    with SVIO(self.tmp_out_tsv, "r") as reader:
        self.assertEqual(reader.titles, self.data["titles"])
        self.assertEqual(reader.metadata, [])
        self.assertEqual(reader.read(), [self.data["rows"][0]])

    # Without titles
    with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
        writer.metadata = self.data["metadata"]
        writer.writeHeader()
        writer.write(self.data["rows"][0])
    with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
        self.assertEqual(reader.titles, None)
        self.assertEqual(reader.metadata, self.data["metadata"])
        self.assertEqual(reader.read(), [self.data["rows"][0]])

    # Without titles and metadata
    with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
        writer.writeHeader()
        writer.write(self.data["rows"][0])
    with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
        self.assertEqual(reader.titles, None)
        self.assertEqual(reader.metadata, [])
        self.assertEqual(reader.read(), [self.data["rows"][0]])

    # Only header with titles and metadata
    with SVIO(self.tmp_out_tsv, "w") as writer:
        writer.titles = self.data["titles"]
        writer.metadata = self.data["metadata"]
        writer.writeHeader()
    with SVIO(self.tmp_out_tsv, "r") as reader:
        self.assertEqual(reader.titles, self.data["titles"])
        self.assertEqual(reader.metadata, self.data["metadata"])

    # Only header without metadata
    with SVIO(self.tmp_out_tsv, "w") as writer:
        writer.titles = self.data["titles"]
        writer.writeHeader()
    with SVIO(self.tmp_out_tsv, "r") as reader:
        self.assertEqual(reader.titles, self.data["titles"])
        self.assertEqual(reader.metadata, [])

    # Only header without titles
    with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
        writer.metadata = self.data["metadata"]
        writer.writeHeader()
    with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
        self.assertEqual(reader.titles, None)
        self.assertEqual(reader.metadata, self.data["metadata"])

    # Empty without titles and metadata
    with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
        writer.writeHeader()
    with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
        self.assertEqual(reader.titles, None)
        self.assertEqual(reader.metadata, [])
        args.work_folder, "out_dataset-{}".format(dataset_id))
    # Process tag
    with open(app_config_bck) as FH_in:
        with open(app_config, "w") as FH_out:
            for line in FH_in:
                FH_out.write(line.replace("limit_submission = 100", "limit_submission = " + str(nb_jobs)))
    start_time = time.time()
    predict(test_samples, design_folder, baseline_path, models_path, out_folder, args)
    end_time = time.time()

    # Write results and dataset
    out_mode = "w" if is_first else "a"
    with SVIO(args.output_metrics, out_mode) as FH_out:
        FH_out.titles = ["dataset_id", "nb_loci", "nb_spl", "median_nb_nt", "sum_nb_nt", "median_nb_reads", "sum_nb_reads", "nb_jobs", "exec_time"]
        FH_out.write([
            dataset_id,
            len(loci),
            nb_spl,
            median([spl["nb_nt"] for spl in test_samples]),
            sum([spl["nb_nt"] for spl in test_samples]),
            median([spl["nb_reads"] for spl in test_samples]),
            sum([spl["nb_reads"] for spl in test_samples]),
            nb_jobs,
            (end_time - start_time) / 60
        ])
    is_first = False
        'chr13': '13', 'chr14': '14', 'chr15': '15', 'chr16': '16',
        'chr17': '17', 'chr18': '18', 'chr19': '19', 'chr20': '20',
        'chr21': '21', 'chr22': '22', 'chrX': 'X', 'chrY': 'Y', 'chrM': 'MT'
    }
    if args.input_names:
        with SVIO(args.input_names, "r", separator="\t", has_title=False) as reader:
            for record in reader:
                new_names[record[0]] = record[1]

    # Process
    with VCFIO(args.output_variants, "w") as writer:
        with VCFIO(args.input_variants, "r") as reader:
            # Header
            writer.copyHeader(reader)
            for idx, curr_header in enumerate(writer.extra_header):
                if curr_header.startswith("##contig"):
                    content = uGetHeaderAttr(curr_header)
                    old_id = content.id
                    if content.id in new_names:
                        new_id = new_names[old_id]
                        writer.extra_header[idx] = curr_header.replace(