Example #1
    def testIsValidWithMetadata(self):
        # Create one invalid DSV file (the blank body line breaks validity)
        content = """@title: test
#CHROM.POS.ID.REF.ALT.QUAL.FILTER
20.14370.rs6054257.G.A.29.PASS
20.17330.   ..A.3.q10 low

20.1110696.rs6040355.A.G,T.67.PASS
.1230237..T..47.
20.1234567.microsat1.GTC.G,GTCT.50.
......"""
        with open(self.tmp_out_dsv, "w") as FH_out:
            FH_out.write(content)
        # Check isValid
        self.assertFalse(SVIO.isValid(self.tmp_out_dsv, ".", "@"))
        # Create one valid DSV file
        content = """@title: test
#CHROM.POS.ID.REF.ALT.QUAL.FILTER
20.14370.rs6054257.G.A.29.PASS
20.17330.   ..A.3.q10 low
20.1110696.rs6040355.A.G,T.67.PASS
.1230237..T..47.
20.1234567.microsat1.GTC.G,GTCT.50.
......"""
        with open(self.tmp_out_dsv, "w") as FH_out:
            FH_out.write(content)
        # Check isValid
        self.assertTrue(SVIO.isValid(self.tmp_out_dsv, ".", "@"))
        # Check that the file pointer is ok after reopening
        observed_rows = []
        with open(self.tmp_out_dsv) as FH_in:
            for readed_row in FH_in:
                observed_rows.append(readed_row)
        self.assertEqual(content, "".join(observed_rows))
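The two payloads above differ only by the blank line in the first one; that single empty row is what flips isValid. Below is a standalone sketch of the same contract, with hypothetical paths and an assumed anacore.sv import path for SVIO.

from anacore.sv import SVIO  # assumed import path

# A blank line in the body breaks the constant field count, so the file
# should be rejected; the same content without it should pass.
with open("/tmp/invalid.dsv", "w") as handle:
    handle.write("A.B.C\n\n1.2.3\n")
with open("/tmp/valid.dsv", "w") as handle:
    handle.write("A.B.C\n1.2.3\n")

print(SVIO.isValid("/tmp/invalid.dsv", "."))  # expected: False
print(SVIO.isValid("/tmp/valid.dsv", "."))  # expected: True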
Example #2
 def testIterTSV(self):
     with SVIO(self.tmp_in_tsv, title_starter="#") as FH_in:
         # Header
         self.assertEqual(FH_in.titles, self.data["titles"])
         # Records
         for record_idx, readed_records in enumerate(FH_in):
             self.assertEqual(self.data["rows"][record_idx], readed_records)
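A minimal usage sketch of the read pattern this test exercises, with a hypothetical path and content and an assumed anacore.sv import path.

from anacore.sv import SVIO  # assumed import path

with open("/tmp/demo.tsv", "w") as handle:
    handle.write("#CHROM\tPOS\tID\n20\t14370\trs6054257\n")

with SVIO("/tmp/demo.tsv", title_starter="#") as reader:
    print(reader.titles)  # ['CHROM', 'POS', 'ID'] (starter stripped)
    for record in reader:  # each record is a list of field strings
        print(record)  # ['20', '14370', 'rs6054257']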
Example #3
def loadFromDepthFile(in_path, samples):
    """
    Load depths classes and count from samtools depth output.

    :param in_path: Path to the samtools depth output.
    :type in_path: str
    :param samples: The list of samples names in order of depths columns.
    :type samples: list
    :return: The list of depths and by sample the list of counts.
    :rtype: list, dict
    """
    encountered_depths = set()
    with SVIO(in_path, has_title=False) as FH_depths:
        count_by_spl = {curr_spl: dict() for curr_spl in samples}
        for record in FH_depths:  # record = [chr, pos, deph_spl_1, ..., depth_spl_n]
            for spl_idx, curr_spl in enumerate(samples):
                depth = int(record[spl_idx + 2])
                encountered_depths.add(depth)
                count_by_spl[curr_spl][depth] = count_by_spl[curr_spl].get(depth, 0) + 1
    depths_list = sorted(encountered_depths)
    for spl in samples:
        count_by_spl[spl] = [count_by_spl[spl].get(depth, 0) for depth in depths_list]
    return depths_list, count_by_spl
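A hypothetical call site for loadFromDepthFile (the depth file and sample names are invented; the function and its SVIO dependency are assumed importable):

# samtools depth output: CHROM, POS, then one depth column per sample.
with open("/tmp/depths.tsv", "w") as handle:
    handle.write("chr1\t100\t10\t12\n")
    handle.write("chr1\t101\t10\t15\n")
    handle.write("chr1\t102\t8\t12\n")

depths, counts = loadFromDepthFile("/tmp/depths.tsv", ["splA", "splB"])
print(depths)  # [8, 10, 12, 15]
print(counts["splA"])  # [1, 2, 0, 0]: one position at depth 8, two at depth 10
print(counts["splB"])  # [0, 0, 2, 1]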
Example #4
 def testAppendDSV(self):
     split_limit = 3  # Number of rows in the first write
     # 1st write step
     with SVIO(self.tmp_out_dsv, "a", separator=".", title_starter="#") as FH_out:
         FH_out.titles = self.data["titles"]
         for row in self.data["rows"][:split_limit]:
             FH_out.write(row)
     # Assert result
     with open(self.tmp_out_dsv) as FH_in:
         nb_rows = -1
         for row_idx, readed_row in enumerate(FH_in):
             nb_rows += 1
             expected_row = ""
             if row_idx == 0:
                 expected_row = "#" + ".".join(self.data["titles"]) + "\n"
             else:
                 expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
             self.assertEqual(expected_row, readed_row)
         self.assertEqual(split_limit, nb_rows)
     # 2nd write step
     with SVIO(self.tmp_out_dsv, "a", separator=".", title_starter="#") as FH_out:
         self.assertEqual(self.data["titles"], FH_out.titles)  # Assert titles retrieval
         self.assertEqual(split_limit + 1, FH_out.current_line_nb)
         for row in self.data["rows"][split_limit:]:
             FH_out.write(row)
     # Assert result
     with open(self.tmp_out_dsv) as FH_in:
         nb_rows = -1
         for row_idx, readed_row in enumerate(FH_in):
             nb_rows += 1
             expected_row = ""
             if row_idx == 0:
                 expected_row = "#" + ".".join(self.data["titles"]) + "\n"
             else:
                 expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
             self.assertEqual(expected_row, readed_row)
         self.assertEqual(len(self.data["rows"]), nb_rows)
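A condensed sketch of the append semantics asserted above (hypothetical path, assumed import path): reopening in "a" re-reads the existing header, so the titles and the current line number are available before anything new is written.

from anacore.sv import SVIO  # assumed import path

with SVIO("/tmp/demo.dsv", "w", separator=".", title_starter="#") as writer:
    writer.titles = ["CHROM", "POS"]
    writer.write(["20", "14370"])

with SVIO("/tmp/demo.dsv", "a", separator=".", title_starter="#") as writer:
    assert writer.titles == ["CHROM", "POS"]  # recovered from the file
    assert writer.current_line_nb == 2  # title line plus one record
    writer.write(["20", "17330"])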
Example #5
 def testWriteDSV(self):
     # Write file
     with SVIO(self.tmp_out_dsv, "w", separator=".", title_starter="#") as FH_out:
         FH_out.titles = self.data["titles"]
         for row in self.data["rows"]:
             FH_out.write(row)
     # Assert result
     with open(self.tmp_out_dsv) as FH_in:
         for row_idx, readed_row in enumerate(FH_in):
             expected_row = ""
             if row_idx == 0:
                 expected_row = "#" + ".".join(self.data["titles"]) + "\n"
             else:
                 expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
             self.assertEqual(expected_row, readed_row)
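The assertion loop re-reads the file with a raw open(); an equivalent round-trip check can go through SVIO itself (hypothetical path, assumed import path):

from anacore.sv import SVIO  # assumed import path

with SVIO("/tmp/demo.dsv", "w", separator=".", title_starter="#") as writer:
    writer.titles = ["CHROM", "POS"]
    writer.write(["20", "14370"])

with SVIO("/tmp/demo.dsv", separator=".", title_starter="#") as reader:
    assert reader.titles == ["CHROM", "POS"]
    assert reader.read() == [["20", "14370"]]  # read() returns all records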
Example #6
 def testAppendCompressedEmptyDSV(self):
     # Create an empty file
     with gzip.open(self.tmp_out_dsv_gz, "wt") as FH_out:
         pass
     # Write file
     with SVIO(self.tmp_out_dsv_gz, "a", separator=".", title_starter="#") as FH_out:
         FH_out.titles = self.data["titles"]
         for row in self.data["rows"]:
             FH_out.write(row)
     # Assert result
     nb_rows = -1
     with gzip.open(self.tmp_out_dsv_gz, "rt") as FH_in:
         for row_idx, readed_row in enumerate(FH_in):
             nb_rows += 1
             expected_row = ""
             if row_idx == 0:
                 expected_row = "#" + ".".join(self.data["titles"]) + "\n"
             else:
                 expected_row = ".".join(self.data["rows"][row_idx - 1]) + "\n"
             self.assertEqual(expected_row, readed_row)
     self.assertEqual(len(self.data["rows"]), nb_rows)
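The test implies that SVIO selects gzip handling from the ".gz" extension. A minimal sketch of that inferred behavior (hypothetical path, assumed import path):

import gzip

from anacore.sv import SVIO  # assumed import path

with SVIO("/tmp/demo.dsv.gz", "w", separator=".", title_starter="#") as writer:
    writer.titles = ["CHROM", "POS"]
    writer.write(["20", "14370"])

with gzip.open("/tmp/demo.dsv.gz", "rt") as handle:  # plain gzip on disk
    print(handle.read())  # "#CHROM.POS\n20.14370\n"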
Example #7
 def testIsValidTrue(self):
     # Create one valid DSV file
     rows = deepcopy(self.data["rows"])
     rows.insert(0, self.data["titles"])
     rows.extend([
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""],
         ["1", "1234567", "microsat1", "GTC", "G,GTCT", "50", ""]
     ])
     with open(self.tmp_out_dsv, "w") as FH_out:
         for curr_row in rows:
             FH_out.write(".".join(curr_row) + "\n")
     # Check isValid
     self.assertTrue(SVIO.isValid(self.tmp_out_dsv, "."))
     # Check that the file pointer is ok after reopening
     with open(self.tmp_out_dsv) as FH_in:
         for row_idx, readed_row in enumerate(FH_in):
             expected_row = ".".join(rows[row_idx]) + "\n"
             self.assertEqual(expected_row, readed_row)
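Read together with testIsValidWithMetadata, the invariant appears to be that every body line splits into the same number of fields for the given separator. A tiny sketch of that inferred contract (hypothetical path, assumed import path):

from anacore.sv import SVIO  # assumed import path

with open("/tmp/demo.dsv", "w") as handle:
    handle.write("1.1234567.microsat1\n1.1234568.microsat2\n")  # 3 fields each

print(SVIO.isValid("/tmp/demo.dsv", "."))  # expected: True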
Example #8
        help='Path to saturation file (format: separated values).')
    args = parser.parse_args()

    # Logger
    logging.basicConfig(
        format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
    )
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Get samples
    samples = args.samples_names
    if args.samples_names is None:
        with SVIO(args.input_counts, has_title=True) as reader:
            samples = reader.titles[1:]

    # Process
    random.seed(args.random_seed)
    with SVIO(args.output_saturation, "w",
              separator=args.fields_separator) as writer:
        writer.titles = ["sample", "nb_sampled", "nb_unique"]
        for spl_idx, spl_name in enumerate(samples):
            log.info("Process sample {}".format(spl_name))
            with SVIO(args.input_counts,
                      separator=args.fields_separator,
                      has_title=(args.samples_names is None)) as reader:
                # Get counts
                total_count = 0
                curr_idx = 0
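The fragment stops just as the per-sample loop begins. Below is a self-contained sketch of the saturation (rarefaction) computation it sets up: draw the observations in random order and record how many distinct features have been seen after each draw. Every name here (saturation_curve, counts_by_feature, nb_points) is hypothetical; only the general technique is implied by the fragment.

import random

def saturation_curve(counts_by_feature, nb_points=10):
    """Return (nb_sampled, nb_unique) pairs from a {feature: count} dict."""
    observations = []
    for feature, count in counts_by_feature.items():
        observations.extend([feature] * count)
    random.shuffle(observations)  # random draw order
    step = max(1, len(observations) // nb_points)
    seen = set()
    curve = []
    for idx, feature in enumerate(observations, start=1):
        seen.add(feature)
        if idx % step == 0 or idx == len(observations):
            curve.append((idx, len(seen)))
    return curve

random.seed(42)
print(saturation_curve({"featA": 5, "featB": 2, "featC": 1}, nb_points=4))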
Example #9
 def testWriteHeader(self):
     # With titles and metadata
     with SVIO(self.tmp_out_tsv, "w") as writer:
         writer.titles = self.data["titles"]
         writer.metadata = self.data["metadata"]
         writer.writeHeader()
         writer.write(self.data["rows"][0])
     with SVIO(self.tmp_out_tsv, "r") as reader:
         self.assertEqual(reader.titles, self.data["titles"])
         self.assertEqual(reader.metadata, self.data["metadata"])
         self.assertEqual(reader.read(), [self.data["rows"][0]])
     # Without metadata
     with SVIO(self.tmp_out_tsv, "w") as writer:
         writer.titles = self.data["titles"]
         writer.writeHeader()
         writer.write(self.data["rows"][0])
     with SVIO(self.tmp_out_tsv, "r") as reader:
         self.assertEqual(reader.titles, self.data["titles"])
         self.assertEqual(reader.metadata, [])
         self.assertEqual(reader.read(), [self.data["rows"][0]])
     # Without titles
     with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
         writer.metadata = self.data["metadata"]
         writer.writeHeader()
         writer.write(self.data["rows"][0])
     with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
         self.assertEqual(reader.titles, None)
         self.assertEqual(reader.metadata, self.data["metadata"])
         self.assertEqual(reader.read(), [self.data["rows"][0]])
     # Without titles and metadata
     with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
         writer.writeHeader()
         writer.write(self.data["rows"][0])
     with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
         self.assertEqual(reader.titles, None)
         self.assertEqual(reader.metadata, [])
         self.assertEqual(reader.read(), [self.data["rows"][0]])
     # Only header with titles and metadata
     with SVIO(self.tmp_out_tsv, "w") as writer:
         writer.titles = self.data["titles"]
         writer.metadata = self.data["metadata"]
         writer.writeHeader()
     with SVIO(self.tmp_out_tsv, "r") as reader:
         self.assertEqual(reader.titles, self.data["titles"])
         self.assertEqual(reader.metadata, self.data["metadata"])
     # Only header without metadata
     with SVIO(self.tmp_out_tsv, "w") as writer:
         writer.titles = self.data["titles"]
         writer.writeHeader()
     with SVIO(self.tmp_out_tsv, "r") as reader:
         self.assertEqual(reader.titles, self.data["titles"])
         self.assertEqual(reader.metadata, [])
     # Only header without titles
     with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
         writer.metadata = self.data["metadata"]
         writer.writeHeader()
     with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
         self.assertEqual(reader.titles, None)
         self.assertEqual(reader.metadata, self.data["metadata"])
     # Empty without titles and metadata
     with SVIO(self.tmp_out_tsv, "w", has_title=False) as writer:
         writer.writeHeader()
     with SVIO(self.tmp_out_tsv, "r", has_title=False) as reader:
         self.assertEqual(reader.titles, None)
         self.assertEqual(reader.metadata, [])
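A compact round-trip sketch of the header API exercised above (hypothetical path and values, assumed import path; representing metadata as a list of strings is an assumption consistent with the empty-list default asserted in the test):

from anacore.sv import SVIO  # assumed import path

with SVIO("/tmp/demo.tsv", "w") as writer:
    writer.titles = ["CHROM", "POS"]
    writer.metadata = ["fileformat=demo"]  # hypothetical metadata line
    writer.writeHeader()
    writer.write(["20", "14370"])

with SVIO("/tmp/demo.tsv", "r") as reader:
    assert reader.titles == ["CHROM", "POS"]
    assert reader.metadata == ["fileformat=demo"]
    assert reader.read() == [["20", "14370"]]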
Example #10
     args.work_folder, "out_dataset-{}".format(dataset_id))
 # Update the submission limit in the app config
 with open(app_config_bck) as FH_in:
     with open(app_config, "w") as FH_out:
         for line in FH_in:
             FH_out.write(
                 line.replace(
                     "limit_submission = 100",
                     "limit_submission = " + str(nb_jobs)))
 start_time = time.time()
 predict(test_samples, design_folder, baseline_path,
         models_path, out_folder, args)
 end_time = time.time()
 # Write metrics for this dataset
 out_mode = "w" if is_first else "a"
 with SVIO(args.output_metrics, out_mode) as FH_out:
     FH_out.titles = [
         "datatset_id", "nb_loci", "nb_spl", "median_nb_nt",
         "sum_nb_nt", "median_nb_reads", "sum_nb_reads",
         "nb_jobs", "exec_time"
     ]
     FH_out.write([
         dataset_id,
         len(loci), nb_spl,
         median([spl["nb_nt"] for spl in test_samples]),
         sum([spl["nb_nt"] for spl in test_samples]),
         median([spl["nb_reads"] for spl in test_samples]),
         sum([spl["nb_reads"] for spl in test_samples]),
         nb_jobs, (end_time - start_time) / 60
     ])
 is_first = False
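The metrics row relies on a median() helper whose import lies outside the fragment; the standard library's statistics.median fits the usage (an assumption, since the fragment's imports are not shown):

from statistics import median

# Invented sample records mirroring the fields used above.
test_samples = [{"nb_nt": 100, "nb_reads": 10}, {"nb_nt": 300, "nb_reads": 30}]
print(median(spl["nb_nt"] for spl in test_samples))  # 200.0
print(sum(spl["nb_reads"] for spl in test_samples))  # 40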
Example #11
        'chr13': '13',
        'chr14': '14',
        'chr15': '15',
        'chr16': '16',
        'chr17': '17',
        'chr18': '18',
        'chr19': '19',
        'chr20': '20',
        'chr21': '21',
        'chr22': '22',
        'chrX': 'X',
        'chrY': 'Y',
        'chrM': 'MT'
    }
    if args.input_names:
        with SVIO(args.input_names, "r", separator="\t",
                  has_title=False) as reader:
            for record in reader:
                new_names[record[0]] = record[1]

    # Process
    with VCFIO(args.output_variants, "w") as writer:
        with VCFIO(args.input_variants, "r") as reader:
            # Header
            writer.copyHeader(reader)
            for idx, curr_header in enumerate(writer.extra_header):
                if curr_header.startswith("##contig"):
                    content = getHeaderAttr(curr_header)
                    old_id = content.id
                    if content.id in new_names:
                        new_id = new_names[old_id]
                        writer.extra_header[idx] = curr_header.replace(