def flexible_hic_saver(mat, out_prefix, frags=None, chroms=None, hic_fmt="graal"):
    """
    Saves objects to the desired Hi-C file format.

    Parameters
    ----------
    mat : scipy.sparse.coo_matrix
        Hi-C contact map to save.
    out_prefix : str
        Prefix of the output path(s). A format-specific suffix is appended:
        ".mat.tsv", ".frag.tsv" and ".chr.tsv" for graal, ".cool" for cool
        and ".bg2" for bg2.
    frags : pandas.DataFrame or None
        Table of fragments informations. Optional for graal, required for
        the cool and bg2 formats.
    chroms : pandas.DataFrame or None
        Table of chromosomes / contigs informations. Only written in graal
        format.
    hic_fmt : str
        Output format. Can be one of graal for graal-compatible COO format,
        bg2 for 2D bedgraph format, or cool for cooler compatible format.

    Raises
    ------
    ValueError
        If hic_fmt is not a known format, or if frags is None while the
        requested format (cool or bg2) needs it.
    """
    if hic_fmt == "graal":
        save_sparse_matrix(mat, out_prefix + ".mat.tsv")
        # Annotation tables are optional in graal format: anything without a
        # to_csv method (typically None) is skipped with a warning.
        try:
            frags.to_csv(out_prefix + ".frag.tsv", sep="\t", index=False)
        except AttributeError:
            logger.warning(
                "Could not create fragments_list.txt from input files")
        try:
            chroms.to_csv(out_prefix + ".chr.tsv", sep="\t", index=False)
        except AttributeError:
            logger.warning(
                "Could not create info_contigs.txt from input files")
    elif hic_fmt == "cool":
        if frags is None:
            raise ValueError("frags is required to save a cool file")
        frag_sizes = np.asarray(frags.end_pos - frags.start_pos)
        # Median absolute deviation of bin sizes. The absolute value matters:
        # the median of signed deviations from the median is always zero.
        size_mad = np.median(np.abs(frag_sizes - np.median(frag_sizes)))
        bin_type = "variable" if size_mad else "fixed"
        save_cool(
            out_prefix + ".cool",
            mat,
            frags,
            metadata={"hicstuff": __version__, "bin-type": bin_type},
        )
    elif hic_fmt == "bg2":
        if frags is None:
            raise ValueError("frags is required to save a bg2 file")
        save_bedgraph2d(mat, frags, out_prefix + ".bg2")
    else:
        raise ValueError("Unknown output format: {0}".format(hic_fmt))
def normalize_distance_law(xs, ps, inf=3000, sup=None):
    """Normalize distance laws so they can be compared with each other.

    Each p(s) is divided by the sum of its own values taken on the bins whose
    distance lies strictly between 'inf' (default 3kb) and 'sup'. This limits
    the effect of coverage when comparing conditions/chromosomes/arms. When
    several p(s) are given, only the first N bins contribute to the sum,
    where N is the length of the shortest object.

    The input arrays are left untouched.

    Parameters
    ----------
    xs : list of numpy.ndarray
        list of logbins corresponding to the ps.
    ps : list of numpy.ndarray
        Average ps or list of ps of the chromosomes/arms. xs and ps have to
        have the same shape.
    inf : integer
        Inferior value of the interval on which the normalization is applied.
    sup : integer
        Superior value of the interval on which the normalization is applied.
        No upper bound if None.

    Returns
    -------
    list of numpy.ndarray :
        List of ps each normalized separately.
    """
    # Sanity check: xs and ps have the same dimension. Lengths are compared
    # element by element so that ragged lists (chromosomes of different
    # lengths) are supported.
    if len(xs) != len(ps) or any(len(x) != len(p) for x, p in zip(xs, ps)):
        logger.error("xs and ps should have the same dimension.")
        sys.exit(1)
    if len(ps) == 0:
        return []
    # Length of the shortest chromosome: bins beyond it never contribute
    min_xs = min(len(chrom_xs) for chrom_xs in xs)
    if sup is None:
        sup = np.inf
    normed_ps = []
    for chrom_xs, chrom_ps in zip(xs, ps):
        chrom_xs = np.asarray(chrom_xs)
        # Work on a copy so the caller's array is not modified
        chrom_ps = np.array(chrom_ps, dtype=float)
        # Change the last value to have something continuous because the last
        # one is much bigger (computed on matrix corner = triangle instead of
        # trapezoid).
        if chrom_ps.size > 1:
            chrom_ps[-1] = chrom_ps[-2]
        # Bins used to compute the normalization factor
        in_window = (chrom_xs > inf) & (chrom_xs < sup)
        in_window[min_xs:] = False
        chrom_sum = chrom_ps[in_window].sum()
        if chrom_sum == 0:
            chrom_sum = 1
            logger.warning("No values of p(s) in one segment")
        normed_ps.append(chrom_ps / chrom_sum)
    return normed_ps
def normalize_distance_law(xs, ps, inf=3000, sup=None):
    """Normalize distance laws so they can be compared with each other.

    Each p(s) is divided by the sum of its own values taken on the bins whose
    distance is strictly above 'inf' (default 3kb) and, if given, strictly
    below 'sup'. This limits the effect of coverage when comparing
    conditions/chromosomes/arms. When several p(s) are given, only the first
    N bins contribute to the sum, where N is the length of the shortest
    object.

    The input arrays are left untouched.

    Parameters
    ----------
    xs : list of numpy.ndarray
        list of logbins corresponding to the ps.
    ps : list of numpy.ndarray
        Average ps or list of ps of the chromosomes/arms. xs and ps have to
        have the same shape.
    inf : integer
        Inferior value of the interval on which the normalization is applied.
    sup : integer or None
        Optional superior value of the interval on which the normalization is
        applied. With the default (None) there is no upper bound.

    Returns
    -------
    list of numpy.ndarray :
        List of ps each normalized separately.
    """
    # Sanity check: xs and ps have the same dimension. Lengths are compared
    # element by element so that ragged lists (objects of different lengths)
    # are supported.
    if len(xs) != len(ps) or any(len(x) != len(p) for x, p in zip(xs, ps)):
        logger.error("xs and ps should have the same dimension.")
        sys.exit(1)
    if len(ps) == 0:
        return []
    # Length of the shortest object: bins beyond it never contribute
    min_xs = min(len(one_xs) for one_xs in xs)
    upper = np.inf if sup is None else sup
    normed_ps = []
    for one_xs, one_ps in zip(xs, ps):
        one_xs = np.asarray(one_xs)
        # Work on a copy so the caller's array is not modified
        values = np.array(one_ps, dtype=float)
        # Change the last value to have something continuous because the last
        # one is much bigger.
        if values.size > 1:
            values[-1] = values[-2]
        # Keep only the bins between the bounds and within the length of the
        # shortest object given in the list
        selected = (one_xs > inf) & (one_xs < upper)
        selected[min_xs:] = False
        sum_values = values[selected].sum()
        if sum_values == 0:
            sum_values = 1
            logger.warning("No values of p(s) in one segment")
        normed_ps.append(values / sum_values)
    return normed_ps
def to_dade_matrix(M, annotations="", filename=None):
    """Returns a Dade matrix from input numpy matrix.

    The result has one extra leading row and column holding the annotations
    (header), with the contact values in the remaining cells. If filename is
    provided and valid, said matrix is also saved as tab-separated text.

    Parameters
    ----------
    M : numpy.ndarray
        2D contact matrix.
    annotations : sequence or empty
        One label per bin, used both as the header row and the header
        column. When empty (the default), blank labels are used.
    filename : str or None
        If given, path where the Dade matrix is written.

    Returns
    -------
    numpy.ndarray :
        Object array of shape (n + 1, m + 1). Cell [0, 0] is an empty string,
        row 0 and column 0 hold the labels, the rest holds the values of M.

    Raises
    ------
    ValueError
        If annotations are given but their number does not match both
        dimensions of M.
    """
    dense = np.asarray(M)
    n, m = dense.shape
    if annotations is None or len(annotations) == 0:
        row_labels = [""] * n
        col_labels = [""] * m
    else:
        labels = list(annotations)
        if len(labels) != n or len(labels) != m:
            raise ValueError(
                "Expected one annotation per bin of a square matrix, got "
                "{0} annotations for a {1}x{2} matrix.".format(
                    len(labels), n, m))
        row_labels = col_labels = labels
    # Object dtype is needed to hold text labels next to numeric values
    A = np.empty((n + 1, m + 1), dtype=object)
    A[0, 0] = ""
    A[0, 1:] = col_labels
    A[1:, 0] = row_labels
    A[1:, 1:] = dense
    if filename:
        try:
            table = A.copy()
            if dense.size:
                # Contacts are written as integers; "%i" raises ValueError on
                # values such as NaN, which is reported below.
                table[1:, 1:] = [["%i" % value for value in row]
                                 for row in dense]
            # Tab delimiter: keeps blank labels parseable and matches the
            # tab-split parsing done by dade_to_graal.
            np.savetxt(filename, table, fmt="%s", delimiter="\t")
            logger.info("I saved input matrix in dade format as {0}".format(
                str(filename)))
        except ValueError as e:
            logger.warning("I couldn't save input matrix.")
            logger.warning(str(e))
    return A
def _last_bin_starting_before(starts, limit):
    """Index of the last fragment whose start is below limit (0 if none)."""
    hits = np.flatnonzero(starts < limit)
    return hits[-1] if hits.size else 0


def get_chr_segment_bins_index(fragments, centro_file=None, rm_centro=0):
    """Get the index positions of the start and end bins of different
    chromosomes, or arms if the centromers position have been given from the
    fragments file made by hicstuff.

    Parameters
    ----------
    fragments : pandas.DataFrame
        Table containing in the first column the ID of the fragment, in the
        second the names of the chromosome, in the third and fourth the start
        position and the end position of the fragment. (File like the
        'fragments_list.txt' from hicstuff). The start positions are read
        from the "start_pos" column when present, from the third column
        otherwise.
    centro_file : None or str
        None or path to a file with the genomic positions of the centromers
        sorted as the chromosomes separated by a space. The file have only
        one line.
    rm_centro : int
        Distance subtracted from / added to each centromere position to
        define the two arm boundaries around it. Default is zero.
        NOTE(review): this value is applied as-is to positions; confirm
        whether callers pass base pairs or kb.

    Returns
    -------
    list of floats :
        The start and end indices of chromosomes/arms to compute the distance
        law on each chromosome/arm separately.
    """
    # Only the start column identifies chromosome starts: a zero in any other
    # column (e.g. a GC content of 0) must not be counted as a new chromosome.
    if "start_pos" in fragments.columns:
        starts = fragments["start_pos"].to_numpy()
    else:
        starts = fragments.iloc[:, 2].to_numpy()
    # Get bins where chromosomes start
    chr_start_bins = np.flatnonzero(starts == 0)
    # Each chromosome ends where the next one starts; the last one ends with
    # the table.
    chr_end_bins = np.zeros(len(chr_start_bins))
    chr_end_bins[:-1] = chr_start_bins[1:]
    chr_end_bins[-1] = len(starts)
    # Combine start and end of bins in a single array. Values are the id of
    # the bins
    chr_segment_bins = np.sort(np.concatenate((chr_start_bins, chr_end_bins)))

    centro_pos = None
    if centro_file is not None:
        # Read the file of the centromers
        with open(centro_file, "r", newline="") as centro:
            centro_pos = next(csv.reader(centro, delimiter=" "))
        # Sanity check: as many chroms as centromeres
        if len(chr_start_bins) != len(centro_pos):
            logger.warning(
                "Number of chromosomes and centromeres differ, centromeres position are not taking into account."
            )
            centro_pos = None

    if centro_pos is not None:
        # Get bins of centromeres
        centro_bins = np.zeros(2 * len(centro_pos))
        for i, first_bin in enumerate(chr_start_bins):
            chrom_starts = starts[first_bin:int(chr_end_bins[i])]
            centro = int(centro_pos[i])
            # index of last fragment starting before centro in same chrom
            centro_bins[2 * i] = first_bin + _last_bin_starting_before(
                chrom_starts, centro - rm_centro)
            centro_bins[2 * i + 1] = first_bin + _last_bin_starting_before(
                chrom_starts, centro + rm_centro)
        # Combine centro and chrom bins into a single array. Values are the id
        # of the bins started and ending the arms.
        chr_segment_bins = np.sort(
            np.concatenate((chr_start_bins, chr_end_bins, centro_bins)))
    return list(chr_segment_bins)
def _read_optional_table(path, arg_name, out_name, quiet):
    """Read an optional tab-separated table, returning None when unavailable."""
    if path is None:
        if not quiet:
            logger.warning(
                "{0} was not provided when "
                "loading a matrix in COO/graal format. {1} will be None."
                .format(arg_name, out_name))
        return None
    try:
        return pd.read_csv(path, sep="\t")
    except ValueError as err:
        # pandas parsing errors derive from ValueError: keep the loader
        # best-effort, but report the actual reason.
        if not quiet:
            logger.warning(
                "{0} could not be parsed ({1}). {2} will be None.".format(
                    arg_name, err, out_name))
        return None


def flexible_hic_loader(mat, fragments_file=None, chroms_file=None, quiet=False):
    """
    Wrapper function to load COO, bg2 or cool input and return the same
    output. COO formats requires fragments_file and chroms_file options. bg2
    format can infer bin_size if fixed. When providing a bg2 matrix with
    uneven fragments length, one should provide fragments_file as well or
    empty bins will be truncated from the output.

    Parameters
    ----------
    mat : str
        Path to the matrix in graal, bedgraph2 or cool format.
    fragments_file : str or None
        Path to the file with fragments information (fragments_list.txt).
        Only required if the matrix is in graal format.
    chroms_file : str or None
        Path to the file with chromosome information (info_contigs.txt).
        Only required if the matrix is in graal format.
    quiet : bool
        If True, will silence warnings for empty outputs.

    Returns
    -------
    mat : scipy.sparse.coo_matrix
        Sparse upper triangle Hi-C matrix.
    frags : pandas.DataFrame or None
        Table of fragment informations. None if information was not provided.
    chroms : pandas.DataFrame or None
        Table of chromosomes/contig information. None if information was not
        provided.

    Raises
    ------
    ValueError
        If the detected format is not one of cool, bg2 or graal.
    """
    hic_format = get_hic_format(mat)
    if hic_format == "cool":
        mat, frags, chroms = load_cool(mat)
    elif hic_format == "bg2":
        if fragments_file is not None:
            # Use the frags file to define bins if available
            mat, frags, chroms = load_bedgraph2d(
                mat, fragments_file=fragments_file)
        else:
            # Guess if bin size is fixed based on MAD. The bg2 file is read
            # without header (as load_bedgraph2d does) so that its first
            # record is not consumed as column names.
            bg2 = pd.read_csv(mat, sep="\t", header=None)
            sizes = np.array(bg2.iloc[:, 2] - bg2.iloc[:, 1])
            size_mad = ss.median_abs_deviation(sizes, scale="normal")
            if size_mad > 0:
                # Use only the bg2
                mat, frags, chroms = load_bedgraph2d(mat)
                logger.warning(
                    "Input is a bedgraph2d file with uneven bin size, "
                    "but no fragments_file was provided. Empty bins will "
                    "be missing from the output. To avoid this, provide a "
                    "fragments file.")
            else:
                # Use fixed bin size
                mat, frags, chroms = load_bedgraph2d(
                    mat, bin_size=int(np.median(sizes)))
    elif hic_format == "graal":
        mat = load_sparse_matrix(mat)
        frags = _read_optional_table(
            fragments_file, "fragments_file", "frags", quiet)
        chroms = _read_optional_table(
            chroms_file, "chroms_file", "chroms", quiet)
    else:
        raise ValueError(
            "Unsupported Hi-C matrix format: {0}".format(hic_format))

    # Ensure the matrix is upper triangle symmetric
    if mat.shape[0] == mat.shape[1]:
        if (abs(mat - mat.T) > 1e-10).nnz > 0:
            # Not symmetric: fold the lower triangle onto the upper one
            mat = mat + tril(mat, k=-1).T
        mat = triu(mat, format="coo")

    return mat, frags, chroms
def load_bedgraph2d(filename, bin_size=None, fragments_file=None):
    """
    Loads matrix and fragment information from a 2D bedgraph file. Note this
    function assumes chromosomes are ordered in alphabetical order.

    Parameters
    ----------
    filename : str
        Path to the bedgraph2D file.
    bin_size : int
        The size of bins in the case of fixed bin size.
    fragments_file : str
        Path to a fragments file to explicitly provide fragments positions.
        If the matrix does not have fixed bin size, this prevents errors.
        Takes precedence over bin_size when both are given.

    Returns
    -------
    mat : scipy.sparse.coo_matrix
        The Hi-C contact map built from the bedgraph records, in sparse
        format.
    frags : pandas.DataFrame
        The list of fragments/bin present in the matrix with their genomic
        positions.
    chroms : pandas.DataFrame
        One row per chromosome with columns contig, length, n_frags and
        cumul_length (index of the first bin of the chromosome).
    """
    bed2d = pd.read_csv(filename, sep="\t", header=None)
    chrom_sizes = {}
    if bin_size is not None:
        # If bin size is provided, retrieve chromosome lengths (largest end
        # coordinate seen on either side), this will be used when
        # regenerating bin coordinates
        ends_a = bed2d[[0, 2]]
        ends_b = bed2d[[3, 5]].rename(columns={3: 0, 5: 2})
        max_ends = pd.concat([ends_a, ends_b]).groupby(0, sort=False)[2].max()
        chrom_sizes = {chrom: int(size) for chrom, size in max_ends.items()}
    elif fragments_file is None:
        logger.warning(
            "Please be aware that not all information can be restored from a "
            "bg2 file without fixed bin size; fragments without any contact "
            "will be lost")

    # Names of the bins of each contact: (chromosome, start). Chromosome
    # names are compared as strings in every mapping below.
    frag_pos_a = list(zip(bed2d[0].astype(str), bed2d[1]))
    frag_pos_b = list(zip(bed2d[3].astype(str), bed2d[4]))

    if fragments_file is not None:
        # If fragments file is provided, use fragments positions to indices
        # mapping
        frags = pd.read_csv(fragments_file, delimiter="\t")
        frag_names = zip(frags["chrom"].astype(str), frags["start_pos"])
        frag_map = {f_name: f_idx for f_idx, f_name in enumerate(frag_names)}
    elif bin_size is not None:
        # If fixed fragment size available, use it to reconstruct original
        # fragments ID (even if they are absent from the bedgraph file).
        frag_map = {}
        chrom_frags = []
        for chrom, size in chrom_sizes.items():
            prev_frags = len(frag_map)
            starts = np.arange(0, size, bin_size)
            # The last bin is truncated at the chromosome end, so chromosome
            # sizes need not be a multiple of bin_size.
            ends = np.minimum(starts + bin_size, size)
            for bin_id, bin_pos in enumerate(starts.tolist()):
                frag_map[(str(chrom), bin_pos)] = bin_id + prev_frags
            chrom_frags.append(
                pd.DataFrame({
                    "id": np.arange(1, len(starts) + 1),
                    "chrom": np.repeat(chrom, len(starts)),
                    "start_pos": starts,
                    "end_pos": ends,
                }))
        frags = pd.concat(chrom_frags, axis=0).reset_index(drop=True)
        # Same column order as the branch below: size comes after end_pos
        frags["size"] = frags.end_pos - frags.start_pos
    else:
        # If None available, guess fragments indices from bedgraph
        # (potentially wrong)
        # Get all possible fragment chrom-positions into an array
        frag_pos = np.vstack(
            [np.array(bed2d[[0, 1, 2]]), np.array(bed2d[[3, 4, 5]])])
        # Sort by position (least important, col 1)
        frag_pos = frag_pos[frag_pos[:, 1].argsort(kind="mergesort")]
        # Then by chrom (most important, col 0)
        frag_pos = frag_pos[frag_pos[:, 0].argsort(kind="mergesort")]
        # Get unique names for fragments (chrom+pos)
        ordered_frag_pos = (
            pd.DataFrame(frag_pos).drop_duplicates().reset_index(drop=True))
        frag_map = {
            (str(chrom), start): idx
            for idx, (chrom, start) in enumerate(
                zip(ordered_frag_pos[0], ordered_frag_pos[1]))
        }
        frags = ordered_frag_pos.copy()
        frags[3] = frags.iloc[:, 2] - frags.iloc[:, 1]
        frags.insert(loc=0, column="id", value=0)
        frags.id = frags.groupby(0, sort=False).cumcount() + 1
        frags.columns = ["id", "chrom", "start_pos", "end_pos", "size"]

    # Match bin indices to their names
    frag_id_a = np.array([frag_map[name] for name in frag_pos_a])
    frag_id_b = np.array([frag_map[name] for name in frag_pos_b])
    contacts = np.array(bed2d.iloc[:, 6].tolist())
    # Use index to build matrix
    n_frags = len(frag_map)
    mat = coo_matrix((contacts, (frag_id_a, frag_id_b)),
                     shape=(n_frags, n_frags))

    # Get size of each chromosome in basepairs
    chromsizes = frags["end_pos"].astype(np.int64).groupby(
        frags["chrom"], sort=False).max()
    chrom_bins = frags.groupby("chrom", sort=False).size()
    # Shift the number of bins by one chromosome and make it cumulative to
    # get the start bin of each chrom; first one is zero
    chrom_start = chrom_bins.shift(1, fill_value=0).cumsum()
    chroms = pd.DataFrame({
        "contig": chromsizes.index,
        "length": chromsizes.values,
        "n_frags": chrom_bins,
        "cumul_length": chrom_start,
    })
    return mat, frags, chroms
def dade_to_graal(
    filename,
    output_matrix=DEFAULT_SPARSE_MATRIX_FILE_NAME,
    output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME,
    output_frags=DEFAULT_SPARSE_MATRIX_FILE_NAME,
    output_dir=None,
):
    """Convert a matrix from DADE format (https://github.com/scovit/dade)
    to a graal-compatible format. Since DADE matrices contain both fragment
    and contact information all files are generated at the same time.

    Parameters
    ----------
    filename : str
        Path to the tab-separated DADE matrix.
    output_matrix : str
        Path of the sparse matrix file to write.
    output_contigs : str
        Path of the contigs information file to write.
    output_frags : str
        Path of the fragments list file to write.
        NOTE(review): the default is the sparse matrix file name, which looks
        like a copy-paste slip; with the defaults the fragments list
        overwrites the matrix. Pass an explicit value until the default is
        confirmed.
    output_dir : str or None
        If given, output paths without a directory component are written
        inside this directory (created if needed).
    """
    import os

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        # Paths that already carry a directory are left as they are
        output_matrix, output_contigs, output_frags = (
            path if os.path.dirname(path) else os.path.join(output_dir, path)
            for path in (output_matrix, output_contigs, output_frags)
        )
    if os.path.abspath(output_frags) == os.path.abspath(output_matrix):
        logger.warning(
            "output_frags and output_matrix point to the same file ({0}): "
            "the fragments list will overwrite the sparse matrix.".format(
                output_matrix))

    with open(filename) as file_handle, \
            open(output_matrix, "w") as sparse_file:
        # The header needs its own line, otherwise the first contact is
        # appended to it.
        sparse_file.write("id_frag_a\tid_frag_b\tn_contact\n")
        first_line = file_handle.readline()
        for row_index, line in enumerate(file_handle):
            # First field of each row is the fragment annotation
            dense_row = np.array(
                line.rstrip("\n").split("\t")[1:], dtype=np.int32)
            for col_index in np.nonzero(dense_row)[0]:
                sparse_file.write("{}\t{}\t{}\n".format(
                    row_index, col_index, dense_row[col_index]))

    header = first_line.split("\t")
    bin_type = header[0]
    if bin_type == '"RST"':
        logger.info("I detected fragment-wise binning")
    elif bin_type == '"BIN"':
        logger.info("I detected fixed size binning")
    else:
        logger.warning(("Sorry, I don't understand this matrix's "
                        "binning: I read {}".format(str(bin_type))))

    # Each header cell holds five "~"-separated fields describing a fragment
    header_data = [
        header_elt.replace("'", "").replace('"', "").replace("\n",
                                                             "").split("~")
        for header_elt in header[1:]
    ]
    (
        global_frag_ids,
        contig_names,
        local_frag_ids,
        frag_starts,
        frag_ends,
    ) = np.array(list(zip(*header_data)))

    frag_starts = frag_starts.astype(np.int32) - 1
    frag_ends = frag_ends.astype(np.int32) - 1
    frag_lengths = frag_ends - frag_starts
    total_length = len(global_frag_ids)

    # Number of fragments per contig, computed once for all contigs
    frags_per_contig = collections.Counter(contig_names)
    with open(output_contigs, "w") as info_contigs:
        info_contigs.write("contig\tlength\tn_frags\tcumul_length\n")
        cumul_length = 0
        # OrderedDict.fromkeys keeps contigs in order of first appearance
        for contig in collections.OrderedDict.fromkeys(contig_names):
            length_tig = np.sum(frag_lengths[contig_names == contig])
            n_frags = frags_per_contig[contig]
            info_contigs.write("%s\t%s\t%s\t%s\n" % (
                contig,
                length_tig,
                n_frags,
                cumul_length,
            ))
            cumul_length += n_frags

    with open(output_frags, "w") as fragments_list:
        fragments_list.write("id\tchrom\tstart_pos\tend_pos"
                             "\tsize\tgc_content\n")
        # GC content is not available from the DADE file: placeholder value
        bogus_gc = 0.5
        for i in range(total_length):
            fragments_list.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
                int(local_frag_ids[i]) + 1,
                contig_names[i],
                frag_starts[i],
                frag_ends[i],
                frag_lengths[i],
                bogus_gc,
            ))
def sort_pairs(in_file, out_file, keys, tmp_dir=None, threads=1, buffer="2G"):
    """
    Sort a pairs file in batches using UNIX sort.

    Parameters
    ----------
    in_file : str
        Path to the unsorted input file
    out_file : str
        Path to the sorted output file.
    keys : list of str
        list of columns to use as sort keys. Each column can be one of readID,
        chr1, pos1, chr2, pos2, strand1, strand2, frag1, frag2. Key priorities
        are according to the order in the list.
    tmp_dir : str
        Path to the directory where temporary files will be created. Defaults
        to current directory.
    threads : int
        Number of parallel sorting threads.
    buffer : str
        Buffer size used for sorting. Consists of a number and a unit.

    Raises
    ------
    KeyError
        If one of the keys is not a known column name. This is checked before
        anything is written to out_file.
    """
    # TODO: Write a pure python implementation to drop GNU coreutils depencency,
    # could be inspired from: https://stackoverflow.com/q/14465154/8440675
    key_map = {
        "readID": "-k1,1d",
        "chr1": "-k2,2V",
        "pos1": "-k3,3n",
        "chr2": "-k4,4V",
        "pos2": "-k5,5n",
        "strand1": "-k6,6d",
        "strand2": "-k7,7d",
        "frag1": "-k8,8n",
        "frag2": "-k9,9n",
    }
    # transform column names to corresponding sort keys. Built eagerly so an
    # unknown column fails here and not after the output has been started.
    try:
        sort_keys = [key_map[k] for k in keys]
    except KeyError:
        print("Unkown column name.")
        raise

    # Check if UNIX sort version supports parallelism
    parallel_ok = True
    version_proc = sp.Popen(["sort", "--version"], stdout=sp.PIPE)
    version_fields = (version_proc.communicate()[0].decode().split("\n")[0]
                      .split(" ")[-1].split("."))
    # If so, specify threads, otherwise don't mention it in the command line
    try:
        sort_ver = tuple(map(int, version_fields))
        if sort_ver < (8, 23):
            logger.warning(
                "GNU sort version is {0} but >8.23 is required for parallel "
                "sort. Sorting on a single thread.".format(".".join(
                    map(str, sort_ver))))
            parallel_ok = False
    # BSD sort has a different format and will throw error upon parsing. It
    # does not support parallel processes anyway.
    except ValueError:
        logger.warning(
            "Using BSD sort instead of GNU sort, sorting on a single thread.")
        parallel_ok = False

    # Rewrite header with new sorting order
    header = get_pairs_header(in_file)
    with open(out_file, "w") as output:
        for line in header:
            if line.startswith("#sorted"):
                output.write("#sorted: {0}\n".format("-".join(keys)))
            else:
                output.write(line + "\n")

    sort_cmd = ["sort", "-S", str(buffer)] + sort_keys
    if tmp_dir is not None:
        sort_cmd.append("--temporary-directory={0}".format(tmp_dir))
    if parallel_ok:
        sort_cmd.append("--parallel={0}".format(threads))

    # Sort pairs and append to file.
    with open(out_file, "a") as output:
        grep_proc = sp.Popen(["grep", "-v", "^#", in_file], stdout=sp.PIPE)
        sort_proc = sp.Popen(sort_cmd, stdin=grep_proc.stdout, stdout=output)
        # Close our copy of the pipe so grep gets SIGPIPE if sort exits early
        grep_proc.stdout.close()
        sort_proc.communicate()
        grep_proc.wait()
    if sort_proc.returncode != 0:
        logger.error("sort exited with status {0}; {1} may be incomplete."
                     .format(sort_proc.returncode, out_file))
def attribute_fragments(pairs_file, idx_pairs_file, restriction_table):
    """
    Writes the indexed pairs file, which has two more columns than the input
    pairs file corresponding to the restriction fragment index of each read.
    Note that pairs files have 1bp point positions whereas restriction table
    has 0bp point positions.

    Pairs with at least one read on a contig that is absent from the pairs
    header ("#chromsize" lines) or from the restriction table are discarded,
    and the contigs concerned are reported in a warning.

    Parameters
    ----------
    pairs_file: str
        Path the the input pairs file. Consists of 7 tab-separated
        columns: readID, chr1, pos1, chr2, pos2, strand1, strand2
    idx_pairs_file: str
        Path to the output indexed pairs file. Consists of 9 tab-separated
        columns: readID, chr1, pos1, chr2, pos2, strand1, strand2, frag1,
        frag2. frag1 and frag2 are 0-based restriction fragments based on
        whole genome.
    restriction_table: dict
        Dictionary with chromosome identifiers (str) as keys and list of
        positions (int) of restriction sites as values.
    """
    # NOTE: Bottlenecks here are 1. binary search in find_frag and 2. writerow
    # 1. could be reduced by searching groups of N frags in parallel and 2. by
    # writing N frags simultaneously using a single call of writerows.

    # Parse and update header section
    pairs_header = hio.get_pairs_header(pairs_file)
    header_size = len(pairs_header)
    chrom_order = []
    with open(idx_pairs_file, "w") as idx_pairs:
        for line in pairs_header:
            # Add new column names to header
            if line.startswith("#columns"):
                line = line.rstrip() + " frag1 frag2"
            if line.startswith("#chromsize"):
                chrom_order.append(line.split()[1])
            idx_pairs.write(line + "\n")

    # Get number of fragments per chrom to allow genome-based indices
    shift_frags = {}
    prev_frags = 0
    for rank, chrom in enumerate(chrom_order):
        if rank > 0:
            # Note the "-1" because there are nfrags + 1 sites in rest table
            prev_frags += len(restriction_table[chrom_order[rank - 1]]) - 1
        # Idx of each chrom's frags will be shifted by n frags in previous
        # chroms
        shift_frags[chrom] = prev_frags

    missing_contigs = set()
    # Define input and output fields
    pairs_cols = [
        "readID",
        "chr1",
        "pos1",
        "chr2",
        "pos2",
        "strand1",
        "strand2",
    ]
    idx_cols = pairs_cols + ["frag1", "frag2"]

    # Attribute pairs to fragments and append them to output file (after
    # header)
    with open(pairs_file, "r") as pairs, \
            open(idx_pairs_file, "a") as idx_pairs:
        # Skip header lines
        for _ in range(header_size):
            next(pairs)
        # Use csv reader / writer to automatically parse columns into a dict
        pairs_reader = csv.DictReader(pairs,
                                      fieldnames=pairs_cols,
                                      delimiter="\t")
        pairs_writer = csv.DictWriter(idx_pairs,
                                      fieldnames=idx_cols,
                                      delimiter="\t")

        for pair in pairs_reader:
            chr1, chr2 = pair["chr1"], pair["chr2"]
            # A pair cannot get genome-based indices if one of its contigs
            # is unknown: discard it instead of writing chromosome-based
            # indices among genome-based ones.
            unknown = [
                chrom for chrom in (chr1, chr2)
                if chrom not in shift_frags or chrom not in restriction_table
            ]
            if unknown:
                missing_contigs.update(unknown)
                continue
            # Get the 0-based indices of corresponding restriction fragments
            # Deducing 1 from pair position to get it into 0bp point.
            # Indices are then shifted to make them genome-based instead of
            # chromosome-based.
            pair["frag1"] = shift_frags[chr1] + find_frag(
                int(pair["pos1"]) - 1, restriction_table[chr1])
            pair["frag2"] = shift_frags[chr2] + find_frag(
                int(pair["pos2"]) - 1, restriction_table[chr2])
            # Write indexed pairs in the new file
            pairs_writer.writerow(pair)

    if missing_contigs:
        logger.warning(
            "Pairs on the following contigs were discarded as "
            "those contigs are not listed in the paris file header. "
            "This is normal if you filtered out small contigs: %s" %
            " ".join(sorted(missing_contigs)))
def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
    """
    Make a .pairs file from two Hi-C bam files sorted by read names.
    The Hi-C mates are matched by read identifier. Pairs where at least one
    reads maps with MAPQ below min_qual threshold are discarded. Pairs are
    sorted by readID and stored in upper triangle: within a pair, the read
    with the smaller (reference id, start position) is written first.

    Parameters
    ----------
    bam1 : str
        Path to the name-sorted BAM file with aligned Hi-C forward reads.
    bam2 : str
        Path to the name-sorted BAM file with aligned Hi-C reverse reads.
    out_pairs : str
        Path to the output .pairs file. Records are tab-separated with
        columns readID, chr1 pos1 chr2 pos2 strand1 strand2
    info_contigs : str
        Path to the info contigs file, to get info on chromosome sizes and
        order.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.
    """
    # Generate header lines
    format_version = "## pairs format v1.0\n"
    sorting = "#sorted: readID\n"
    cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n"

    n_reads = {"total": 0, "mapped": 0}
    # Remember if some read IDs were missing from either file
    unmatched_reads = 0
    # Remember if all reads in one bam file have been read
    exhausted = [False, False]

    # Context managers guarantee both BAM handles are closed on any exit path
    with ps.AlignmentFile(bam1, "rb") as forward, \
            ps.AlignmentFile(bam2, "rb") as reverse:
        # Chromosome order will be identical in info_contigs and pair files
        chroms = pd.read_csv(info_contigs, sep="\t").apply(
            lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1)
        with open(out_pairs, "w") as pairs:
            pairs.writelines([format_version, sorting, cols] +
                             chroms.tolist())
            pairs_writer = csv.writer(pairs, delimiter="\t")
            # Iterate on both BAM simultaneously
            for end1, end2 in itertools.zip_longest(forward, reverse):
                # Check if reads pass filter
                try:
                    end1_passed = end1.mapping_quality >= min_qual
                # Happens if end1 bam file has been exhausted (end1 is None)
                except AttributeError:
                    exhausted[0] = True
                    end1_passed = False
                try:
                    end2_passed = end2.mapping_quality >= min_qual
                # Happens if end2 bam file has been exhausted (end2 is None)
                except AttributeError:
                    exhausted[1] = True
                    end2_passed = False
                # Skip read if mate is not present until they match or reads
                # have been exhausted
                while (sum(exhausted) == 0
                       and end1.query_name != end2.query_name):
                    # Count the read being skipped (single-read iteration).
                    # Its own filter status goes into the mapped count; the
                    # read fetched to replace it is counted later, either
                    # when skipped in turn or with the final pair.
                    unmatched_reads += 1
                    n_reads["total"] += 1
                    if end1.query_name < end2.query_name:
                        n_reads["mapped"] += end1_passed
                        try:
                            end1 = next(forward)
                            end1_passed = end1.mapping_quality >= min_qual
                        # If EOF is reached in BAM 1
                        except (StopIteration, AttributeError):
                            exhausted[0] = True
                            end1_passed = False
                    elif end1.query_name > end2.query_name:
                        n_reads["mapped"] += end2_passed
                        try:
                            end2 = next(reverse)
                            end2_passed = end2.mapping_quality >= min_qual
                        # If EOF is reached in BAM 2
                        except (StopIteration, AttributeError):
                            exhausted[1] = True
                            end2_passed = False

                # 2 reads processed per iteration, unless one file is
                # exhausted
                n_reads["total"] += 2 - sum(exhausted)
                n_reads["mapped"] += sum([end1_passed, end2_passed])
                # Keep only pairs where both reads have good quality
                if end1_passed and end2_passed:
                    # Flipping to get upper triangle
                    if (end1.reference_id == end2.reference_id
                            and end1.reference_start > end2.reference_start
                        ) or end1.reference_id > end2.reference_id:
                        end1, end2 = end2, end1
                    pairs_writer.writerow([
                        end1.query_name,
                        end1.reference_name,
                        end1.reference_start + 1,
                        end2.reference_name,
                        end2.reference_start + 1,
                        "-" if end1.is_reverse else "+",
                        "-" if end2.is_reverse else "+",
                    ])

    if unmatched_reads > 0:
        logger.warning(
            "%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.",
            unmatched_reads,
        )
    # Avoid a division by zero when both BAM files are empty
    if n_reads["total"]:
        perc_map = round(100 * n_reads["mapped"] / n_reads["total"])
    else:
        perc_map = 0
    logger.info(
        "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})"
        .format(
            total=n_reads["total"],
            mapped=n_reads["mapped"],
            perc_map=perc_map,
            qual=min_qual,
        ))
def _build_mapping_command(aligner, ref, index, n_cpu, fastq):
    """Return the argument list running the selected aligner on fastq."""
    cpus = str(n_cpu)
    if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE):
        return ["minimap2", "-x", "sr", "-a", "-t", cpus, str(ref),
                str(fastq)]
    if re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE):
        return ["bwa", "mem", "-t", cpus, "-v", "1", str(index), str(fastq)]
    if re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE):
        return ["bowtie2", "-x", str(index), "-p", cpus, "--quiet",
                "--very-sensitive", str(fastq)]
    raise ValueError("Unknown aligner. Select bowtie2, minimap2 or bwa.")


def _map_and_name_sort(map_cmd, n_cpu, bam_path):
    """Run map_cmd and pipe its output into a name-sorted BAM at bam_path."""
    map_process = sp.Popen(map_cmd, stdout=sp.PIPE)
    sort_process = sp.Popen(
        ["samtools", "sort", "-n", "-@", str(n_cpu), "-O", "BAM", "-o",
         str(bam_path)],
        stdin=map_process.stdout,
    )
    # Close our copy of the pipe so the aligner is notified if samtools exits
    map_process.stdout.close()
    sort_process.communicate()
    map_process.wait()


def iterative_align(
    fq_in,
    tmp_dir,
    ref,
    n_cpu,
    bam_out,
    aligner="bowtie2",
    min_len=20,
    min_qual=30,
    read_len=None,
):
    """Iterative alignment

    Aligns reads of fq_in iteratively with bowtie2, minimap2 or bwa. Reads
    are first truncated to min_len nucleotides; on each iteration, reads that
    did not pass the filter are extended by 20 nucleotides and realigned. A
    last round is then run after the truncation length exceeds read_len, and
    reads still unaligned are written to the output as well.

    Parameters
    ----------
    fq_in : str
        Path to input fastq file to align iteratively.
    tmp_dir : str
        Path where temporary files should be written.
    ref : str
        Path to the reference genome if Minimap2 is used for alignment.
        Path to the index genome if Bowtie2/bwa is used for alignment.
    n_cpu : int
        The number of CPUs to use for the iterative alignment.
    bam_out : str
        Path where the final alignment should be written in BAM format.
    aligner : str
        Choose between minimap2, bwa or bowtie2 for the alignment. The short
        names mm2 / bt2 are accepted as well, case-insensitively.
    min_len : int
        The initial length of the fragments to align.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    read_len : int
        Read length in the fasta file. If set to None, the length of the
        first read is used. Set this value to the longest read length in the
        file if you have different read lengths.

    Returns
    -------
    int :
        Always 0.

    Raises
    ------
    ValueError
        If the aligner name is not recognized.

    Examples
    --------
    iterative_align(fq_in='example_for.fastq', ref='example_bt2_index', bam_out='example_for.bam', aligner="bowtie2")
    iterative_align(fq_in='example_for.fastq', ref='example_genome.fa', bam_out='example_for.bam', aligner="minimap2")
    """
    # set with the name of the unaligned reads :
    remaining_reads = set()
    total_reads = 0
    # Store path of BAM containing aligned reads at each iteration.
    iter_out = []
    # Number of nucleotides added to the truncated reads at each iteration
    step = 20

    # If there is already a file with the same name as the output file,
    # remove it. Otherwise, ignore.
    with contextlib.suppress(FileNotFoundError):
        try:
            os.remove(bam_out)
        except IsADirectoryError:
            logger.error("You need to give the BAM output file, not a folder.")
            raise

    # Bowtie only accepts uncompressed fastq: uncompress it into a temp file
    if aligner == "bowtie2" and hio.is_compressed(fq_in):
        uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp")
        with hio.read_compressed(fq_in) as inf:
            with open(uncomp_path, "w") as uncomp:
                st.copyfileobj(inf, uncomp)
    else:
        uncomp_path = fq_in

    # throw error if index does not exist
    index = hio.check_fasta_index(ref, mode=aligner)
    if index is None:
        logger.error(
            "Reference index is missing, please build the {} index first."
            .format(aligner))
        sys.exit(1)

    # Counting reads (4 lines per fastq record)
    with hio.read_compressed(uncomp_path) as inf:
        for _ in inf:
            total_reads += 1
    total_reads /= 4

    # Use first read to guess read length if not provided.
    if read_len is None:
        with hio.read_compressed(uncomp_path) as inf:
            # Skip first line (read header)
            inf.readline()
            # Stripping newline from sequence line.
            read_len = len(inf.readline().rstrip())

    # initial length of the fragments to align
    # In case reads are shorter than provided min_len
    if read_len > min_len:
        n = min_len
    else:
        logger.warning(
            "min_len is longer than the reads. Iterative mapping will have no effect."
        )
        n = read_len
    logger.info("{0} reads to parse".format(int(total_reads)))

    temp_alignment = join(tmp_dir, "temp_alignment.bam")
    first_round = True
    # iterative alignment per se
    while n <= read_len:
        logger.info(
            "Truncating unaligned reads to {size}bp and mapping{again}.".
            format(size=int(n), again="" if first_round else " again"))
        iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
        # Generate a temporary input fastq file with the n first nucleotids
        # of the reads.
        truncated_reads = truncate_reads(tmp_dir, uncomp_path,
                                         remaining_reads, n, first_round)
        # Align the truncated reads on reference genome
        _map_and_name_sort(
            _build_mapping_command(aligner, ref, index, n_cpu,
                                   truncated_reads),
            n_cpu,
            temp_alignment,
        )
        # filter the reads: the reads whose truncated end was aligned are
        # written to the output file.
        # The reads whose truncated end was not aligned are kept for the next
        # round.
        remaining_reads = filter_bamfile(temp_alignment, iter_out[-1],
                                         min_qual)
        n += step
        first_round = False

    # one last round without trimming
    logger.info("Trying to map unaligned reads at full length ({0}bp).".format(
        int(read_len)))
    truncated_reads = truncate_reads(
        tmp_dir,
        infile=uncomp_path,
        unaligned_set=remaining_reads,
        trunc_len=n,
        first_round=first_round,
    )
    # Same aligner selection as in the loop, so that aliases such as "mm2"
    # do not silently switch aligner for the last round. Reads are kept
    # sorted by name.
    _map_and_name_sort(
        _build_mapping_command(aligner, ref, index, n_cpu, truncated_reads),
        n_cpu,
        temp_alignment,
    )
    iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
    remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)

    # Report unaligned reads as well
    iter_out += [join(tmp_dir, "unaligned.bam")]
    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
    unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam)
    for r in temp_bam:
        # Do not write supplementary alignments (keeping 1 alignment/read)
        if r.query_name in remaining_reads and not r.is_supplementary:
            unmapped.write(r)
    unmapped.close()
    temp_bam.close()

    # Merge all aligned reads and unmapped reads into a single bam
    ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out)
    logger.info("{0} reads aligned / {1} total reads.".format(
        int(total_reads - len(remaining_reads)), int(total_reads)))

    return 0
def full_pipeline(
    genome,
    input1,
    input2=None,
    aligner="bowtie2",
    centromeres=None,
    circular=False,
    distance_law=False,
    enzyme=5000,
    filter_events=False,
    force=False,
    mapping="normal",
    mat_fmt="graal",
    min_qual=30,
    min_size=0,
    no_cleanup=False,
    out_dir=None,
    pcr_duplicates=False,
    plot=False,
    prefix=None,
    read_len=None,
    remove_centros=None,
    start_stage="fastq",
    threads=1,
    tmp_dir=None,
):
    """
    Run the whole hicstuff pipeline. Starting from fastq files and a genome to
    obtain a contact matrix.

    Parameters
    ----------
    genome : str
        Path to the bowtie2/bwa index prefix if using bowtie2/bwa or to the
        genome in fasta format if using minimap2.
    input1 : str
        Path to the Hi-C reads in fastq format (forward), the aligned Hi-C
        reads in BAM format, or the pairs file, depending on the value of
        start_stage.
    input2 : str or None
        Path to the Hi-C reads in fastq format (reverse), the aligned Hi-C
        reads in BAM format, or None, depending on the value of start_stage.
    enzyme : int or str
        Restriction enzyme name, or several names separated by commas. The
        value is converted to a string before use. If it parses as an
        integer, no enzyme is considered to be given: cutsite mapping falls
        back to iterative mapping.
        NOTE(review): an integer is presumably used as a fixed chunk size by
        the digestion helpers; confirm against their documentation.
    circular : bool
        Use if the genome is circular.
    out_dir : str or None
        Path where output files should be written. Current directory by
        default.
    tmp_dir : str or None
        Path where temporary files will be written. Creates a "tmp" folder in
        out_dir by default.
    plot : bool
        Whether plots should be generated at different steps of the pipeline.
        Plots are saved in a "plots" directory inside out_dir.
    min_qual : int
        Minimum mapping quality required to keep a pair of Hi-C reads.
    min_size : int
        Minimum contig size required to keep it.
    threads : int
        Number of threads to use for parallel operations.
    no_cleanup : bool
        If True, temporary files are kept at the end of the pipeline. By
        default they are deleted (files given as input are never deleted).
    mapping : str
        normal|iterative|cutsite. Use normal, iterative or cutsite mapping.
        "normal": Normal alignment.
        "iterative": Truncates and extends reads until unambiguous alignment.
        "cutsite": Digest reads at religation sites and build new pairs from
        the fragments created.
    filter_events : bool
        Filter spurious or uninformative 3C events. Requires a restriction
        enzyme.
    force : bool
        If True, overwrite existing files with the same name as output.
    prefix : str or None
        Choose a common name for output files instead of default graal names.
    start_stage : str
        Step at which the pipeline should start. Can be "fastq", "bam",
        "pairs" or "pairs_idx". Starting from "bam" skips alignment and
        starts from name-sorted bam files. With "pairs", a single pairs file
        is given as input, and with "pairs_idx", the pairs in the input must
        already be attributed to fragments and fragment attribution is
        skipped.
    mat_fmt : str
        Select the output matrix format. Can be either "bg2" for the
        bedgraph2 format, "cool" for Mirnylab's cool format, or graal for a
        plain text COO format compatible with Koszullab's instagraal
        software.
    aligner : str
        Read alignment software to use. Can be either "minimap2", "bwa" or
        "bowtie2".
    pcr_duplicates : bool
        If True, PCR duplicates will be filtered based on genomic positions.
        Pairs where both reads have exactly the same coordinates are
        considered duplicates and only one of those will be conserved.
    distance_law : bool
        If True, generates a distance law file with the values of the
        probabilities to have a contact between two distances for each
        chromosomes or arms if the file with the positions has been given.
        The values are not normalized, or averaged.
    centromeres : None or str
        If not None, path of file with Positions of the centromeres separated
        by a space and in the same order than the chromosomes.
    read_len : int
        Maximum read length to expect in the fastq file. Optionally used in
        iterative alignment mode. Estimated from the first read by default.
        Useful if input fastq is a composite of different read lengths.
    remove_centros : None or int
        If the distance law is computed, this is the number of kb that will
        be removed around the centromere position given by in the centromere
        file.

    Raises
    ------
    ImportError
        If the aligner, samtools, or (for the cool format) the cooler package
        is not available.
    ValueError
        If aligner or mapping has an unsupported value.
    KeyError
        If start_stage is not one of the supported stages.
    IOError
        If a temporary or output file already exists and force is False.
    SystemExit
        On an inconsistent number of input files, an unusable genome input,
        or a failed bowtie2-inspect call.
    """
    # Check if third parties can be run
    if aligner in ("bowtie2", "minimap2", "bwa"):
        # A falsy result (False or None) means the tool is unavailable.
        if not check_tool(aligner):
            logger.error("%s is not installed or not on PATH", aligner)
            raise ImportError(f"{aligner} is required.")
    else:
        logger.error("Incompatible aligner software, choose bowtie2, minimap2 or bwa.")
        raise ValueError("aligner should be either bowtie2, minimap2 or bwa.")
    if not check_tool("samtools"):
        logger.error("samtools is not installed or not on PATH")
        raise ImportError("samtools is required.")
    if mat_fmt == 'cool':
        try:
            # Imported only to verify that the package is available.
            import cooler  # noqa: F401
        except ImportError:
            logger.error(
                "The cooler package is require to return matrix in cool format, please install it first."
            )
            raise ImportError("The cooler package is required.")

    # Pipeline can start from 4 stages, each expecting a different input type
    start_time = datetime.now()
    stages = {"fastq": 0, "bam": 1, "pairs": 2, "pairs_idx": 3}
    try:
        start_stage = stages[start_stage]
    except KeyError:
        # Same exception type as before, but tell the user what went wrong.
        logger.error(
            "Unknown start stage: %s. Choose one of: %s.",
            start_stage,
            ", ".join(stages),
        )
        raise

    # Check if the number of input files is correct
    if start_stage <= 1:
        if input2 is None:
            logger.error(
                "You must provide 2 input files when --start-stage is fastq "
                "or bam."
            )
            sys.exit(1)
    else:
        if input2 is not None:
            logger.error(
                "You must provide a single input file when --start-stage is "
                "pairs or pairs_idx."
            )
            sys.exit(1)

    # sanitize enzyme
    enzyme = str(enzyme)
    # Remember whether fragments_file has been generated during this run
    fragments_updated = False

    if out_dir is None:
        out_dir = os.getcwd()

    if tmp_dir is None:
        tmp_dir = join(out_dir, "tmp")

    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Define figures output paths
    if plot:
        fig_dir = join(out_dir, "plots")
        os.makedirs(fig_dir, exist_ok=True)
        if prefix:
            frag_plot = join(fig_dir, prefix + "_frags_hist.pdf")
            dist_plot = join(fig_dir, prefix + "_event_distance.pdf")
            pie_plot = join(fig_dir, prefix + "_event_distribution.pdf")
            distance_law_plot = join(fig_dir, prefix + "_distance_law.pdf")
        else:
            frag_plot = join(fig_dir, "frags_hist.pdf")
            dist_plot = join(fig_dir, "event_distance.pdf")
            pie_plot = join(fig_dir, "event_distribution.pdf")
            distance_law_plot = join(fig_dir, "distance_law.pdf")
        matplotlib.use("Agg")
    else:
        fig_dir = None
        dist_plot = pie_plot = frag_plot = distance_law_plot = None

    # Use current time for logging and to identify files
    now = time.strftime("%Y%m%d%H%M%S")

    def _checked_path(directory, fname, label):
        """Build a (prefixed) path in directory, refusing to clobber it
        unless force is set."""
        if prefix:
            fname = prefix + "." + fname
        full_path = join(directory, fname)
        if not force and os.path.exists(full_path):
            raise IOError(
                "{} file {} already exists. Use --force to overwrite".format(
                    label, full_path
                )
            )
        return full_path

    def _tmp_file(fname):
        return _checked_path(tmp_dir, fname, "Temporary")

    def _out_file(fname):
        return _checked_path(out_dir, fname, "Output")

    # Define temporary file names
    log_file = _out_file("hicstuff_" + now + ".log")
    tmp_genome = _tmp_file("genome.fa.gz")
    bam1 = _tmp_file("for.bam")
    bam2 = _tmp_file("rev.bam")
    pairs = _tmp_file("valid.pairs")
    pairs_idx = _tmp_file("valid_idx.pairs")
    pairs_filtered = _tmp_file("valid_idx_filtered.pairs")
    pairs_pcr = _tmp_file("valid_idx_pcrfree.pairs")

    # Enable file logging
    hcl.set_file_handler(log_file)
    generate_log_header(log_file, input1, input2, genome, enzyme)

    # If the user chose bowtie2 and supplied an index, extract fasta from it
    # For later steps of the pipeline (digestion / frag attribution)
    # Check if the genome is an index or fasta file
    idx = hio.check_fasta_index(genome, mode=aligner)
    is_fasta = hio.check_is_fasta(genome)

    # Different aligners accept different files. Make sure the input format is good.
    # Note bowtie2 can extract fasta from the index, but bwa cannot
    sane_input = {
        'bowtie2': is_fasta or idx,
        'minimap2': is_fasta,
        'bwa': is_fasta
    }

    if not sane_input[aligner]:
        logger.error("You must provide either a fasta or bowtie2 index prefix as genome")
        # Without a usable genome every later step would work on a missing
        # (or stale) temporary fasta, so stop here like other input errors.
        sys.exit(1)

    # Just use the input genome if it is indexed
    if is_fasta and idx:
        fasta = genome
    # Otherwise copy it in tmpdir (in compressed format) for indexing, unless the input is a
    # bt2 index, in which case fasta will be extracted later from it.
    else:
        if is_fasta:
            with hio.read_compressed(genome, 'rb') as src, gzip.open(tmp_genome, 'wb') as dst:
                dst.writelines(src)
            genome = tmp_genome
        fasta = tmp_genome
        # Bowtie2-specific feature: extract fasta from the index
        if aligner == 'bowtie2' and not is_fasta:
            # Index is present, extract fasta file from it and compress it
            bt2fa = sp.Popen(
                ["bowtie2-inspect", genome],
                stdout=sp.PIPE,
                stderr=sp.PIPE,
            )
            # The context manager closes the output file once gzip is done.
            with open(tmp_genome, "wb") as gz_out:
                sp.run(['gzip', '-c'], stdin=bt2fa.stdout, stdout=gz_out)
            _, bt2err = bt2fa.communicate()
            # bowtie2-inspect still has return code 0 when crashing, need to
            # actively look for error in stderr
            if re.search(r"[Ee]rror", bt2err.decode()):
                logger.error(bt2err)
                logger.error(
                    "bowtie2-inspect has failed, make sure you provided "
                    "the path to the bowtie2 index without the extension."
                )
                sys.exit(1)
        # Build index with bowtie2 / bwa if required
        if idx is None and aligner in ['bowtie2', 'bwa']:
            if aligner == 'bowtie2':
                index_cmd = ["bowtie2-build", '-q', fasta, fasta]
            elif aligner == 'bwa':
                index_cmd = ['bwa', 'index', fasta]
            # We only need the index if the user provided fastq input
            if start_stage == 0:
                # If no index present assume input is fasta, copy it in tmp and
                # index it (to avoid conflict between instances)
                logger.info(
                    "%s index not found at %s, generating "
                    "a local temporary index.", aligner, genome
                )
                sp.run(index_cmd, stderr=sp.PIPE)

    # Check for spaces in fasta headers and issue error if found
    # NOTE(review): Biopython appears to set record.id to the first
    # whitespace-delimited token of the header, in which case this test can
    # never trigger -- confirm whether record.description was intended.
    with hio.read_compressed(fasta) as fasta_handle:
        for record in SeqIO.parse(fasta_handle, "fasta"):
            if " " in record.id:
                logger.error(
                    "Sequence identifiers contain spaces. Please clean the input genome."
                )

    # Define output file names (tsv files)
    if prefix:
        fragments_list = _out_file("frags.tsv")
        info_contigs = _out_file("chr.tsv")
        mat = _out_file("mat.tsv")
        # If matrix has a different format, give it the right extension
        if mat_fmt != "graal":
            mat = _out_file(mat_fmt)
    else:
        # Default graal file names
        fragments_list = _out_file("fragments_list.txt")
        info_contigs = _out_file("info_contigs.txt")
        mat = _out_file("abs_fragments_contacts_weighted.txt")
        if mat_fmt != "graal":
            mat = _out_file("abs_fragments_contacts_weighted." + mat_fmt)

    # Define what input files are given
    if start_stage == 0:
        reads1, reads2 = input1, input2
    elif start_stage == 1:
        bam1, bam2 = input1, input2
    elif start_stage == 2:
        pairs = input1
    elif start_stage == 3:
        pairs_idx = input1

    # Perform genome alignment
    if start_stage == 0:

        # Define mapping choice (default normal):
        if mapping == "normal":
            iterative = False
        elif mapping == "iterative":
            iterative = True
        elif mapping == "cutsite":
            # If no enzyme given use iterative alignment.
            try:
                int(enzyme)
                logger.warning("No enzyme has been given. Can't map using cutsite, iterative mapping will be used instead.")
                iterative = True
            # If cutsite enabled and enzyme given, cut the reads before making a
            # normal alignment.
            except ValueError:
                iterative = False
                digest_for = _tmp_file("digest_for.fq.gz")
                digest_rev = _tmp_file("digest_rev.fq.gz")
                hcc.cut_ligation_sites(
                    fq_for=reads1,
                    fq_rev=reads2,
                    digest_for=digest_for,
                    digest_rev=digest_rev,
                    enzyme=enzyme,
                    mode="for_vs_rev",
                    seed_size=20,
                    n_cpu=threads,
                )
                reads1, reads2 = digest_for, digest_rev
        else:
            logger.error("mapping must be either normal, iterative or cutsite.")
            raise ValueError(
                "mapping must be either normal, iterative or cutsite."
            )

        # Align forward and reverse reads independently
        for reads, bam in ((reads1, bam1), (reads2, bam2)):
            align_reads(
                reads,
                genome,
                bam,
                tmp_dir=tmp_dir,
                threads=threads,
                aligner=aligner,
                iterative=iterative,
                min_qual=min_qual,
                read_len=read_len,
            )

    # Detect if multiple enzymes are given
    if "," in enzyme:
        enzyme = enzyme.split(",")

    # Starting from bam files
    if start_stage <= 1:

        fragments_updated = True
        # Generate info_contigs and fragments_list output files
        hcd.write_frag_info(
            fasta,
            enzyme,
            min_size=min_size,
            circular=circular,
            output_contigs=info_contigs,
            output_frags=fragments_list,
        )

        # Log fragment size distribution
        hcd.frag_len(frags_file_name=fragments_list, plot=plot, fig_path=frag_plot)

        # Make pairs file (readID, chr1, chr2, pos1, pos2, strand1, strand2)
        bam2pairs(bam1, bam2, pairs, info_contigs, min_qual=min_qual)

    # Starting from pairs file
    if start_stage <= 2:
        restrict_table = {}
        with hio.read_compressed(fasta) as fasta_handle:
            for record in SeqIO.parse(fasta_handle, "fasta"):
                # Get chromosome restriction table
                restrict_table[record.id] = hcd.get_restriction_table(
                    record.seq, enzyme, circular=circular
                )

        # Add fragment index to pairs (readID, chr1, pos1, chr2,
        # pos2, strand1, strand2, frag1, frag2)
        hcd.attribute_fragments(pairs, pairs_idx, restrict_table)

    # Sort pairs file by coordinates for next steps
    hio.sort_pairs(
        pairs_idx,
        pairs_idx + ".sorted",
        keys=["chr1", "pos1", "chr2", "pos2"],
        threads=threads,
        tmp_dir=tmp_dir,
    )
    os.rename(pairs_idx + ".sorted", pairs_idx)

    if filter_events:
        uncut_thr, loop_thr = hcf.get_thresholds(
            pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix
        )
        hcf.filter_events(
            pairs_idx,
            pairs_filtered,
            uncut_thr,
            loop_thr,
            plot_events=plot,
            fig_path=pie_plot,
            prefix=prefix,
        )
        use_pairs = pairs_filtered
    else:
        use_pairs = pairs_idx

    # Generate fragments file if it has not been already
    if not fragments_updated:
        hcd.write_frag_info(
            fasta,
            enzyme,
            min_size=min_size,
            circular=circular,
            output_contigs=info_contigs,
            output_frags=fragments_list,
        )

    # Generate distance law table if enabled
    if distance_law:
        out_distance_law = _out_file("distance_law.txt")
        if remove_centros is None:
            remove_centros = 0
        remove_centros = int(remove_centros)
        x_s, p_s, _ = hcdl.get_distance_law(
            pairs_idx,
            fragments_list,
            centro_file=centromeres,
            base=1.1,
            out_file=out_distance_law,
            circular=circular,
            rm_centro=remove_centros,
        )
        # Generate distance law figure if plots are enabled
        if plot:
            # Retrieve chrom labels from distance law file
            _, _, chr_labels = hcdl.import_distance_law(out_distance_law)
            chr_labels = [lab[0] for lab in chr_labels]
            # Keep each label once, in order of first appearance
            chr_labels_idx = np.unique(chr_labels, return_index=True)[1]
            chr_labels = [chr_labels[index] for index in sorted(chr_labels_idx)]
            p_s = hcdl.normalize_distance_law(x_s, p_s)
            hcdl.plot_ps_slope(x_s, p_s, labels=chr_labels, fig_path=distance_law_plot)

    # Filter out PCR duplicates if requested
    if pcr_duplicates:
        filter_pcr_dup(use_pairs, pairs_pcr)
        use_pairs = pairs_pcr

    # Build matrix from pairs.
    if mat_fmt == "cool":
        # Name matrix file in .cool
        cool_file = os.path.splitext(mat)[0] + ".cool"
        pairs2cool(use_pairs, cool_file, fragments_list)
    else:
        pairs2matrix(
            use_pairs,
            mat,
            fragments_list,
            mat_fmt=mat_fmt,
            threads=threads,
            tmp_dir=tmp_dir,
        )

    # Clean temporary files
    if not no_cleanup:
        candidates = (
            pairs,
            pairs_idx,
            pairs_filtered,
            bam1,
            bam2,
            pairs_pcr,
            tmp_genome,
        )
        # Do not delete files that were given as input. Each input is
        # excluded independently, so one missing from the list cannot leave
        # the other unprotected.
        user_inputs = (input1, input2)
        tempfiles = [path for path in candidates if path not in user_inputs]
        for file in tempfiles:
            try:
                os.remove(file)
            except FileNotFoundError:
                pass

    end_time = datetime.now()
    duration = relativedelta(end_time, start_time)
    logger.info(
        "Contact map generated after {h}h {m}m {s}s".format(
            h=duration.hours, m=duration.minutes, s=duration.seconds
        )
    )