def filter_pcr_dup(pairs_idx_file, filtered_file): """ Filter out PCR duplicates from a coordinate-sorted pairs file using overrrepresented exact coordinates. If multiple fragments have two reads with the exact same coordinates, only one of those fragments is kept. Parameters ---------- pairs_idx_file : str Path to an indexed pairs file containing the Hi-C reads. filtered_file : str Path to the output pairs file after removing duplicates. """ # Keep count of how many reads are filtered filter_count = 0 reads_count = 0 # Store header lines header = hio.get_pairs_header(pairs_idx_file) with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered: # Copy header lines to filtered file for head_line in header: filtered.write(head_line + "\n") next(pairs) # Use csv methods to easily access columns paircols = [ "readID", "chr1", "pos1", "chr2", "pos2", "strand1", "strand2", "frag1", "frag2", ] # Columns used for comparison of coordinates coord_cols = [col for col in paircols if col != "readID"] pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols) filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols) # Initialise a variable to store coordinates of reads in previous pair prev = {k: 0 for k in paircols} for pair in pair_reader: reads_count += 1 # If coordinates are the same as before, skip pair if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols): filter_count += 1 continue # Else write pair and store new coordinates as previous else: filt_writer.writerow(pair) prev = pair logger.info( "%d%% PCR duplicates have been filtered out (%d / %d pairs) " % (100 * round(filter_count / reads_count, 3), filter_count, reads_count) )
def filter_bamfile(temp_alignment, filtered_out, min_qual=30): """Filter alignment BAM files Reads all the reads in the input BAM alignment file. Write reads to the output file if they are aligned with a good quality, otherwise add their name in a set to stage them for the next round of alignment. Parameters ---------- temp_alignment : str Path to the input temporary alignment. outfile : str Path to the output filtered temporary alignment. min_qual : int Minimum mapping quality required to keep a Hi-C pair. Returns ------- set: Contains the names reads that did not align. """ # Check the quality and status of each aligned fragment. # Write the ones with good quality in the final output file. # Keep those that do not map unambiguously for the next round. unaligned = set() temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False) outf = ps.AlignmentFile(filtered_out, "wb", template=temp_bam) for r in temp_bam: if r.flag in [0, 16] and r.mapping_quality >= min_qual: outf.write(r) else: unaligned.add(r.query_name) logger.info("{0} reads left to map.".format(len(unaligned))) temp_bam.close() outf.close() return unaligned
def to_dade_matrix(M, annotations="", filename=None): """Returns a Dade matrix from input numpy matrix. Any annotations are added as header. If filename is provided and valid, said matrix is also saved as text. """ n, m = M.shape A = np.zeros((n + 1, m + 1)) A[1:, 1:] = M if not annotations: annotations = np.array(["" for _ in n], dtype=str) A[0, :] = annotations A[:, 0] = annotations.T if filename: try: np.savetxt(filename, A, fmt="%i") logger.info("I saved input matrix in dade format as {0}".format( str(filename))) except ValueError as e: logger.warning("I couldn't save input matrix.") logger.warning(str(e)) return A
def dade_to_graal( filename, output_matrix=DEFAULT_SPARSE_MATRIX_FILE_NAME, output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME, output_frags=DEFAULT_SPARSE_MATRIX_FILE_NAME, output_dir=None, ): """Convert a matrix from DADE format (https://github.com/scovit/dade) to a graal-compatible format. Since DADE matrices contain both fragment and contact information all files are generated at the same time. """ with open(output_matrix, "w") as sparse_file: sparse_file.write("id_frag_a\tid_frag_b\tn_contact") with open(filename) as file_handle: first_line = file_handle.readline() for row_index, line in enumerate(file_handle): dense_row = np.array(line.split("\t")[1:], dtype=np.int32) for col_index in np.nonzero(dense_row)[0]: line_to_write = "{}\t{}\t{}\n".format( row_index, col_index, dense_row[col_index]) sparse_file.write(line_to_write) header = first_line.split("\t") bin_type = header[0] if bin_type == '"RST"': logger.info("I detected fragment-wise binning") elif bin_type == '"BIN"': logger.info("I detected fixed size binning") else: logger.warning(("Sorry, I don't understand this matrix's " "binning: I read {}".format(str(bin_type)))) header_data = [ header_elt.replace("'", "").replace('"', "").replace("\n", "").split("~") for header_elt in header[1:] ] ( global_frag_ids, contig_names, local_frag_ids, frag_starts, frag_ends, ) = np.array(list(zip(*header_data))) frag_starts = frag_starts.astype(np.int32) - 1 frag_ends = frag_ends.astype(np.int32) - 1 frag_lengths = frag_ends - frag_starts total_length = len(global_frag_ids) with open(output_contigs, "w") as info_contigs: info_contigs.write("contig\tlength\tn_frags\tcumul_length\n") cumul_length = 0 for contig in collections.OrderedDict.fromkeys(contig_names): length_tig = np.sum(frag_lengths[contig_names == contig]) n_frags = collections.Counter(contig_names)[contig] line_to_write = "%s\t%s\t%s\t%s\n" % ( contig, length_tig, n_frags, cumul_length, ) info_contigs.write(line_to_write) cumul_length += n_frags with open(output_frags, "w") as fragments_list: fragments_list.write("id\tchrom\tstart_pos\tend_pos" "\tsize\tgc_content\n") bogus_gc = 0.5 for i in range(total_length): line_to_write = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( int(local_frag_ids[i]) + 1, contig_names[i], frag_starts[i], frag_ends[i], frag_lengths[i], bogus_gc, ) fragments_list.write(line_to_write)
def frag_len( frags_file_name=DEFAULT_FRAGMENTS_LIST_FILE_NAME, output_dir=None, plot=False, fig_path=None, ): """ logs summary statistics of fragment length distribution based on an input fragment file. Can optionally show a histogram instead of text summary. Parameters ---------- frags_file_name : str Path to the output list of fragments. output_dir : str Directory where the list should be saved. plot : bool Wether a histogram of fragment length should be shown. fig_path : str If a path is given, the figure will be saved instead of shown. """ try: frag_list_path = os.path.join(output_dir, frags_file_name) except TypeError: frag_list_path = frags_file_name frags = pd.read_csv(frag_list_path, sep="\t") nfrags = frags.shape[0] med_len = frags["size"].median() nbins = 40 if plot: fig, ax = plt.subplots() _, _, _ = ax.hist(frags["size"], bins=nbins) ax.set_xlabel("Fragment length [bp]") ax.set_ylabel("Log10 number of fragments") ax.set_title("Distribution of restriction fragment length") ax.set_yscale("log", basey=10) ax.annotate( "Total fragments: {}".format(nfrags), xy=(0.95, 0.95), xycoords="axes fraction", fontsize=12, horizontalalignment="right", verticalalignment="top", ) ax.annotate( "Median length: {}".format(med_len), xy=(0.95, 0.90), xycoords="axes fraction", fontsize=12, horizontalalignment="right", verticalalignment="top", ) # Tweak spacing to prevent clipping of ylabel fig.tight_layout() if fig_path: plt.savefig(fig_path) else: plt.show() plt.clf() else: logger.info("Genome digested into {0} fragments with a median " "length of {1}".format(nfrags, med_len))
def cut_ligation_sites( fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu ): """Create new reads to manage pairs with a digestion and create multiple pairs to take into account all the contact present. The function write two files for both the forward and reverse fastq with the new reads. The new reads have at the end of the ID ":[0-9]" added to differentiate the different pairs created from one read. The function will look for all the sites present and create new pairs of reads according to the mode given to retreive as much as possible of the HiC signal. Parameters ---------- fq_for : str Path to the forward fastq file to digest. fq_rev : str Path to the reverse fatsq file to digest. digest_for : str Path to the output digested forward fatsq file to write. digest_rev : str Path to the output digested reverse fatsq file to write. enzyme : str The list of restriction enzyme used to digest the genome separated by a comma. Example: HpaII,MluCI. mode : str Mode to use to make the digestion. Three values possible: "all", "for_vs_rev", "pile". seed_size : int Minimum size of a fragment (i.e. seed size used in mapping as reads smaller won't be mapped.) n_cpu : int Number of CPUs. """ # Process the ligation sites given ligation_sites = hcd.gen_enzyme_religation_regex(enzyme) # Defined stop_token which is used to mark the end of input file stop_token = "STOP" # A stack is a string cointaining multiple read pairs max_stack_size = 1000 # Create count to have an idea of the digested pairs repartition. original_number_of_pairs = 0 final_number_of_pairs = 0 new_reads_for = "" new_reads_rev = "" current_stack = 0 # Start parallel threading to compute the # ctx = multiprocessing.get_context("spawn") queue = multiprocessing.Queue(max(1, n_cpu - 1)) writer_process = multiprocessing.Process( target=_writer, args=(digest_for, digest_rev, queue, stop_token) ) writer_process.start() # Iterate on all pairs for read_for, read_rev in zip( pyfastx.Fastq(fq_for, build_index=False), pyfastx.Fastq(fq_rev, build_index=False), ): # Count the numbers of original reads processed. original_number_of_pairs += 1 # Count for stack size. current_stack += 1 # Extract components of the reads. for_name, for_seq, for_qual = read_for rev_name, rev_seq, rev_qual = read_rev # Sanity check to be sure all reads are with their mate. if for_name != rev_name: logger.error( "The fastq files contains reads not sorted :\n{0}\n{1}".format( read_for.id, read_rev.id ) ) sys.exit(1) # Cut the forward and reverse reads at the ligation sites. for_seq_list, for_qual_list = cutsite_read( ligation_sites, for_seq, for_qual, seed_size, ) rev_seq_list, rev_qual_list = cutsite_read( ligation_sites, rev_seq, rev_qual, seed_size, ) # Write the new combinations of fragments. new_reads_for, new_reads_rev, final_number_of_pairs = write_pair( new_reads_for, new_reads_rev, for_name, for_seq_list, for_qual_list, rev_seq_list, rev_qual_list, mode, final_number_of_pairs, ) # If stack full, add it in the queue. if current_stack == max_stack_size: # Add the pair in the queue. pairs = (new_reads_for.encode(), new_reads_rev.encode()) queue.put(pairs) # Empty the stack current_stack = 0 new_reads_for = "" new_reads_rev = "" # End the parallel processing. pairs = (new_reads_for.encode(), new_reads_rev.encode()) queue.put(pairs) queue.put(stop_token) writer_process.join() # Return information on the different pairs created logger.info(f"Library used: {fq_for} - {fq_rev}") logger.info( f"Number of pairs before digestion: {original_number_of_pairs}" ) logger.info( f"Number of pairs after digestion: {final_number_of_pairs}" )
def full_pipeline( genome, input1, input2=None, aligner="bowtie2", centromeres=None, circular=False, distance_law=False, enzyme=5000, filter_events=False, force=False, iterative=False, mat_fmt="graal", min_qual=30, min_size=0, no_cleanup=False, out_dir=None, pcr_duplicates=False, plot=False, prefix=None, read_len=None, remove_centros=None, start_stage="fastq", threads=1, tmp_dir=None, ): """ Run the whole hicstuff pipeline. Starting from fastq files and a genome to obtain a contact matrix. Parameters ---------- genome : str Path to the bowtie2/bwa index prefix if using bowtie2/bwa or to the genome in fasta format if using minimap2. input1 : str Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads in BAM format, or the pairs file, depending on the value of start_stage. input2 : str Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads in BAM format, or None, depending on the value of start_stage. enzyme : int or str Name of the enzyme used for the digestion (e.g "DpnII"). If an integer is used instead, the fragment attribution will be done directly using a fixed chunk size. circular : bool Use if the genome is circular. out_dir : str or None Path where output files should be written. Current directory by default. tmp_dir : str or None Path where temporary files will be written. Creates a "tmp" folder in out_dir by default. plot : bool Whether plots should be generated at different steps of the pipeline. Plots are saved in a "plots" directory inside out_dir. min_qual : int Minimum mapping quality required to keep a pair of Hi-C reads. min_size : int Minimum contig size required to keep it. threads : int Number of threads to use for parallel operations. no_cleanup : bool Whether temporary files should be deleted at the end of the pipeline. iterative : bool Use iterative mapping. Truncates and extends reads until unambiguous alignment. filter_events : bool Filter spurious or uninformative 3C events. Requires a restriction enzyme. force : bool If True, overwrite existing files with the same name as output. prefix : str or None Choose a common name for output files instead of default graal names. start_stage : str Step at which the pipeline should start. Can be "fastq", "bam", "pairs" or "pairs_idx". With starting from bam allows to skip alignment and start from named-sorted bam files. With "pairs", a single pairs file is given as input, and with "pairs_idx", the pairs in the input must already be attributed to fragments and fragment attribution is skipped. mat_fmt : str Select the output matrix format. Can be either "bg2" for the bedgraph2 format, "cool" for Mirnylab's cool format, or graal for a plain text COO format compatible with Koszullab's instagraal software. aligner : str Read alignment software to use. Can be either "minimap2", "bwa" or "bowtie2". pcr_duplicates : bool If True, PCR duplicates will be filtered based on genomic positions. Pairs where both reads have exactly the same coordinates are considered duplicates and only one of those will be conserved. distance_law : bool If True, generates a distance law file with the values of the probabilities to have a contact between two distances for each chromosomes or arms if the file with the positions has been given. The values are not normalized, or averaged. centromeres : None or str If not None, path of file with Positions of the centromeres separated by a space and in the same order than the chromosomes. read_len : int Maximum read length to expect in the fastq file. Optionally used in iterative alignment mode. Estimated from the first read by default. Useful if input fastq is a composite of different read lengths. remove_centros : None or int If the distance law is computed, this is the number of kb that will be removed around the centromere position given by in the centromere file. """ # Check if third parties can be run if aligner in ("bowtie2", "minimap2", "bwa"): if check_tool(aligner) is None: logger.error("%s is not installed or not on PATH", aligner) sys.exit(1) else: logger.error( "Incompatible aligner software, choose bowtie2, minimap2 or bwa") sys.exit(1) if check_tool("samtools") is None: logger.error("Samtools is not installed or not on PATH") sys.exit(1) # Pipeline can start from 3 input types start_time = datetime.now() stages = {"fastq": 0, "bam": 1, "pairs": 2, "pairs_idx": 3} start_stage = stages[start_stage] # Check if the number of input files is correct if start_stage <= 1: if input2 is None: logger.error( "You must provide 2 input files when --start-stage is fastq " "or bam.") sys.exit(1) else: if input2 is not None: logger.error( "You must provide a single input file when --start-stage is " "pairs or pairs_idx.") sys.exit(1) # sanitize enzyme enzyme = str(enzyme) # Remember whether fragments_file has been generated during this run fragments_updated = False if out_dir is None: out_dir = os.getcwd() if tmp_dir is None: tmp_dir = join(out_dir, "tmp") os.makedirs(out_dir, exist_ok=True) os.makedirs(tmp_dir, exist_ok=True) # Define figures output paths if plot: fig_dir = join(out_dir, "plots") os.makedirs(fig_dir, exist_ok=True) frag_plot = join(fig_dir, "frags_hist.pdf") dist_plot = join(fig_dir, "event_distance.pdf") pie_plot = join(fig_dir, "event_distribution.pdf") distance_law_plot = join(fig_dir, "distance_law.pdf") matplotlib.use("Agg") else: fig_dir = None dist_plot = pie_plot = frag_plot = None # Use current time for logging and to identify files now = time.strftime("%Y%m%d%H%M%S") def _tmp_file(fname): if prefix: fname = prefix + "." + fname full_path = join(tmp_dir, fname) if not force and os.path.exists(full_path): raise IOError( "Temporary file {} already exists. Use --force to overwrite". format(full_path)) return full_path def _out_file(fname): if prefix: fname = prefix + "." + fname full_path = join(out_dir, fname) if not force and os.path.exists(full_path): raise IOError( "Output file {} already exists. Use --force to overwrite". format(full_path)) return full_path # Define temporary file names log_file = _out_file("hicstuff_" + now + ".log") tmp_genome = _tmp_file("genome.fasta") bam1 = _tmp_file("for.bam") bam2 = _tmp_file("rev.bam") pairs = _tmp_file("valid.pairs") pairs_idx = _tmp_file("valid_idx.pairs") pairs_filtered = _tmp_file("valid_idx_filtered.pairs") pairs_pcr = _tmp_file("valid_idx_pcrfree.pairs") # Enable file logging hcl.set_file_handler(log_file) generate_log_header(log_file, input1, input2, genome, enzyme) # If the user chose bowtie2 and supplied an index, extract fasta from it # For later steps of the pipeline (digestion / frag attribution) # Check if the genome is an index or fasta file idx = hio.check_fasta_index(genome, mode=aligner) is_fasta = hio.check_is_fasta(genome) # Different aligners accept different files. Make sure the input format is good. # Note bowtie2 can extract fasta from the index, but bwa cannot sane_input = { 'bowtie2': is_fasta or idx, 'minimap2': is_fasta, 'bwa': is_fasta } if not sane_input[aligner]: logger.error( "You must provide either a fasta or bowtie2 index prefix as genome" ) # Just use the input genome is it is indexed if is_fasta and idx: fasta = genome # Otherwise copy it in tmpdir for indexing, unless the input is a bt2 index, in which # case fasta will be extracted later from it. else: if is_fasta: st.copy(genome, tmp_genome) genome = tmp_genome fasta = tmp_genome # Bowtie2-specific feature: extract fasta from the index if aligner == 'bowtie2' and not is_fasta: # Index is present, extract fasta file from it bt2fa = sp.Popen( ["bowtie2-inspect", genome], stdout=open(tmp_genome, "w"), stderr=sp.PIPE, ) _, bt2err = bt2fa.communicate() # bowtie2-inspect still has return code 0 when crashing, need to # actively look for error in stderr if re.search(r"[Ee]rror", bt2err.decode()): logger.error(bt2err) logger.error( "bowtie2-inspect has failed, make sure you provided " "the path to the bowtie2 index without the extension.") sys.exit(1) # Build index with bowtie2 / bwa if required if idx is None and aligner in ['bowtie2', 'bwa']: if aligner == 'bowtie2': index_cmd = ["bowtie2-build", '-q', fasta, fasta] elif aligner == 'bwa': index_cmd = ['bwa', 'index', fasta] # We only need the index if the user provided fastq input if start_stage == 0: # If no index present assume input is fasta, copy it in tmp and # index it (to avoid conflict between instances) logger.info( "%s index not found at %s, generating " "a local temporary index.", aligner, genome) sp.run(index_cmd, stderr=sp.PIPE) # Check for spaces in fasta headers and issue error if found for record in SeqIO.parse(fasta, "fasta"): if " " in record.id: logger.error( "Sequence identifiers contain spaces. Please clean the input genome." ) # Define output file names (tsv files) if prefix: fragments_list = _out_file("frags.tsv") info_contigs = _out_file("chr.tsv") mat = _out_file("mat.tsv") # If matrix has a different format, give it the right extension if mat_fmt != "graal": mat = _out_file(mat_fmt) else: # Default graal file names fragments_list = _out_file("fragments_list.txt") info_contigs = _out_file("info_contigs.txt") mat = _out_file("abs_fragments_contacts_weighted.txt") if mat_fmt != "graal": mat = _out_file("abs_fragments_contacts_weighted." + mat_fmt) # Define what input files are given if start_stage == 0: reads1, reads2 = input1, input2 elif start_stage == 1: bam1, bam2 = input1, input2 elif start_stage == 2: pairs = input1 elif start_stage == 3: pairs_idx = input1 # Detect if multiple enzymes are given if re.search(",", enzyme): enzyme = enzyme.split(",") # Perform genome alignment if start_stage == 0: align_reads( reads1, genome, bam1, tmp_dir=tmp_dir, threads=threads, aligner=aligner, iterative=iterative, min_qual=min_qual, read_len=read_len, ) align_reads( reads2, genome, bam2, tmp_dir=tmp_dir, threads=threads, aligner=aligner, iterative=iterative, min_qual=min_qual, read_len=read_len, ) # Starting from bam files if start_stage <= 1: fragments_updated = True # Generate info_contigs and fragments_list output files hcd.write_frag_info( fasta, enzyme, min_size=min_size, circular=circular, output_contigs=info_contigs, output_frags=fragments_list, ) # Log fragment size distribution hcd.frag_len(frags_file_name=fragments_list, plot=plot, fig_path=frag_plot) # Make pairs file (readID, chr1, chr2, pos1, pos2, strand1, strand2) bam2pairs(bam1, bam2, pairs, info_contigs, min_qual=min_qual) # Starting from pairs file if start_stage <= 2: restrict_table = {} for record in SeqIO.parse(fasta, "fasta"): # Get chromosome restriction table restrict_table[record.id] = hcd.get_restriction_table( record.seq, enzyme, circular=circular) # Add fragment index to pairs (readID, chr1, pos1, chr2, # pos2, strand1, strand2, frag1, frag2) hcd.attribute_fragments(pairs, pairs_idx, restrict_table) # Sort pairs file by coordinates for next steps hio.sort_pairs( pairs_idx, pairs_idx + ".sorted", keys=["chr1", "pos1", "chr2", "pos2"], threads=threads, tmp_dir=tmp_dir, ) os.rename(pairs_idx + ".sorted", pairs_idx) if filter_events: uncut_thr, loop_thr = hcf.get_thresholds(pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix) hcf.filter_events( pairs_idx, pairs_filtered, uncut_thr, loop_thr, plot_events=plot, fig_path=pie_plot, prefix=prefix, ) use_pairs = pairs_filtered else: use_pairs = pairs_idx # Generate fragments file if it has not been already if not fragments_updated: hcd.write_frag_info( fasta, enzyme, min_size=min_size, circular=circular, output_contigs=info_contigs, output_frags=fragments_list, ) # Generate distance law table if enabled if distance_law: out_distance_law = _out_file("distance_law.txt") if remove_centros is None: remove_centros = 0 remove_centros = int(remove_centros) x_s, p_s, _ = hcdl.get_distance_law( pairs_idx, fragments_list, centro_file=centromeres, base=1.1, out_file=out_distance_law, circular=circular, rm_centro=remove_centros, ) # Generate distance law figure is plots are enabled if plot: # Retrieve chrom labels from distance law file _, _, chr_labels = hcdl.import_distance_law(out_distance_law) chr_labels = [lab[0] for lab in chr_labels] chr_labels_idx = np.unique(chr_labels, return_index=True)[1] chr_labels = [ chr_labels[index] for index in sorted(chr_labels_idx) ] p_s = hcdl.normalize_distance_law(x_s, p_s) hcdl.plot_ps_slope(x_s, p_s, labels=chr_labels, fig_path=distance_law_plot) # Filter out PCR duplicates if requested if pcr_duplicates: filter_pcr_dup(use_pairs, pairs_pcr) use_pairs = pairs_pcr # Build matrix from pairs. if mat_fmt == "cool": # Name matrix file in .cool cool_file = os.path.splitext(mat)[0] + ".cool" pairs2cool(use_pairs, cool_file, fragments_list) else: pairs2matrix( use_pairs, mat, fragments_list, mat_fmt=mat_fmt, threads=threads, tmp_dir=tmp_dir, ) # Clean temporary files if not no_cleanup: tempfiles = [ pairs, pairs_idx, pairs_filtered, bam1, bam2, pairs_pcr, tmp_genome, ] # Do not delete files that were given as input try: tempfiles.remove(input1) tempfiles.remove(input2) except ValueError: pass for file in tempfiles: try: os.remove(file) except FileNotFoundError: pass end_time = datetime.now() duration = relativedelta(end_time, start_time) logger.info("Contact map generated after {h}h {m}m {s}s".format( h=duration.hours, m=duration.minutes, s=duration.seconds))
def pairs2matrix(pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None): """Generate the matrix by counting the number of occurences of each combination of restriction fragments in a pairs file. Parameters ---------- pairs_file : str Path to a Hi-C pairs file, with frag1 and frag2 columns added. mat_file : str Path where the matrix will be written. fragments_file : str Path to the fragments_list.txt file. Used to know total matrix size in case some observations are not observed at the end. mat_fmt : str The format to use when writing the matrix. Can be graal or bg2 format. threads : int Number of threads to use in parallel. tmp_dir : str Temporary directory for sorting files. If None given, will use the system default. """ # Number of fragments is N lines in frag list - 1 for the header n_frags = sum(1 for line in open(fragments_file, "r")) - 1 frags = pd.read_csv(fragments_file, delimiter="\t") def write_mat_entry(frag1, frag2, contacts): """Write a single sparse matrix entry in either graal or bg2 format""" if mat_fmt == "graal": mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n") elif mat_fmt == "bg2": frag1, frag2 = int(frag1), int(frag2) mat.write("\t".join( map( str, [ frags.chrom[frag1], frags.start_pos[frag1], frags.end_pos[frag1], frags.chrom[frag2], frags.start_pos[frag2], frags.end_pos[frag2], contacts, ], )) + "\n") pre_mat_file = mat_file + ".pre.pairs" hio.sort_pairs( pairs_file, pre_mat_file, keys=["frag1", "frag2"], threads=threads, tmp_dir=tmp_dir, ) header_size = len(hio.get_pairs_header(pre_mat_file)) with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat: # Skip header lines for _ in range(header_size): next(pairs) prev_pair = ["0", "0"] # Pairs identified by [frag1, frag2] n_occ = 0 # Number of occurences of each frag combination n_nonzero = 0 # Total number of nonzero matrix entries n_pairs = 0 # Total number of pairs entered in the matrix pairs_reader = csv.reader(pairs, delimiter="\t") # First line contains nrows, ncols and number of nonzero entries. # Number of nonzero entries is unknown for now if mat_fmt == "graal": mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n") for pair in pairs_reader: # Fragment ids are field 8 and 9 curr_pair = [pair[7], pair[8]] # Increment number of occurences for fragment pair if prev_pair == curr_pair: n_occ += 1 # Write previous pair and start a new one else: if n_occ > 0: write_mat_entry(prev_pair[0], prev_pair[1], n_occ) prev_pair = curr_pair n_pairs += n_occ n_occ = 1 n_nonzero += 1 # Write the last value write_mat_entry(curr_pair[0], curr_pair[1], n_occ) n_nonzero += 1 n_pairs += 1 # Edit header line to fill number of nonzero entries inplace in graal header if mat_fmt == "graal": with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat: header = mat.readline() header = header.replace("-", str(n_nonzero)) tmp_mat.write(header) st.copyfileobj(mat, tmp_mat) # Replace the matrix file with the one with corrected header os.rename(pre_mat_file, mat_file) else: os.remove(pre_mat_file) logger.info( "%d pairs used to build a contact map of %d bins with %d nonzero entries.", n_pairs, n_frags, n_nonzero, )
def generate_log_header(log_path, input1, input2, genome, enzyme): hcl.set_file_handler(log_path, formatter=logging.Formatter("")) logger.info("## hicstuff: v%s log file", __version__) logger.info("## date: %s", time.strftime("%Y-%m-%d %H:%M:%S")) logger.info("## enzyme: %s", str(enzyme)) logger.info("## input1: %s ", input1) logger.info("## input2: %s", input2) logger.info("## ref: %s", genome) logger.info("---") hcl.set_file_handler(log_path, formatter=hcl.logfile_formatter)
def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30): """ Make a .pairs file from two Hi-C bam files sorted by read names. The Hi-C mates are matched by read identifier. Pairs where at least one reads maps with MAPQ below min_qual threshold are discarded. Pairs are sorted by readID and stored in upper triangle (first pair higher). Parameters ---------- bam1 : str Path to the name-sorted BAM file with aligned Hi-C forward reads. bam2 : str Path to the name-sorted BAM file with aligned Hi-C reverse reads. out_pairs : str Path to the output space-separated .pairs file with columns readID, chr1 pos1 chr2 pos2 strand1 strand2 info_contigs : str Path to the info contigs file, to get info on chromosome sizes and order. min_qual : int Minimum mapping quality required to keep a Hi-C pair. """ forward = ps.AlignmentFile(bam1, "rb") reverse = ps.AlignmentFile(bam2, "rb") # Generate header lines format_version = "## pairs format v1.0\n" sorting = "#sorted: readID\n" cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n" # Chromosome order will be identical in info_contigs and pair files chroms = pd.read_csv(info_contigs, sep="\t").apply(lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1) with open(out_pairs, "w") as pairs: pairs.writelines([format_version, sorting, cols] + chroms.tolist()) pairs_writer = csv.writer(pairs, delimiter="\t") n_reads = {"total": 0, "mapped": 0} # Remember if some read IDs were missing from either file unmatched_reads = 0 # Remember if all reads in one bam file have been read exhausted = [False, False] # Iterate on both BAM simultaneously for end1, end2 in itertools.zip_longest(forward, reverse): # Both file still have reads # Check if reads pass filter try: end1_passed = end1.mapping_quality >= min_qual # Happens if end1 bam file has been exhausted except AttributeError: exhausted[0] = True end1_passed = False try: end2_passed = end2.mapping_quality >= min_qual # Happens if end2 bam file has been exhausted except AttributeError: exhausted[1] = True end2_passed = False # Skip read if mate is not present until they match or reads # have been exhausted while sum(exhausted) == 0 and end1.query_name != end2.query_name: # Get next read and check filters again # Count single-read iteration unmatched_reads += 1 n_reads["total"] += 1 if end1.query_name < end2.query_name: try: end1 = next(forward) end1_passed = end1.mapping_quality >= min_qual # If EOF is reached in BAM 1 except (StopIteration, AttributeError): exhausted[0] = True end1_passed = False n_reads["mapped"] += end1_passed elif end1.query_name > end2.query_name: try: end2 = next(reverse) end2_passed = end2.mapping_quality >= min_qual # If EOF is reached in BAM 2 except (StopIteration, AttributeError): exhausted[1] = True end2_passed = False n_reads["mapped"] += end2_passed # 2 reads processed per iteration, unless one file is exhausted n_reads["total"] += 2 - sum(exhausted) n_reads["mapped"] += sum([end1_passed, end2_passed]) # Keep only pairs where both reads have good quality if end1_passed and end2_passed: # Flipping to get upper triangle if (end1.reference_id == end2.reference_id and end1.reference_start > end2.reference_start ) or end1.reference_id > end2.reference_id: end1, end2 = end2, end1 pairs_writer.writerow([ end1.query_name, end1.reference_name, end1.reference_start + 1, end2.reference_name, end2.reference_start + 1, "-" if end1.is_reverse else "+", "-" if end2.is_reverse else "+", ]) pairs.close() if unmatched_reads > 0: logger.warning( "%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.", unmatched_reads, ) logger.info( "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})" .format( total=n_reads["total"], mapped=n_reads["mapped"], perc_map=round(100 * n_reads["mapped"] / n_reads["total"]), qual=min_qual, ))
def iterative_align( fq_in, tmp_dir, ref, n_cpu, bam_out, aligner="bowtie2", min_len=20, min_qual=30, read_len=None, ): """Iterative alignment Aligns reads iteratively reads of fq_in with bowtie2, minimap2 or bwa. Reads are truncated to the 20 first nucleotides and unmapped reads are extended by 20 nucleotides and realigned on each iteration. Parameters ---------- fq_in : str Path to input fastq file to align iteratively. tmp_dir : str Path where temporary files should be written. ref : str Path to the reference genome if Minimap2 is used for alignment. Path to the index genome if Bowtie2/bwa is used for alignment. n_cpu : int The number of CPUs to use for the iterative alignment. bam_out : str Path where the final alignment should be written in BAM format. aligner : str Choose between minimap2, bwa or bowtie2 for the alignment. min_len : int The initial length of the fragments to align. min_qual : int Minimum mapping quality required to keep Hi-C pairs. read_len : int Read length in the fasta file. If set to None, the length of the first read is used. Set this value to the longest read length in the file if you have different read lengths. Examples -------- iterative_align(fq_in='example_for.fastq', ref='example_bt2_index', bam_out='example_for.bam', aligner="bowtie2") iterative_align(fq_in='example_for.fastq', ref='example_genome.fa', bam_out='example_for.bam', aligner="minimap2") """ # set with the name of the unaligned reads : remaining_reads = set() total_reads = 0 # Store path of SAM containing aligned reads at each iteration. iter_out = [] # If there is already a file with the same name as the output file, # remove it. Otherwise, ignore. with contextlib.suppress(FileNotFoundError): try: os.remove(bam_out) except IsADirectoryError: logger.error("You need to give the BAM output file, not a folder.") raise # Bowtie only accepts uncompressed fastq: uncompress it into a temp file if aligner == "bowtie2" and hio.is_compressed(fq_in): uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp") with hio.read_compressed(fq_in) as inf: with open(uncomp_path, "w") as uncomp: st.copyfileobj(inf, uncomp) else: uncomp_path = fq_in # throw error if index does not exist index = hio.check_fasta_index(ref, mode=aligner) if index is None: logger.error( "Reference index is missing, please build the {} ".format(aligner), "index first.") sys.exit(1) # Counting reads with hio.read_compressed(uncomp_path) as inf: for _ in inf: total_reads += 1 total_reads /= 4 # Use first read to guess read length if not provided. if read_len is None: with hio.read_compressed(uncomp_path) as inf: # Skip first line (read header) size = inf.readline() # Stripping newline from sequence line. read_len = len(inf.readline().rstrip()) # initial length of the fragments to align # In case reads are shorter than provided min_len if read_len > min_len: n = min_len else: logger.warning( "min_len is longer than the reads. Iterative mapping will have no effect." ) n = read_len logger.info("{0} reads to parse".format(int(total_reads))) first_round = True # iterative alignment per se while n <= read_len: logger.info( "Truncating unaligned reads to {size}bp and mapping{again}.". format(size=int(n), again="" if first_round else " again")) iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))] # Generate a temporary input fastq file with the n first nucleotids # of the reads. truncated_reads = truncate_reads(tmp_dir, uncomp_path, remaining_reads, n, first_round) # Align the truncated reads on reference genome temp_alignment = join(tmp_dir, "temp_alignment.bam") map_args = { "fa": ref, "cpus": n_cpu, "fq": truncated_reads, "idx": index, "bam": temp_alignment, } if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE): cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args) elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE): cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args) elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE): cmd = ("bowtie2 -x {idx} -p {cpus}" " --quiet --very-sensitive {fq}").format(**map_args) else: raise ValueError( "Unknown aligner. Select bowtie2, minimap2 or bwa.") map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE) sort_process = sp.Popen( "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args), shell=True, stdin=map_process.stdout, ) out, err = sort_process.communicate() # filter the reads: the reads whose truncated end was aligned are written # to the output file. # The reads whose truncated end was not aligned are kept for the next round. remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual) n += 20 first_round = False # one last round without trimming logger.info("Trying to map unaligned reads at full length ({0}bp).".format( int(read_len))) truncated_reads = truncate_reads( tmp_dir, infile=uncomp_path, unaligned_set=remaining_reads, trunc_len=n, first_round=first_round, ) if aligner == "minimap2" or aligner == "Minimap2": cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format( fa=ref, cpus=n_cpu, fq=truncated_reads) elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA": cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format(idx=index, cpus=n_cpu, fq=truncated_reads) else: cmd = ("bowtie2 -x {idx} -p {cpus} --quiet " "--very-sensitive {fq}").format(idx=index, cpus=n_cpu, fq=truncated_reads) map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE) # Keep reads sorted by name sort_process = sp.Popen( "samtools sort -n -@ {cpus} -O BAM -o {bam}".format( cpus=n_cpu, bam=temp_alignment), shell=True, stdin=map_process.stdout, ) out, err = sort_process.communicate() iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))] remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual) # Report unaligned reads as well iter_out += [join(tmp_dir, "unaligned.bam")] temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False) unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam) for r in temp_bam: # Do not write supplementary alignments (keeping 1 alignment/read) if r.query_name in remaining_reads and not r.is_supplementary: unmapped.write(r) unmapped.close() temp_bam.close() # Merge all aligned reads and unmapped reads into a single bam ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out) logger.info("{0} reads aligned / {1} total reads.".format( int(total_reads - len(remaining_reads)), int(total_reads))) return 0
def filter_events( in_dat, out_filtered, thr_uncut, thr_loop, plot_events=False, fig_path=None, prefix=None, ): """Filter events (loops, uncuts and weirds) Filter out spurious intrachromosomal Hi-C pairs from input file. +- pairs with reads closer or at the uncut threshold and -+ pairs with reads closer or at the loop thresholds are excluded from the ouput file. -- and ++ pairs with both mates on the same fragments are also discarded. All others are written. Parameters ---------- in_dat : file object File handle in read mode to the 2D BED file containing Hi-C pairs. out_filtered : file object File handle in write mode the output filtered 2D BED file. thr_uncut : int Minimum number of restriction sites between reads to keep an intrachromosomal +- pair. thr_loop : int Minimum number of restriction sites between reads to keep an intrachromosomal -+ pair. plot_events : bool If True, a plot showing the proportion of each type of event will be shown after filtering. fig_path : str Path where the figure will be saved. If None, figure is displayed interactively. prefix : str If the library has a name, it will be shown on plots. """ n_uncuts = 0 n_loops = 0 n_weirds = 0 lrange_intra = 0 lrange_inter = 0 # open the files for reading and writing with open(in_dat, "r") as pairs, open(out_filtered, "w") as filtered: for line in pairs: # iterate over each line # Copy header lines to output if line.startswith("#"): filtered.write(line) continue p = process_read_pair(line) line_to_write = ("\t".join( map( str, ( p["readID"], p["chr1"], p["pos1"], p["chr2"], p["pos2"], p["strand1"], p["strand2"], p["frag1"], p["frag2"], ), )) + "\n") if p["chr1"] == p["chr2"]: # Do not report ++ and -- pairs on the same fragment (impossible) if p["frag1"] == p["frag2"] and p["strand1"] == p["strand2"]: n_weirds += 1 elif p["nsites"] <= thr_loop and p["type"] == "-+": n_loops += 1 elif p["nsites"] <= thr_uncut and p["type"] == "+-": n_uncuts += 1 else: lrange_intra += 1 filtered.write(line_to_write) if p["chr1"] != p["chr2"]: lrange_inter += 1 filtered.write(line_to_write) if lrange_inter > 0: ratio_inter = round( 100 * lrange_inter / float(lrange_intra + lrange_inter), 2) else: ratio_inter = 0 # Log quick summary of operation results kept = lrange_intra + lrange_inter discarded = n_loops + n_uncuts + n_weirds total = kept + discarded logger.info("Proportion of inter contacts: {0}% (intra: {1}, " "inter: {2})".format(ratio_inter, lrange_intra, lrange_inter)) logger.info( "{0} pairs discarded: Loops: {1}, Uncuts: {2}, Weirds: {3}".format( discarded, n_loops, n_uncuts, n_weirds)) logger.info("{0} pairs kept ({1}%)".format( kept, round(100 * kept / (kept + discarded), 2))) # Visualize summary if requested by user if plot_events: try: # Plot: make a square figure and axes to plot a pieChart: plt.figure(2, figsize=(6, 6)) # The slices will be ordered and plotted counter-clockwise. fracs = [n_uncuts, n_loops, n_weirds, lrange_intra, lrange_inter] # Format labels to include event names and proportion labels = list( map( lambda x: (x[0] + ": %.2f%%") % (100 * x[1] / total), [ ("Uncuts", n_uncuts), ("Loops", n_loops), ("Weirds", n_weirds), ("3D intra", lrange_intra), ("3D inter", lrange_inter), ], )) colors = ["salmon", "lightskyblue", "yellow", "palegreen", "plum"] patches, _ = plt.pie(fracs, colors=colors, startangle=90) plt.legend( patches, labels, loc='upper left', bbox_to_anchor=(-0.1, 1.), ) if prefix: plt.title( "Distribution of library events in {}".format(prefix), bbox={ "facecolor": "1.0", "pad": 5 }, ) plt.text( 0.3, 1.15, "Threshold Uncuts = " + str(thr_uncut), fontdict=None, ) plt.text( 0.3, 1.05, "Threshold Loops = " + str(thr_loop), fontdict=None, ) plt.text( -1.5, -1.2, "Total number of reads = " + str(total), fontdict=None, ) plt.text( -1.5, -1.3, "Ratio inter/(intra+inter) = " + str(ratio_inter) + "%", fontdict=None, ) percentage = round( 100 * float(lrange_inter + lrange_intra) / (n_loops + n_uncuts + n_weirds + lrange_inter + lrange_intra)) plt.text( -1.5, -1.4, "Selected reads = {0}%".format(percentage), fontdict=None, ) if fig_path: plt.savefig(fig_path) else: plt.show() plt.clf() except Exception: logger.error( "Unable to show plots. Perhaps there is no Xserver running ?" "(might be due to windows environment) skipping figure " "generation.")
def get_thresholds(in_dat, interactive=False, plot_events=False, fig_path=None, prefix=None): """Guess distance threshold for event filtering Analyse the events in the first million of Hi-C pairs in the library, plot the occurrences of each event type according to number of restriction fragments, and ask user interactively for the minimum threshold for uncuts and loops. Parameters ---------- in_dat: str Path to the .pairs file containing Hi-C pairs. interactive: bool If True, plots are diplayed and thresholds are required interactively. plot_events : bool Whether to show the plot fig_path : str Path where the figure will be saved. If None, the figure will be diplayed interactively. prefix : str If the library has a name, it will be shown on plots. Returns ------- dictionary dictionary with keys "uncuts" and "loops" where the values are the corresponding thresholds entered by the user. """ thr_uncut = None thr_loop = None max_sites = 50 # Map of event -> legend name of event for intrachromosomal pairs. legend = { "++": "++ (weird)", "--": "-- (weird)", "+-": "+- (uncuts)", "-+": "-+ (loops)", } colors = {"++": "#222222", "+-": "r", "--": "#666666", "-+": "tab:orange"} n_events = {event: np.zeros(max_sites) for event in legend} i = 0 # open the file for reading (just the first 1 000 000 lines) with open(in_dat, "r") as pairs: for line in pairs: # Skip header lines if line.startswith("#"): continue i += 1 # Only use the first million pairs to estimate thresholds if i == 1000000: break # Process Hi-C pair into a dictionary p = process_read_pair(line) # Type of event and number of restriction site between reads etype = p["type"] nsites = p["nsites"] # Count number of events for intrachrom pairs if etype != "inter" and nsites < max_sites: n_events[etype][nsites] += 1 def plot_event(n_events, legend, name): """Plot the frequency of a given event types over distance.""" plt.xlim([-0.5, 15]) plt.plot( range(n_events[name].shape[0]), n_events[name], "o-", label=legend[name], linewidth=2.0, c=colors[name], ) if interactive: # PLot: try: plt.figure(0) for event in legend: plot_event(n_events, legend, event) plt.grid() plt.xlabel("Number of restriction fragment(s)") plt.ylabel("Number of events") plt.yscale("log") plt.legend() plt.show(block=False) except Exception: logger.error( "Unable to show plots, skipping figure generation. Perhaps " "there is no Xserver running ? (might be due to windows " "environment). Try running without the interactive option.") # Asks the user for appropriate thresholds print( "Please enter the number of restriction fragments separating " "reads in a Hi-C pair below or at which loops and " "uncuts events will be excluded\n", file=sys.stderr, ) thr_uncut = int(input("Enter threshold for the uncuts events (+-):")) thr_loop = int(input("Enter threshold for the loops events (-+):")) try: plt.clf() except Exception: pass else: # Estimate thresholds from data for event in n_events: fixed = n_events[event] fixed[fixed == 0] = 1 n_events[event] = fixed all_events = np.log(np.array(list(n_events.values()))) # Compute median occurences at each restriction sites event_med = np.median(all_events, axis=0) # Compute MAD, to have a robust estimator of the expected deviation # from median at long distances mad = np.median(abs(all_events - event_med)) exp_stdev = mad / 0.67449 # Iterate over sites, from furthest to frag+2 for site in range(max_sites)[:1:-1]: # For uncuts and loops, keep the last (closest) site where the # deviation from other events <= expected_stdev if (abs(np.log(n_events["+-"][site]) - event_med[site]) <= exp_stdev): thr_uncut = site if (abs(np.log(n_events["-+"][site]) - event_med[site]) <= exp_stdev): thr_loop = site if thr_uncut is None or thr_loop is None: raise ValueError( "The threshold for loops or uncut could not be estimated. " "Please try running with -i to investigate the problem.") logger.info("Filtering with thresholds: uncuts={0} loops={1}".format( thr_uncut, thr_loop)) if plot_events: try: plt.figure(1) plt.xlim([-0.5, 15]) # Draw colored lines for events to discard plt.plot( range(0, thr_uncut + 1), n_events["+-"][:thr_uncut + 1], "o-", c=colors["+-"], label=legend["+-"], ) plt.plot( range(0, thr_loop + 1), n_events["-+"][:thr_loop + 1], "o-", c=colors["-+"], label=legend["-+"], ) plt.plot( range(0, 2), n_events["--"][:2], "o-", c=colors["--"], label=legend["--"], ) plt.plot( range(0, 2), n_events["++"][:2], "o-", c=colors["++"], label=legend["++"], ) # Draw black lines for events to keep plt.plot( range(thr_uncut, n_events["+-"].shape[0]), n_events["+-"][thr_uncut:], "o-", range(thr_loop, n_events["-+"].shape[0]), n_events["-+"][thr_loop:], "o-", range(1, n_events["--"].shape[0]), n_events["--"][1:], "o-", range(1, n_events["++"].shape[0]), n_events["++"][1:], "o-", label="kept", linewidth=2.0, c="g", ) plt.grid() plt.xlabel("Number of restriction site(s)") plt.ylabel("Number of events") plt.yscale("log") # Remove duplicate "kept" entries in legend handles, labels = plt.gca().get_legend_handles_labels() by_label = OrderedDict(zip(labels, handles)) plt.legend(by_label.values(), by_label.keys()) # Show uncut and loop threshold as vertical lines plt.axvline(x=thr_loop, color=colors["-+"]) plt.axvline(x=thr_uncut, color=colors["+-"]) if prefix: plt.title( "Library events by distance in {}".format(prefix)) plt.tight_layout() if fig_path: plt.savefig(fig_path) else: plt.show(block=False) # plt.clf() except Exception: logger.error( "Unable to show plots, skipping figure generation. Is " "an X server running? (might be due to windows " "environment). Try running without the plot option.") return thr_uncut, thr_loop