Example #1
def filter_pcr_dup(pairs_idx_file, filtered_file):
    """
    Filter out PCR duplicates from a coordinate-sorted pairs file by
    detecting overrepresented exact coordinates. If multiple pairs have
    reads with exactly the same coordinates, only one of those pairs is
    kept.

    Parameters
    ----------
    pairs_idx_file : str
        Path to an indexed pairs file containing the Hi-C reads.
    filtered_file : str
        Path to the output pairs file after removing duplicates.
    """
    # Keep count of how many reads are filtered
    filter_count = 0
    reads_count = 0
    # Store header lines
    header = hio.get_pairs_header(pairs_idx_file)
    with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered:
        # Copy header lines to filtered file
        for head_line in header:
            filtered.write(head_line + "\n")
            next(pairs)

        # Use csv methods to easily access columns
        paircols = [
            "readID",
            "chr1",
            "pos1",
            "chr2",
            "pos2",
            "strand1",
            "strand2",
            "frag1",
            "frag2",
        ]
        # Columns used for comparison of coordinates
        coord_cols = [col for col in paircols if col != "readID"]
        pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols)
        filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols)

        # Initialise a variable to store coordinates of reads in previous pair
        prev = {k: 0 for k in paircols}
        for pair in pair_reader:
            reads_count += 1
            # If coordinates are the same as before, skip pair
            if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols):
                filter_count += 1
                continue
            # Else write pair and store new coordinates as previous
            else:
                filt_writer.writerow(pair)
                prev = pair
        # Guard against empty input when computing the percentage
        logger.info(
            "%.1f%% PCR duplicates have been filtered out (%d / %d pairs)"
            % (100 * filter_count / max(reads_count, 1), filter_count, reads_count)
        )
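
These examples are excerpted from a larger module and assume module-level imports such as import csv, import numpy as np, import pandas as pd, import shutil as st, import hicstuff.io as hio, plus a configured logger. A minimal usage sketch for this function (file names are hypothetical): since duplicates are detected by comparing each pair with the previous one, the input must be coordinate-sorted so that identical pairs end up adjacent.

# Hypothetical paths; the input must be coordinate-sorted (e.g. with
# hio.sort_pairs) so that duplicate pairs are adjacent.
filter_pcr_dup("sample.pairs.idx.sorted", "sample.pairs.idx.filtered")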
Example #2
def get_distance_law(
    pairs_reads_file,
    fragments_file,
    centro_file=None,
    base=1.1,
    out_file=None,
    circular=False,
    rm_centro=0,
):
    """Compute distance law as a function of the genomic coordinate aka P(s).
    Bin length increases exponentially with distance. Works on pairs file 
    format from 4D Nucleome Omics Data Standards Working Group. If the genome 
    is composed of several chromosomes and you want to compute the arms 
    separately, provide a file with the positions of centromers. Create a file 
    with three coulumns separated by a tabulation. The first column contains 
    the xs, the second the ps and the third the name of the arm or chromosome. 
    The file is create in the directory given in outdir or in the current 
    directory if no directory given.

    Parameters
    ----------
    pairs_reads_file : str
        Path to a pairs file in the format of the 4D Nucleome Omics Data
        Standards Working Group, where the 8th and 9th columns are the
        fragment IDs of reads 1 and 2.
    fragments_file : str
        Path to a table whose first column contains the fragment ID, the
        second the chromosome name, and the third and fourth the start and
        end positions of the fragment. The file has a header (same layout
        as the 'fragments_list.txt' file from hicstuff).
    centro_file : None or str
        None, or path to a file with the genomic positions of the
        centromeres, given in the same order as the chromosomes and
        separated by spaces. The file has only one line.
    base : float
        Base used to construct the logspace of the bins; 1.1 by default.
    out_file : None or str
        Path of the output file. If no path is given, the output is returned.
    circular : bool
        If True, compute distances as if the chromosomes were circular.
        Default is False. Cannot be True if centro_file is not None.
    rm_centro : int
        If a nonzero value is given, remove contacts closer to the
        centromeres than that many kilobases. Default is 0.

    Returns
    -------
    xs : list of numpy.ndarray
        Basepair coordinates of log bins used to compute distance law.
    ps : list of numpy.ndarray
        Contacts value, in arbitrary units, at increasingly long genomic ranges
        given by xs.
    names : list of strings
        Names of the chromosomes or arms.
    """
    # Sanity check: centro_file should be None if chromosomes are circular
    # (circular chromosomes have no centromeres).
    if circular and centro_file is not None:
        logger.error("Chromosomes cannot have a centromere and be circular")
        sys.exit(1)
    # Import the first four columns of the fragments file
    fragments = pd.read_csv(fragments_file,
                            sep="\t",
                            header=0,
                            usecols=[0, 1, 2, 3])
    # Calculate the indices of the bins separating chromosomes/arms
    chr_segment_bins = get_chr_segment_bins_index(fragments, centro_file,
                                                  rm_centro)
    # Calculate the length of each chromosome/arm
    chr_segment_length = get_chr_segment_length(fragments, chr_segment_bins)
    xs = logbins_xs(fragments, chr_segment_length, base, circular)
    # Create the list of p(s), with one array per chromosome/arm; each
    # array contains as many values as there are logbins
    ps = [None] * len(chr_segment_length)
    for i in range(len(xs)):
        ps[i] = [0] * len(xs[i])
    # Read the pair reads file
    with open(pairs_reads_file, "r", newline="") as reads:
        # Skip the header lines
        header_length = len(hio.get_pairs_header(pairs_reads_file))
        for i in range(header_length):
            next(reads)
        # Read all remaining lines and put the values in a dictionary with
        # the keys: 'readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1',
        # 'strand2', 'frag1', 'frag2'
        reader = csv.DictReader(
            reads,
            fieldnames=[
                "readID",
                "chr1",
                "pos1",
                "chr2",
                "pos2",
                "strand1",
                "strand2",
                "frag1",
                "frag2",
            ],
            delimiter="\t",
        )
        for line in reader:
            # Process each line of the file after the header
            get_pairs_distance(line, fragments, chr_segment_bins,
                               chr_segment_length, xs, ps, circular)
    # Divide the number of contacts by the area of the logbin
    for i in range(len(xs)):
        n = chr_segment_length[i]
        for j in range(len(xs[i]) - 1):
            # Normalise by the area of the logbin: a trapezium inside the
            # contact matrix of size n (the chromosome/arm length), of area
            # ((2n - x[j+1] - x[j]) / 2) * ((x[j+1] - x[j]) / sqrt(2))
            ps[i][j] /= ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) * (
                (1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j]))
        # Case of the last logbin, which is an isosceles right triangle
        ps[i][-1] /= ((n - xs[i][-1])**2) / 2
    names = get_names(fragments, chr_segment_bins)
    if out_file:
        export_distance_law(xs, ps, names, out_file)
    return xs, ps, names
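
A minimal usage sketch (file names are hypothetical): compute P(s) separately for each chromosome arm by providing centromere positions, excluding contacts near centromeres, and write the result to a file.

# Hypothetical paths; fragments_list.txt follows the hicstuff layout
xs, ps, names = get_distance_law(
    "sample.pairs.idx",
    "fragments_list.txt",
    centro_file="centromeres.txt",  # one line of space-separated positions
    rm_centro=10,                   # drop contacts within 10 kb of centromeres
    out_file="distance_law.txt",
)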
Example #3
def chrom_contact_to_bedgraph(pairs_file,
                              bg_file,
                              focus_chrom,
                              binsize=5000,
                              threads=1,
                              buffer="2G"):
    """
    Compute a 1D coverage track of contacts with a given chromosome from a
    pairs file and write it to a bedgraph file.

    Parameters
    ----------
    pairs_file : str
        Path to the input pairs file. It is tab-separated, with header lines
        starting with "#", and contains 7 columns: readID,chr1,pos1,chr2,pos2,
        strand1,strand2. Assumes the pairs file is upper-triangular (chr1,pos1
        is always smaller than chr2,pos2).
    bg_file : str
        Path to the output bedgraph file. It is tab separated and contains 4
        columns: chrom,start,end,value.
    focus_chrom : str
        Name of the chromosome with which contacts should be computed.
    binsize : int
        Size of bins in which to compute contacts, in basepairs.
    threads : int
        Number of threads to use when sorting the pairs file.
    buffer : str
        Amount of memory to allocate for sorting, as a size string (e.g. "2G").
    
    Examples
    --------
    chrom_contact_to_bedgraph('libxyz.pairs', 'libxyz.bedgraph', 'HBV')
    """
    # Sort the pairs file by coordinate and count the header lines
    hio.sort_pairs(
        pairs_file,
        pairs_file + ".sorted",
        ["chr1", "pos1", "chr2", "pos2"],
        threads=threads,
        buffer=buffer,
    )
    header = hio.get_pairs_header(pairs_file + ".sorted")
    # Create bedgraph bins for each chromosome
    bg_bins = OrderedDict()
    for line in header:
        if line.startswith("#chromsize"):
            chrom = line.split()[1:]
            # Initialize bins for each chromosome (at least 1), using ceiling
            # division so positions near the chromosome end fall in the last bin
            n_bins = max((int(chrom[1]) + binsize - 1) // binsize, 1)
            bg_bins[chrom[0]] = np.zeros(n_bins)

    # bg_bins should look like this: {'chr1': [0, 0, 0, ...], 'chr2': ...}
    # where each array has as many zeros as there are bins in the chromosome.

    with open(pairs_file + ".sorted", "r") as pairs:
        # Ignore header lines
        for _ in range(len(header)):
            next(pairs)
        # Define reader object for input pairs file.
        pairs_cols = [
            "readID", "chr1", "pos1", "chr2", "pos2", "strand1", "strand2"
        ]
        pairs_reader = csv.DictReader(pairs,
                                      delimiter="\t",
                                      fieldnames=pairs_cols)

        # Read and record pair contacts with focus chromosome
        for pair in pairs_reader:
            try:
                # If chromosome of interest is first mate, increment second mate
                # bin coverage by 1
                if pair["chr1"] == focus_chrom:
                    bg_bins[pair["chr2"]][int(pair["pos2"]) // binsize] += 1
                # If it is second mate, increment first mate bin coverage
                elif pair["chr2"] == focus_chrom:
                    bg_bins[pair["chr1"]][int(pair["pos1"]) // binsize] += 1
            except IndexError:
                # Position falls outside the declared chromosome length;
                # report the offending pair and skip it
                print(pair)

    # Write contact value as a 1D bedgraph file
    with open(bg_file, "w") as bg:
        for chrom in bg_bins:
            for bin_idx, bin_contacts in enumerate(bg_bins[chrom]):
                # Write single line as "chrom start end contacts"
                bg.write("{0}\t{1}\t{2}\t{3}\n".format(chrom,
                                                       bin_idx * binsize,
                                                       (bin_idx + 1) * binsize,
                                                       bin_contacts))
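
A minimal usage sketch (paths and chromosome name are hypothetical): bin all contacts involving one chromosome of interest at 1 kb resolution.

# Hypothetical usage: coverage of contacts with chromosome "HBV" in 1 kb bins
chrom_contact_to_bedgraph(
    "libxyz.pairs", "libxyz.bedgraph", "HBV", binsize=1000, threads=4
)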
Example #4
def attribute_fragments(pairs_file, idx_pairs_file, restriction_table):
    """
    Write an indexed pairs file, which has two more columns than the input
    pairs file, corresponding to the restriction fragment index of each read.
    Note that pairs files use 1-based point positions whereas the restriction
    table uses 0-based positions.

    Parameters
    ----------
    pairs_file: str
        Path to the input pairs file. Consists of 7 white-space separated
        columns: readID, chr1, pos1, chr2, pos2, strand1, strand2
    idx_pairs_file: str
        Path to the output indexed pairs file. Consists of 9 white-space
        separated columns: readID, chr1, pos1, chr2, pos2, strand1, strand2,
        frag1, frag2. frag1 and frag2 are 0-based restriction fragment
        indices over the whole genome.
    restriction_table: dict
        Dictionary with chromosome identifiers (str) as keys and list of
        positions (int) of restriction sites as values.
    """

    # NOTE: Bottlenecks here are 1. binary search in find_frag and 2. writerow
    # 1. could be reduced by searching groups of N frags in parallel and 2. by
    # writing N frags simultaneously using a single call of writerows.

    # Parse and update header section
    pairs_header = hio.get_pairs_header(pairs_file)
    header_size = len(pairs_header)
    chrom_order = []
    with open(idx_pairs_file, "w") as idx_pairs:
        for line in pairs_header:
            # Add new column names to header
            if line.startswith("#columns"):
                line = line.rstrip() + " frag1 frag2"
            if line.startswith("#chromsize"):
                chrom_order.append(line.split()[1])
            idx_pairs.write(line + "\n")

    # Get number of fragments per chrom to allow genome-based indices
    shift_frags = {}
    prev_frags = 0
    for rank, chrom in enumerate(chrom_order):
        if rank > 0:
            # Note the "-1" because there are nfrags + 1 sites in rest table
            prev_frags += len(restriction_table[chrom_order[rank - 1]]) - 1
        # Idx of each chrom's frags will be shifted by n frags in previous chroms
        shift_frags[chrom] = prev_frags

    missing_contigs = set()
    # Attribute pairs to fragments and append them to output file (after header)
    with open(pairs_file, "r") as pairs, open(idx_pairs_file,
                                              "a") as idx_pairs:
        # Skip header lines
        for _ in range(header_size):
            next(pairs)

        # Define input and output fields
        pairs_cols = [
            "readID",
            "chr1",
            "pos1",
            "chr2",
            "pos2",
            "strand1",
            "strand2",
        ]
        idx_cols = pairs_cols + ["frag1", "frag2"]

        # Use csv reader / writer to automatically parse columns into a dict
        pairs_reader = csv.DictReader(pairs,
                                      fieldnames=pairs_cols,
                                      delimiter="\t")
        pairs_writer = csv.DictWriter(idx_pairs,
                                      fieldnames=idx_cols,
                                      delimiter="\t")

        for pair in pairs_reader:
            # Get the 0-based indices of the corresponding restriction
            # fragments. Subtract 1 from the pair position to convert it
            # from a 1-based to a 0-based point position.
            pair["frag1"] = find_frag(
                int(pair["pos1"]) - 1, restriction_table[pair["chr1"]])
            pair["frag2"] = find_frag(
                int(pair["pos2"]) - 1, restriction_table[pair["chr2"]])
            # Shift fragment indices to make them genome-based instead of
            # chromosome-based. Pairs on contigs absent from the header are
            # recorded and discarded, as announced in the warning below.
            try:
                pair["frag1"] += shift_frags[pair["chr1"]]
                pair["frag2"] += shift_frags[pair["chr2"]]
            except KeyError:
                for contig in (pair["chr1"], pair["chr2"]):
                    if contig not in shift_frags:
                        missing_contigs.add(contig)
                continue

            # Write the indexed pair to the new file
            pairs_writer.writerow(pair)

        if missing_contigs:
            logger.warning(
                "Pairs on the following contigs were discarded as "
                "those contigs are not listed in the paris file header. "
                "This is normal if you filtered out small contigs: %s" %
                " ".join(list(missing_contigs)))
Example #5
def pairs2matrix(pairs_file,
                 mat_file,
                 fragments_file,
                 mat_fmt="graal",
                 threads=1,
                 tmp_dir=None):
    """Generate the matrix by counting the number of occurences of each
    combination of restriction fragments in a pairs file.

    Parameters
    ----------
    pairs_file : str
        Path to a Hi-C pairs file, with frag1 and frag2 columns added.
    mat_file : str
        Path where the matrix will be written.
    fragments_file : str
        Path to the fragments_list.txt file. Used to know the total matrix
        size in case some fragments have no observed contacts.
    mat_fmt : str
        The format to use when writing the matrix. Can be graal or bg2 format.
    threads : int
        Number of threads to use in parallel.
    tmp_dir : str
        Temporary directory for sorting files. If None given, will use the system default.
    """
    # Number of fragments is N lines in frag list - 1 for the header
    n_frags = sum(1 for line in open(fragments_file, "r")) - 1
    frags = pd.read_csv(fragments_file, delimiter="\t")

    def write_mat_entry(frag1, frag2, contacts):
        """Write a single sparse matrix entry in either graal or bg2 format"""
        if mat_fmt == "graal":
            mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n")
        elif mat_fmt == "bg2":
            frag1, frag2 = int(frag1), int(frag2)
            mat.write("\t".join(
                map(
                    str,
                    [
                        frags.chrom[frag1],
                        frags.start_pos[frag1],
                        frags.end_pos[frag1],
                        frags.chrom[frag2],
                        frags.start_pos[frag2],
                        frags.end_pos[frag2],
                        contacts,
                    ],
                )) + "\n")

    pre_mat_file = mat_file + ".pre.pairs"
    hio.sort_pairs(
        pairs_file,
        pre_mat_file,
        keys=["frag1", "frag2"],
        threads=threads,
        tmp_dir=tmp_dir,
    )
    header_size = len(hio.get_pairs_header(pre_mat_file))
    with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
        # Skip header lines
        for _ in range(header_size):
            next(pairs)
        prev_pair = ["0", "0"]  # Pairs identified by [frag1, frag2]
        n_occ = 0  # Number of occurrences of each frag combination
        n_nonzero = 0  # Total number of nonzero matrix entries
        n_pairs = 0  # Total number of pairs entered in the matrix
        pairs_reader = csv.reader(pairs, delimiter="\t")
        # First line contains nrows, ncols and number of nonzero entries.
        # Number of nonzero entries is unknown for now
        if mat_fmt == "graal":
            mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
        for pair in pairs_reader:
            # Fragment ids are field 8 and 9
            curr_pair = [pair[7], pair[8]]
            # Increment number of occurences for fragment pair
            if prev_pair == curr_pair:
                n_occ += 1
            # Write previous pair and start a new one
            else:
                if n_occ > 0:
                    write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
                prev_pair = curr_pair
                n_pairs += n_occ
                n_occ = 1
                n_nonzero += 1
        # Write the last group, if the file contained any pairs. Its nonzero
        # entry was already counted when the group started in the loop above.
        if n_occ > 0:
            write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
            n_pairs += n_occ

    # Edit the header line in place to fill in the number of nonzero entries
    # (graal format only)
    if mat_fmt == "graal":
        with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
            header = mat.readline()
            header = header.replace("-", str(n_nonzero))
            tmp_mat.write(header)
            st.copyfileobj(mat, tmp_mat)
        # Replace the matrix file with the one with corrected header
        os.rename(pre_mat_file, mat_file)
    else:
        os.remove(pre_mat_file)

    logger.info(
        "%d pairs used to build a contact map of %d bins with %d nonzero entries.",
        n_pairs,
        n_frags,
        n_nonzero,
    )
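
A minimal usage sketch (paths are hypothetical): the input pairs file must already carry the frag1 and frag2 columns added by attribute_fragments.

pairs2matrix(
    "sample.pairs.idx",    # indexed pairs file (hypothetical path)
    "sample.mat.tsv",      # output sparse matrix in graal format
    "fragments_list.txt",  # used for matrix dimensions and bg2 coordinates
    mat_fmt="graal",
    threads=4,
)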