Example #1
def require_cooler(fun):  # enclosing decorator reconstructed; the name is assumed
    def wrapped(*args, **kwargs):
        try:
            import cooler

            # Inject the module into the wrapped function's global namespace.
            fun.__globals__["cooler"] = cooler
        except ImportError:
            logger.error(
                "The cooler package is required to use {0}, please install it first"
                .format(fun.__name__))
            raise ImportError("The cooler package is required.")
        return fun(*args, **kwargs)

    return wrapped
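A short usage sketch, assuming the reconstructed decorator name require_cooler above (hypothetical) and the standard cooler API:

# Hypothetical usage of the decorator sketched above.
@require_cooler
def load_matrix(cool_path):
    # 'cooler' is available here because the decorator injected it.
    return cooler.Cooler(cool_path).matrix(balance=False)[:]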
Example #2
def average_distance_law(xs, ps, sup, big_arm_only=False):
    """Compute the average distance law between the file the different distance
    law of the chromosomes/arms.

    Parameters
    ----------
    xs : list of numpy.ndarray
        The list of logbins.
    ps : list of numpy.ndarray
        The list of distance law values, one array per chromosome/arm.
    sup : int
        Minimum size (in base pairs) that a chromosome/arm must have to be
        included in the average when big_arm_only is set.
    big_arm_only : bool
        By default False. If True, only the arms/chromosomes longer than sup
        are taken into account. sup is mandatory if set.

    Returns
    -------
    numpy.ndarray :
        The xs array with the maximum length.
    numpy.ndarray :
        The averaged p(s) values.
    """
    # Find longest chromosome / arm and make two arrays of this length for the
    # average distance law and remove the last value.
    xs = max(xs, key=len)
    max_length = len(xs)
    ps_values = np.zeros(max_length)
    ps_occur = np.zeros(max_length)
    for chrom_ps in ps:
        # Iterate on ps in order to calculate the number of occurrences (all
        # the chromosomes/arms are not as long as the longest one) and the sum of
        # the values of distance law.
        # Change the last value to have something continuous because the last
        # one is much bigger.
        chrom_ps[-1] = chrom_ps[-2]
        # Sanity check: sup must be strictly less than the maximum arm length.
        if big_arm_only:
            if sup >= xs[-1]:
                logger.error(
                    "sup must be less than the maximum arm/chromosome length "
                    "when big_arm_only is set"
                )
                sys.exit(1)
            if sup <= xs[len(chrom_ps) - 1]:
                ps_occur[:len(chrom_ps)] += 1
                ps_values[:len(chrom_ps)] += chrom_ps
        else:
            ps_occur[:len(chrom_ps)] += 1
            ps_values[:len(chrom_ps)] += chrom_ps
    # Make the mean
    averaged_ps = ps_values / ps_occur
    return xs, averaged_ps
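A minimal usage sketch, assuming numpy is imported as np and average_distance_law is in scope:

import numpy as np

# Two arms of unequal length; the longest xs is kept as the reference.
xs = [np.array([1e3, 1e4, 1e5, 1e6]), np.array([1e3, 1e4, 1e5])]
ps = [np.array([0.5, 0.3, 0.15, 0.05]), np.array([0.6, 0.3, 0.1])]
avg_xs, avg_ps = average_distance_law(xs, ps, sup=0)
# Each bin is averaged over however many arms are long enough to reach it.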
Example #3
def normalize_distance_law(xs, ps, inf=3000, sup=None):
    """Normalize the distance in order to have the sum of the ps values between
    'inf' (default value is 3kb) until the end of the array equal to one and
    limit the effect of coverage between two conditions/chromosomes/arms when
    you compare them together. If we have a list of ps, it will normalize until
    the length of the shorter object or the value of sup, whichever is smaller.

    Parameters
    ----------
    xs : list of numpy.ndarray
        list of logbins corresponding to the ps.
    ps : list of numpy.ndarray
        Average ps or list of ps of the chromosomes/arms. xs and ps have to 
        have the same shape.
    inf : integer
        Lower bound of the interval on which the normalization is applied.
    sup : integer
        Upper bound of the interval on which the normalization is applied.

    Returns
    -------
    list of numpy.ndarray :
        List of ps each normalized separately.
    """
    # Sanity check: xs and ps have the same dimension
    if np.shape(xs) != np.shape(ps):
        logger.error("xs and ps should have the same dimension.")
        sys.exit(1)
    # The length of the shortest xs caps the normalization interval (acts as sup)
    min_xs = len(min(xs, key=len))
    normed_ps = [None] * len(ps)
    if sup is None:
        sup = np.inf
    for chrom_id, chrom_ps in enumerate(ps):
        # Iterate on the different ps to normalize each of them separately
        chrom_sum = 0
        # Change the last value to have something continuous because the last
        # one is much bigger (computed on matrix corner = triangle instead of trapezoid).
        chrom_ps[-1] = chrom_ps[-2]
        for bin_id, bin_value in enumerate(chrom_ps):
            # Compute normalization factor based on values between inf and sup
            # Sup will be whatever is smaller between user-provided sup and length of
            # the shortest chromosome
            if (xs[chrom_id][bin_id] > inf) and (xs[chrom_id][bin_id] <
                                                 sup) and (bin_id < min_xs):
                chrom_sum += bin_value
        if chrom_sum == 0:
            chrom_sum += 1
            logger.warning("No values of p(s) in one segment")
        # Make the normalisation
        normed_ps[chrom_id] = np.array(ps[chrom_id]) / chrom_sum
    return normed_ps
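A minimal usage sketch of the normalization, assuming numpy is imported as np:

import numpy as np

xs = [np.array([1e3, 5e3, 2e4, 1e5])]
ps = [np.array([10.0, 4.0, 2.0, 8.0])]
normed = normalize_distance_law(xs, ps, inf=3000)
# After the call, the values of normed[0] at bins with x > 3000 sum to one
# (the last p(s) value is first replaced by its neighbor).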
Example #4
def export_distance_law(xs, ps, names, out_dir=None):
    """ Export the x(s) and p(s) from two list of numpy.ndarrays to a table
    in txt file with three columns separated by a tabulation. The first column
    contains the x(s), the second the p(s) and the third the name of the arm or
    chromosome. The file is createin the directory given by outdir or the
    current directory if no directory given.

    Parameters
    ----------
    xs : list of numpy.ndarray
        The list of the start position of logbins of each p(s) in base pairs.
    ps : list of numpy.ndarray
        The list of p(s).
    names : list of string
        List containing the names of the chromosomes/arms/conditions of the p(s)
        values given.
    out_dir : str or None
        Path of the output file. Defaults to "distance_law.txt" in the
        current directory.

    Returns
    -------
    txt file:
         File with three columns separated by tabulations. The first column
         contains the x(s), the second the p(s) and the third the name of the
         arm or chromosome.
    """
    # Default to "distance_law.txt" in the current directory if no path given.
    if out_dir is None:
        out_dir = os.path.join(os.getcwd(), "distance_law.txt")
    # Sanity check: as many names as chromosomes/arms
    if len(xs) != len(names):
        logger.error("Number of chromosomes/arms and number of names differ.")
        sys.exit(1)
    # Create the file and write it
    with open(out_dir, "w") as f:
        for i in range(len(xs)):
            for j in range(len(xs[i])):
                line = (
                    str(format(xs[i][j], "g"))
                    + "\t"
                    + str(format(ps[i][j], "g"))
                    + "\t"
                    + names[i]
                    + "\n"
                )
                f.write(line)
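A minimal usage sketch; the output path is hypothetical:

import numpy as np

xs = [np.array([1000, 10000])]
ps = [np.array([0.7, 0.3])]
export_distance_law(xs, ps, names=["chr1"], out_dir="distance_law.txt")
# Writes tab-separated lines such as: 1000<TAB>0.7<TAB>chr1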
Example #5
def normalize_distance_law(xs, ps, inf=3000):
    """Normalize the distance in order to have the sum of the ps values between
    'inf' (default value is 3kb) until the end of the array equal to one and
    limit the effect of coverage between two conditions/chromosomes/arms when
    you compare them together. If we have a list of ps, it will normalize until
    the length of the shorter object.

    Parameters
    ----------
    xs : list of numpy.ndarray
        list of logbins corresponding to the ps.
    ps : list of numpy.ndarray
        Average ps or list of ps of the chromosomes/arms. xs and ps have to 
        have the same shape.
    inf : integer
        Lower bound of the interval on which the normalization is applied.

    Returns
    -------
    list of numpy.ndarray :
        List of ps each normalized separately.
    """
    # Sanity check: xs and ps have the same dimension
    if np.shape(xs) != np.shape(ps):
        logger.error("xs and ps should have the same dimension.")
        sys.exit(1)
    # Use the length of the shortest xs as the upper limit of the interval
    # used for the normalization
    min_xs = len(min(xs, key=len))
    normed_ps = [None] * len(ps)
    for j, my_list in enumerate(ps):
        # Iterate on the different ps to normalize each of them separately
        sum_values = 0
        # Change the last value to have something continuous because the last
        # one is much bigger.
        my_list[-1] = my_list[-2]
        for i, value in enumerate(my_list):
            # Keep only the values between inf and the length of the shortest
            # object given in the list
            if (xs[j][i] > inf) and (i < min_xs):
                sum_values += value
        if sum_values == 0:
            sum_values += 1
            logger.warning("No values of p(s) in one segment")
        # Make the normalisation
        normed_ps[j] = np.array(ps[j]) / sum_values
    return normed_ps
Example #6
def get_distance_law(
    pairs_reads_file,
    fragments_file,
    centro_file=None,
    base=1.1,
    out_file=None,
    circular=False,
    rm_centro=0,
):
    """Compute distance law as a function of the genomic coordinate aka P(s).
    Bin length increases exponentially with distance. Works on pairs file 
    format from 4D Nucleome Omics Data Standards Working Group. If the genome 
    is composed of several chromosomes and you want to compute the arms 
    separately, provide a file with the positions of centromers. Create a file 
    with three coulumns separated by a tabulation. The first column contains 
    the xs, the second the ps and the third the name of the arm or chromosome. 
    The file is create in the directory given in outdir or in the current 
    directory if no directory given.

    Parameters
    ----------
    pairs_reads_file : string
        Path to a pairs file following the format of the 4D Nucleome Omics
        Data Standards Working Group, where the 8th and 9th columns are the
        fragment IDs of reads 1 and 2.
    fragments_file : path
        Path to a table containing in the first column the ID of the fragment,
        in the second the name of the chromosome, and in the third and fourth
        the start and end positions of the fragment. The file has no header.
        (File like the 'fragments_list.txt' from hicstuff)
    centro_file : None or str
        None, or path to a file with the genomic positions of the centromeres
        in the same order as the chromosomes, separated by spaces, on a single
        line.
    base : float
        Base used to construct the logspace of the bins - 1.1 by default.
    out_file : None or str
        Path of the output file. If no path is given, the output is only
        returned.
    circular : bool
        If True, compute the distance as if the chromosome were circular.
        Default value is False. Cannot be True if centro_file is not None.
    rm_centro : int
        If a value is given, contacts closer to the centromeres than that
        many kb are removed. Default is 0.

    Returns
    -------
    xs : list of numpy.ndarray
        Basepair coordinates of log bins used to compute distance law.
    ps : list of numpy.ndarray
        Contacts value, in arbitrary units, at increasingly long genomic ranges
        given by xs.
    names : list of strings
        Names of chromosomes that are plotted
    """
    # Sanity check: centro_file should be None if chromosomes are circular
    # (circular chromosomes have no centromeres).
    if circular and centro_file is not None:
        logger.error("Chromosomes cannot have a centromere and be circular")
        sys.exit(1)
    # Import the first four columns of the fragments file
    fragments = pd.read_csv(fragments_file,
                            sep="\t",
                            header=0,
                            usecols=[0, 1, 2, 3])
    # Compute the indices of the bins separating the chromosomes/arms
    chr_segment_bins = get_chr_segment_bins_index(fragments, centro_file,
                                                  rm_centro)
    # Calculate the length of each chromosome/arm
    chr_segment_length = get_chr_segment_length(fragments, chr_segment_bins)
    xs = logbins_xs(fragments, chr_segment_length, base, circular)
    # Create the list of p(s) with one array for each chromosome/arm; each
    # array contains as many values as there are logbins
    ps = [None] * len(chr_segment_length)
    for i in range(len(xs)):
        ps[i] = [0] * len(xs[i])
    # Read the pair reads file
    with open(pairs_reads_file, "r", newline="") as reads:
        # Skip the header lines
        header_length = len(hio.get_pairs_header(pairs_reads_file))
        for i in range(header_length):
            next(reads)
        # Read all the other lines and put the values in a dictionary with
        # the keys: 'readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1',
        # 'strand2', 'frag1', 'frag2'
        reader = csv.DictReader(
            reads,
            fieldnames=[
                "readID",
                "chr1",
                "pos1",
                "chr2",
                "pos2",
                "strand1",
                "strand2",
                "frag1",
                "frag2",
            ],
            delimiter="\t",
        )
        for line in reader:
            # Iterate on each line of the file after the header
            get_pairs_distance(line, fragments, chr_segment_bins,
                               chr_segment_length, xs, ps, circular)
    # Divide the number of contacts by the area of the logbin
    for i in range(len(xs)):
        n = chr_segment_length[i]
        for j in range(len(xs[i]) - 1):
            # Use the area of a trapezoid to compute the area of the logbin,
            # with n the size of the matrix.
            ps[i][j] /= ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) * (
                (1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j]))
        # Case of the last logbin, which is a right isosceles triangle
        ps[i][-1] /= ((n - xs[i][-1])**2) / 2
    names = get_names(fragments, chr_segment_bins)
    if out_file:
        export_distance_law(xs, ps, names, out_file)
    return xs, ps, names
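A hypothetical call, assuming hicstuff-style input files already exist on disk:

xs, ps, names = get_distance_law(
    "valid_idx.pairs",       # pairs file with frag1/frag2 columns
    "fragments_list.txt",    # fragment table produced by hicstuff
    base=1.1,
    out_file="distance_law.txt",
)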
Example #7
def get_pairs_distance(line,
                       fragments,
                       chr_segment_bins,
                       chr_segment_length,
                       xs,
                       ps,
                       circular=False):
    """From a line of a pair reads file, filter -/+ or +/- reads, keep only the 
    reads in the same chromosome/arm and compute the distance of the the two
    fragments. It modify the input ps in order to count or not the line given. 
    It will add one in the logbin corresponding to the distance.

    Parameters
    ----------
    line : OrderedDict
        Line of a pairs file with these keys: readID, chr1, pos1, chr2,
        pos2, strand1, strand2, frag1, frag2. The values are in a dictionary.
    fragments : pandas.DataFrame
        Table containing in the first column the ID of the fragment, in the
        second the name of the chromosome, and in the third and fourth the
        start and end positions of the fragment. The file has no header.
        (File like the 'fragments_list.txt' from hicstuff)
    chr_segment_bins : list of floats
        The start and end indices of chromosomes/arms to compute the distance
        law on each chromosome/arm separately.
    chr_segment_length: list of floats
        List of the size in base pairs of the different arms or chromosomes.
    xs : list of lists
        The start coordinate of each bin, one array per chromosome or arm.
    ps : list of lists
        The running sums of contacts already counted. xs and ps should have
        the same dimensions.
    circular : bool
        If True, compute the distance as if the chromosome were circular.
        Default value is False.
    """
    # Check this is a pairs_idx file and not simple pairs
    if line['frag1'] is None:
        logger.error(
            'Input pairs file must have frag1 and frag2 columns. In hicstuff '
            'pipeline, this is the "valid_idx.pairs" file.')
        sys.exit(1)
    # We only keep the +/+ and -/- events. This avoids counting uncut events,
    # which cannot occur with these orientations. We can discard the valid
    # +/- and -/+ events because we do not need many reads to compute the
    # distance law, and removing them introduces no additional bias since
    # they should have the same distribution.
    if line["strand1"] == line["strand2"]:
        # Find which chromosome/arm fragments 1 and 2 belong to.
        chr_bin1 = (np.searchsorted(
            chr_segment_bins, int(line["frag1"]), side="right") - 1)
        chr_bin2 = (np.searchsorted(
            chr_segment_bins, int(line["frag2"]), side="right") - 1)
        # We only keep the reads with the two fragments in the same chromosome
        # or arm.
        if chr_bin1 == chr_bin2:
            # If centromeres were removed, odd bins are centromeric regions;
            # keep only the even (arm) bins.
            if chr_bin1 % 2 == 0:
                chr_bin1 = int(chr_bin1 / 2)
                # For the -/- reads, the fragments are religated at their
                # start positions (leftmost position on the genomic sequence,
                # 5'). For the +/+ reads it is the contrary. We compute the
                # distance between the two religated extremities.
                if line["strand1"] == "-":
                    distance = abs(
                        np.array(fragments["start_pos"][int(line["frag1"])]) -
                        np.array(fragments["start_pos"][int(line["frag2"])]))
                if line["strand1"] == "+":
                    distance = abs(
                        np.array(fragments["end_pos"][int(line["frag1"])]) -
                        np.array(fragments["end_pos"][int(line["frag2"])]))
                if circular:
                    distance = circular_distance_law(distance,
                                                     chr_segment_length,
                                                     chr_bin1)
                xs_temp = xs[chr_bin1][:]
                # Find the logbin in which the distance falls and add one to
                # the sum of contacts.
                ps_indice = np.searchsorted(xs_temp, distance,
                                            side="right") - 1
                ps[chr_bin1][ps_indice] += 1
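The logbin lookup above relies on numpy.searchsorted; a standalone sketch of the same idiom, with made-up bin coordinates:

import numpy as np

xs_temp = np.array([0, 1000, 3000, 10000])  # hypothetical logbin start coordinates
distance = 4500
bin_id = np.searchsorted(xs_temp, distance, side="right") - 1
# bin_id == 2: the distance falls in the bin starting at 3000.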
Example #8
def write_frag_info(
    fasta,
    enzyme,
    min_size=DEFAULT_THRESHOLD_SIZE,
    circular=False,
    output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME,
    output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME,
    output_dir=None,
):
    """Digest and write fragment information

    Write the fragments_list.txt and info_contigs.txt that are necessary for
    instagraal to run.

    Parameters
    ----------
    fasta : pathlib.Path or str
        The path to the reference genome
    enzyme : str, int or list of str
        If a string, must be the name of an enzyme (e.g. DpnII) and the genome
        will be cut at the enzyme's restriction sites. If a number, the genome
        will be cut uniformly into chunks with length equal to that number. A
        list of enzymes can also be specified if using multiple enzymes.
    min_size : float, optional
        Size below which contigs are discarded. Default is 0, i.e. all
        contigs are retained.
    circular : bool, optional
        Whether the genome is circular. Default is False.
    output_contigs : str, optional
        The name of the file with contig info. Default is info_contigs.txt
    output_frags : str, optional
        The name of the file with fragment info. Default is fragments_list.txt
    output_dir : str, optional
        The path to the output directory. Default is the current directory.
    """

    records = SeqIO.parse(fasta, "fasta")

    try:
        info_contigs_path = os.path.join(output_dir, output_contigs)
        frag_list_path = os.path.join(output_dir, output_frags)
    except TypeError:
        info_contigs_path = output_contigs
        frag_list_path = output_frags

    with open(info_contigs_path, "w") as info_contigs:

        info_contigs.write("contig\tlength\tn_frags\tcumul_length\n")

        with open(frag_list_path, "w") as fragments_list:

            fragments_list.write("id\tchrom\tstart_pos"
                                 "\tend_pos\tsize\tgc_content\n")

            total_frags = 0

            for record in records:
                contig_seq = record.seq
                contig_name = record.id
                contig_length = len(contig_seq)
                if contig_length < int(min_size):
                    continue

                sites = get_restriction_table(contig_seq,
                                              enzyme,
                                              circular=circular)
                fragments = (contig_seq[sites[i]:sites[i + 1]]
                             for i in range(len(sites) - 1))
                n_frags = 0

                current_id = 1
                start_pos = 0
                for frag in fragments:
                    frag_length = len(frag)
                    if frag_length > 0:
                        end_pos = start_pos + frag_length
                        gc_content = SeqUtils.GC(frag) / 100.0

                        current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
                            current_id,
                            contig_name,
                            start_pos,
                            end_pos,
                            frag_length,
                            gc_content,
                        )

                        fragments_list.write(current_fragment_line)

                        try:
                            assert (current_id == 1
                                    and start_pos == 0) or (current_id > 1
                                                            and start_pos > 0)
                        except AssertionError:
                            logger.error((current_id, start_pos))
                            raise
                        start_pos = end_pos
                        current_id += 1
                        n_frags += 1

                current_contig_line = "%s\t%s\t%s\t%s\n" % (
                    contig_name,
                    contig_length,
                    n_frags,
                    total_frags,
                )
                total_frags += n_frags
                info_contigs.write(current_contig_line)
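A hypothetical call, assuming a genome.fa file and an existing output directory:

# Digest a genome with DpnII and write both graal-style tables to "out/".
write_frag_info("genome.fa", "DpnII", output_dir="out")
# Or cut uniformly into fixed 10 kb chunks instead of using an enzyme:
write_frag_info("genome.fa", 10000, output_dir="out")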
Example #9
def cut_ligation_sites(
    fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
):
    """Create new reads to manage pairs with a digestion and create multiple
    pairs to take into account all the contact present.

    The function write two files for both the forward and reverse fastq with the
    new reads. The new reads have at the end of the ID ":[0-9]" added to
    differentiate the different pairs created from one read.

    The function will look for all the sites present and create new pairs of
    reads according to the mode given to retreive as much as possible of the HiC
    signal.

    Parameters
    ----------
    fq_for : str
        Path to the forward fastq file to digest.
    fq_rev : str
        Path to the reverse fastq file to digest.
    digest_for : str
        Path to the output digested forward fastq file to write.
    digest_rev : str
        Path to the output digested reverse fastq file to write.
    enzyme : str
        Comma-separated list of the restriction enzymes used to digest the
        genome. Example: HpaII,MluCI.
    mode : str
        Mode to use to make the digestion. Three values possible: "all",
        "for_vs_rev", "pile".
    seed_size : int
        Minimum size of a fragment (i.e. the seed size used in mapping, as
        smaller reads won't be mapped).
    n_cpu : int
        Number of CPUs.
    """
    # Process the ligation sites given
    ligation_sites = hcd.gen_enzyme_religation_regex(enzyme)

    # Define the stop token used to mark the end of the input file
    stop_token = "STOP"
    # A stack is a string containing multiple read pairs
    max_stack_size = 1000

    # Initialize counters to track how the digested pairs are distributed.
    original_number_of_pairs = 0
    final_number_of_pairs = 0
    new_reads_for = ""
    new_reads_rev = ""
    current_stack = 0

    # Start a writer process that flushes digested read stacks to disk.
    queue = multiprocessing.Queue(max(1, n_cpu - 1))
    writer_process = multiprocessing.Process(
        target=_writer, args=(digest_for, digest_rev, queue, stop_token)
    )
    writer_process.start()

    # Iterate on all pairs
    for read_for, read_rev in zip(
        pyfastx.Fastq(fq_for, build_index=False),
        pyfastx.Fastq(fq_rev, build_index=False),
    ):

        # Count the number of original read pairs processed.
        original_number_of_pairs += 1

        # Update the stack size counter.
        current_stack += 1

        # Extract components of the reads.
        for_name, for_seq, for_qual = read_for
        rev_name, rev_seq, rev_qual = read_rev

        # Sanity check to be sure all reads are with their mate.
        if for_name != rev_name:
            logger.error(
                "The fastq files contain unsorted reads:\n{0}\n{1}".format(
                    for_name, rev_name
                )
            )
            sys.exit(1)

        # Cut the forward and reverse reads at the ligation sites.
        for_seq_list, for_qual_list = cutsite_read(
            ligation_sites, for_seq, for_qual, seed_size,
        )
        rev_seq_list, rev_qual_list = cutsite_read(
            ligation_sites, rev_seq, rev_qual, seed_size,
        )

        # Write the new combinations of fragments.
        new_reads_for, new_reads_rev, final_number_of_pairs = write_pair(
            new_reads_for,
            new_reads_rev,
            for_name,
            for_seq_list,
            for_qual_list,
            rev_seq_list,
            rev_qual_list,
            mode,
            final_number_of_pairs,
        )

        # If stack full, add it in the queue.
        if current_stack == max_stack_size:

            # Add the pair in the queue.
            pairs = (new_reads_for.encode(), new_reads_rev.encode())
            queue.put(pairs)

            # Empty the stack
            current_stack = 0
            new_reads_for = ""
            new_reads_rev = ""

    # End the parallel processing.
    pairs = (new_reads_for.encode(), new_reads_rev.encode())
    queue.put(pairs)
    queue.put(stop_token)
    writer_process.join()

    # Return information on the different pairs created
    logger.info(f"Library used: {fq_for} - {fq_rev}")
    logger.info(
        f"Number of pairs before digestion: {original_number_of_pairs}"
    )
    logger.info(
        f"Number of pairs after digestion: {final_number_of_pairs}"
    )
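A hypothetical call on a paired-end library, with file names made up for illustration:

cut_ligation_sites(
    "lib_for.fq.gz",
    "lib_rev.fq.gz",
    "dig_for.fq",
    "dig_rev.fq",
    enzyme="HpaII,MluCI",
    mode="for_vs_rev",
    seed_size=20,
    n_cpu=4,
)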
Example #10
def full_pipeline(
    genome,
    input1,
    input2=None,
    aligner="bowtie2",
    centromeres=None,
    circular=False,
    distance_law=False,
    enzyme=5000,
    filter_events=False,
    force=False,
    iterative=False,
    mat_fmt="graal",
    min_qual=30,
    min_size=0,
    no_cleanup=False,
    out_dir=None,
    pcr_duplicates=False,
    plot=False,
    prefix=None,
    read_len=None,
    remove_centros=None,
    start_stage="fastq",
    threads=1,
    tmp_dir=None,
):
    """
    Run the whole hicstuff pipeline. Starting from fastq files and a genome to
    obtain a contact matrix.

    Parameters
    ----------
    genome : str
        Path to the bowtie2/bwa index prefix if using bowtie2/bwa or to the genome 
        in fasta format if using minimap2.
    input1 : str
        Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
        in BAM format, or the pairs file, depending on the value of start_stage.
    input2 : str
        Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
        in BAM format, or None, depending on the value of start_stage.
    enzyme : int or str
        Name of the enzyme used for the digestion (e.g "DpnII"). If an integer
        is used instead, the fragment attribution will be done directly using a
        fixed chunk size.
    circular : bool
        Use if the genome is circular.
    out_dir : str or None
        Path where output files should be written. Current directory by default.
    tmp_dir : str or None
        Path where temporary files will be written. Creates a "tmp" folder in
        out_dir by default.
    plot : bool
        Whether plots should be generated at different steps of the pipeline.
        Plots are saved in a "plots" directory inside out_dir.
    min_qual : int
        Minimum mapping quality required to keep a pair of Hi-C reads.
    min_size : int
        Minimum contig size required to keep it.
    threads : int
        Number of threads to use for parallel operations.
    no_cleanup : bool
        Whether temporary files should be deleted at the end of the pipeline.
    iterative : bool
        Use iterative mapping. Truncates and extends reads until unambiguous
        alignment.
    filter_events : bool
        Filter spurious or uninformative 3C events. Requires a restriction enzyme.
    force : bool
        If True, overwrite existing files with the same name as output.
    prefix : str or None
        Choose a common name for output files instead of default graal names.
    start_stage : str
        Step at which the pipeline should start. Can be "fastq", "bam", "pairs"
        or "pairs_idx". Starting from "bam" skips alignment and expects
        name-sorted bam files as input. With "pairs", a single pairs file is
        given as input, and with "pairs_idx", the pairs in the input must
        already be attributed to fragments and fragment attribution is skipped.
    mat_fmt : str
        Select the output matrix format. Can be either "bg2" for the
        bedgraph2 format, "cool" for Mirnylab's cool format, or "graal" for a
        plain text COO format compatible with Koszullab's instagraal software.
    aligner : str
        Read alignment software to use. Can be either "minimap2", "bwa" or "bowtie2".
    pcr_duplicates : bool
        If True, PCR duplicates will be filtered based on genomic positions.
        Pairs where both reads have exactly the same coordinates are considered
        duplicates and only one of those will be conserved.
    distance_law : bool
        If True, generates a distance law file with the contact probabilities
        at each distance, for each chromosome or arm if the file with
        centromere positions has been given. The values are not normalized or
        averaged.
    centromeres : None or str
        If not None, path to a file with the positions of the centromeres,
        separated by spaces and in the same order as the chromosomes.
    read_len : int
        Maximum read length to expect in the fastq file. Optionally used in iterative
        alignment mode. Estimated from the first read by default. Useful if input fastq
        is a composite of different read lengths.
    remove_centros : None or int
        If the distance law is computed, the number of kb that will be removed
        around each centromere position given in the centromere file.
    """
    # Check if third parties can be run
    if aligner in ("bowtie2", "minimap2", "bwa"):
        if check_tool(aligner) is None:
            logger.error("%s is not installed or not on PATH", aligner)
            sys.exit(1)
    else:
        logger.error(
            "Incompatible aligner software, choose bowtie2, minimap2 or bwa")
        sys.exit(1)
    if check_tool("samtools") is None:
        logger.error("Samtools is not installed or not on PATH")
        sys.exit(1)

    # Pipeline can start from 3 input types
    start_time = datetime.now()
    stages = {"fastq": 0, "bam": 1, "pairs": 2, "pairs_idx": 3}
    start_stage = stages[start_stage]

    # Check if the number of input files is correct
    if start_stage <= 1:
        if input2 is None:
            logger.error(
                "You must provide 2 input files when --start-stage is fastq "
                "or bam.")
            sys.exit(1)
    else:
        if input2 is not None:
            logger.error(
                "You must provide a single input file when --start-stage is "
                "pairs or pairs_idx.")
            sys.exit(1)
    # sanitize enzyme
    enzyme = str(enzyme)
    # Remember whether fragments_file has been generated during this run
    fragments_updated = False

    if out_dir is None:
        out_dir = os.getcwd()

    if tmp_dir is None:
        tmp_dir = join(out_dir, "tmp")

    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Define figures output paths
    if plot:
        fig_dir = join(out_dir, "plots")
        os.makedirs(fig_dir, exist_ok=True)
        frag_plot = join(fig_dir, "frags_hist.pdf")
        dist_plot = join(fig_dir, "event_distance.pdf")
        pie_plot = join(fig_dir, "event_distribution.pdf")
        distance_law_plot = join(fig_dir, "distance_law.pdf")
        matplotlib.use("Agg")
    else:
        fig_dir = None
        dist_plot = pie_plot = frag_plot = None

    # Use current time for logging and to identify files
    now = time.strftime("%Y%m%d%H%M%S")

    def _tmp_file(fname):
        if prefix:
            fname = prefix + "." + fname
        full_path = join(tmp_dir, fname)
        if not force and os.path.exists(full_path):
            raise IOError(
                "Temporary file {} already exists. Use --force to overwrite".
                format(full_path))
        return full_path

    def _out_file(fname):
        if prefix:
            fname = prefix + "." + fname
        full_path = join(out_dir, fname)
        if not force and os.path.exists(full_path):
            raise IOError(
                "Output file {} already exists. Use --force to overwrite".
                format(full_path))

        return full_path

    # Define temporary file names
    log_file = _out_file("hicstuff_" + now + ".log")
    tmp_genome = _tmp_file("genome.fasta")
    bam1 = _tmp_file("for.bam")
    bam2 = _tmp_file("rev.bam")
    pairs = _tmp_file("valid.pairs")
    pairs_idx = _tmp_file("valid_idx.pairs")
    pairs_filtered = _tmp_file("valid_idx_filtered.pairs")
    pairs_pcr = _tmp_file("valid_idx_pcrfree.pairs")

    # Enable file logging
    hcl.set_file_handler(log_file)
    generate_log_header(log_file, input1, input2, genome, enzyme)

    # If the user chose bowtie2 and supplied an index, extract fasta from it
    # For later steps of the pipeline (digestion / frag attribution)
    # Check if the genome is an index or fasta file
    idx = hio.check_fasta_index(genome, mode=aligner)
    is_fasta = hio.check_is_fasta(genome)

    # Different aligners accept different files. Make sure the input format is good.
    # Note bowtie2 can extract fasta from the index, but bwa cannot
    sane_input = {
        'bowtie2': is_fasta or idx,
        'minimap2': is_fasta,
        'bwa': is_fasta
    }

    if not sane_input[aligner]:
        logger.error(
            "You must provide either a fasta or bowtie2 index prefix as genome"
        )
        sys.exit(1)

    # Just use the input genome as is if it is already indexed
    if is_fasta and idx:
        fasta = genome
    # Otherwise copy it in tmpdir for indexing, unless the input is a bt2 index, in which
    # case fasta will be extracted later from it.
    else:
        if is_fasta:
            st.copy(genome, tmp_genome)
            genome = tmp_genome
        fasta = tmp_genome

    # Bowtie2-specific feature: extract fasta from the index
    if aligner == 'bowtie2' and not is_fasta:
        # Index is present, extract fasta file from it
        bt2fa = sp.Popen(
            ["bowtie2-inspect", genome],
            stdout=open(tmp_genome, "w"),
            stderr=sp.PIPE,
        )
        _, bt2err = bt2fa.communicate()
        # bowtie2-inspect still has return code 0 when crashing, need to
        # actively look for error in stderr
        if re.search(r"[Ee]rror", bt2err.decode()):

            logger.error(bt2err)
            logger.error(
                "bowtie2-inspect has failed, make sure you provided "
                "the path to the bowtie2 index without the extension.")
            sys.exit(1)

    # Build index with bowtie2 / bwa if required
    if idx is None and aligner in ['bowtie2', 'bwa']:
        if aligner == 'bowtie2':
            index_cmd = ["bowtie2-build", '-q', fasta, fasta]
        elif aligner == 'bwa':
            index_cmd = ['bwa', 'index', fasta]
        # We only need the index if the user provided fastq input
        if start_stage == 0:
            # If no index present assume input is fasta, copy it in tmp and
            # index it (to avoid conflict between instances)
            logger.info(
                "%s index not found at %s, generating "
                "a local temporary index.", aligner, genome)
            sp.run(index_cmd, stderr=sp.PIPE)

    # Check for spaces in fasta headers and issue an error if found. Note:
    # Biopython truncates record.id at the first whitespace, so the full
    # description line is checked instead.
    for record in SeqIO.parse(fasta, "fasta"):
        if " " in record.description:
            logger.error(
                "Sequence identifiers contain spaces. Please clean the input genome."
            )
            sys.exit(1)
    # Define output file names (tsv files)
    if prefix:
        fragments_list = _out_file("frags.tsv")
        info_contigs = _out_file("chr.tsv")
        mat = _out_file("mat.tsv")
        # If matrix has a different format, give it the right extension
        if mat_fmt != "graal":
            mat = _out_file(mat_fmt)
    else:
        # Default graal file names
        fragments_list = _out_file("fragments_list.txt")
        info_contigs = _out_file("info_contigs.txt")
        mat = _out_file("abs_fragments_contacts_weighted.txt")
        if mat_fmt != "graal":
            mat = _out_file("abs_fragments_contacts_weighted." + mat_fmt)
    # Define what input files are given
    if start_stage == 0:
        reads1, reads2 = input1, input2
    elif start_stage == 1:
        bam1, bam2 = input1, input2
    elif start_stage == 2:
        pairs = input1
    elif start_stage == 3:
        pairs_idx = input1

    # Detect if multiple enzymes are given
    if re.search(",", enzyme):
        enzyme = enzyme.split(",")
    # Perform genome alignment
    if start_stage == 0:
        align_reads(
            reads1,
            genome,
            bam1,
            tmp_dir=tmp_dir,
            threads=threads,
            aligner=aligner,
            iterative=iterative,
            min_qual=min_qual,
            read_len=read_len,
        )
        align_reads(
            reads2,
            genome,
            bam2,
            tmp_dir=tmp_dir,
            threads=threads,
            aligner=aligner,
            iterative=iterative,
            min_qual=min_qual,
            read_len=read_len,
        )

    # Starting from bam files
    if start_stage <= 1:

        fragments_updated = True
        # Generate info_contigs and fragments_list output files
        hcd.write_frag_info(
            fasta,
            enzyme,
            min_size=min_size,
            circular=circular,
            output_contigs=info_contigs,
            output_frags=fragments_list,
        )

        # Log fragment size distribution
        hcd.frag_len(frags_file_name=fragments_list,
                     plot=plot,
                     fig_path=frag_plot)

        # Make pairs file (readID, chr1, chr2, pos1, pos2, strand1, strand2)
        bam2pairs(bam1, bam2, pairs, info_contigs, min_qual=min_qual)

    # Starting from pairs file
    if start_stage <= 2:
        restrict_table = {}
        for record in SeqIO.parse(fasta, "fasta"):
            # Get chromosome restriction table
            restrict_table[record.id] = hcd.get_restriction_table(
                record.seq, enzyme, circular=circular)

        # Add fragment index to pairs (readID, chr1, pos1, chr2,
        # pos2, strand1, strand2, frag1, frag2)
        hcd.attribute_fragments(pairs, pairs_idx, restrict_table)

    # Sort pairs file by coordinates for next steps
    hio.sort_pairs(
        pairs_idx,
        pairs_idx + ".sorted",
        keys=["chr1", "pos1", "chr2", "pos2"],
        threads=threads,
        tmp_dir=tmp_dir,
    )
    os.rename(pairs_idx + ".sorted", pairs_idx)

    if filter_events:
        uncut_thr, loop_thr = hcf.get_thresholds(pairs_idx,
                                                 plot_events=plot,
                                                 fig_path=dist_plot,
                                                 prefix=prefix)
        hcf.filter_events(
            pairs_idx,
            pairs_filtered,
            uncut_thr,
            loop_thr,
            plot_events=plot,
            fig_path=pie_plot,
            prefix=prefix,
        )
        use_pairs = pairs_filtered
    else:
        use_pairs = pairs_idx

    # Generate the fragments file if it has not been generated already
    if not fragments_updated:
        hcd.write_frag_info(
            fasta,
            enzyme,
            min_size=min_size,
            circular=circular,
            output_contigs=info_contigs,
            output_frags=fragments_list,
        )

    # Generate distance law table if enabled
    if distance_law:
        out_distance_law = _out_file("distance_law.txt")
        if remove_centros is None:
            remove_centros = 0
        remove_centros = int(remove_centros)
        x_s, p_s, _ = hcdl.get_distance_law(
            pairs_idx,
            fragments_list,
            centro_file=centromeres,
            base=1.1,
            out_file=out_distance_law,
            circular=circular,
            rm_centro=remove_centros,
        )
        # Generate the distance law figure if plots are enabled
        if plot:
            # Retrieve chrom labels from distance law file
            _, _, chr_labels = hcdl.import_distance_law(out_distance_law)
            chr_labels = [lab[0] for lab in chr_labels]
            chr_labels_idx = np.unique(chr_labels, return_index=True)[1]
            chr_labels = [
                chr_labels[index] for index in sorted(chr_labels_idx)
            ]
            p_s = hcdl.normalize_distance_law(x_s, p_s)
            hcdl.plot_ps_slope(x_s,
                               p_s,
                               labels=chr_labels,
                               fig_path=distance_law_plot)

    # Filter out PCR duplicates if requested
    if pcr_duplicates:
        filter_pcr_dup(use_pairs, pairs_pcr)
        use_pairs = pairs_pcr

    # Build matrix from pairs.
    if mat_fmt == "cool":
        # Name matrix file in .cool
        cool_file = os.path.splitext(mat)[0] + ".cool"
        pairs2cool(use_pairs, cool_file, fragments_list)
    else:
        pairs2matrix(
            use_pairs,
            mat,
            fragments_list,
            mat_fmt=mat_fmt,
            threads=threads,
            tmp_dir=tmp_dir,
        )

    # Clean temporary files
    if not no_cleanup:
        tempfiles = [
            pairs,
            pairs_idx,
            pairs_filtered,
            bam1,
            bam2,
            pairs_pcr,
            tmp_genome,
        ]
        # Do not delete files that were given as input
        for infile in (input1, input2):
            try:
                tempfiles.remove(infile)
            except ValueError:
                pass
        for file in tempfiles:
            try:
                os.remove(file)
            except FileNotFoundError:
                pass

    end_time = datetime.now()
    duration = relativedelta(end_time, start_time)
    logger.info("Contact map generated after {h}h {m}m {s}s".format(
        h=duration.hours, m=duration.minutes, s=duration.seconds))
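A hypothetical end-to-end call, with input paths made up for illustration:

full_pipeline(
    genome="bt2_index_prefix",
    input1="reads_for.fq.gz",
    input2="reads_rev.fq.gz",
    enzyme="DpnII",
    aligner="bowtie2",
    filter_events=True,
    distance_law=True,
    mat_fmt="cool",
    out_dir="hicstuff_out",
    threads=8,
)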
Example #11
def iterative_align(
    fq_in,
    tmp_dir,
    ref,
    n_cpu,
    bam_out,
    aligner="bowtie2",
    min_len=20,
    min_qual=30,
    read_len=None,
):
    """Iterative alignment

    Aligns reads iteratively reads of fq_in with bowtie2, minimap2 or bwa. Reads are
    truncated to the 20 first nucleotides and unmapped reads are extended by 20
    nucleotides and realigned on each iteration.

    Parameters
    ----------
    fq_in : str
        Path to input fastq file to align iteratively.
    tmp_dir : str
        Path where temporary files should be written.
    ref : str
        Path to the reference genome if minimap2 is used for alignment, or to
        the genome index if bowtie2/bwa is used.
    n_cpu : int
        The number of CPUs to use for the iterative alignment.
    bam_out : str
        Path where the final alignment should be written in BAM format.
    aligner : str
        Choose between minimap2, bwa or bowtie2 for the alignment.
    min_len : int
        The initial length of the fragments to align.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    read_len : int
        Read length in the fastq file. If set to None, the length of the first
        read is used. Set this value to the longest read length in the file if
        you have different read lengths.
        
    Examples
    --------
    iterative_align(fq_in='example_for.fastq', ref='example_bt2_index', bam_out='example_for.bam', aligner="bowtie2")
    iterative_align(fq_in='example_for.fastq', ref='example_genome.fa', bam_out='example_for.bam', aligner="minimap2")
    """
    # Set with the names of the unaligned reads:
    remaining_reads = set()
    total_reads = 0
    # Store path of SAM containing aligned reads at each iteration.
    iter_out = []

    # If there is already a file with the same name as the output file,
    # remove it. Otherwise, ignore.
    with contextlib.suppress(FileNotFoundError):
        try:
            os.remove(bam_out)
        except IsADirectoryError:
            logger.error("You need to give the BAM output file, not a folder.")
            raise

    # Bowtie only accepts uncompressed fastq: uncompress it into a temp file
    if aligner == "bowtie2" and hio.is_compressed(fq_in):
        uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp")
        with hio.read_compressed(fq_in) as inf:
            with open(uncomp_path, "w") as uncomp:
                st.copyfileobj(inf, uncomp)
    else:
        uncomp_path = fq_in

    # throw error if index does not exist
    index = hio.check_fasta_index(ref, mode=aligner)
    if index is None:
        logger.error(
            "Reference index is missing, please build the {} index first."
            .format(aligner))
        sys.exit(1)
    # Counting reads
    with hio.read_compressed(uncomp_path) as inf:
        for _ in inf:
            total_reads += 1
    total_reads /= 4

    # Use the first read to guess read length if not provided.
    if read_len is None:
        with hio.read_compressed(uncomp_path) as inf:
            # Skip the first line (read header)
            inf.readline()
            # Strip the newline from the sequence line and measure its length.
            read_len = len(inf.readline().rstrip())

    # initial length of the fragments to align
    # In case reads are shorter than provided min_len
    if read_len > min_len:
        n = min_len
    else:
        logger.warning(
            "min_len is longer than the reads. Iterative mapping will have no effect."
        )
        n = read_len
    logger.info("{0} reads to parse".format(int(total_reads)))

    first_round = True
    # iterative alignment per se
    while n <= read_len:
        logger.info(
            "Truncating unaligned reads to {size}bp and mapping{again}.".
            format(size=int(n), again="" if first_round else " again"))
        iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
        # Generate a temporary input fastq file with the n first nucleotides
        # of the reads.
        truncated_reads = truncate_reads(tmp_dir, uncomp_path, remaining_reads,
                                         n, first_round)

        # Align the truncated reads on reference genome
        temp_alignment = join(tmp_dir, "temp_alignment.bam")
        map_args = {
            "fa": ref,
            "cpus": n_cpu,
            "fq": truncated_reads,
            "idx": index,
            "bam": temp_alignment,
        }
        if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args)
        elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE):
            cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args)
        elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = ("bowtie2 -x {idx} -p {cpus}"
                   " --quiet --very-sensitive {fq}").format(**map_args)
        else:
            raise ValueError(
                "Unknown aligner. Select bowtie2, minimap2 or bwa.")

        map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
        sort_process = sp.Popen(
            "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args),
            shell=True,
            stdin=map_process.stdout,
        )
        out, err = sort_process.communicate()

        # filter the reads: the reads whose truncated end was aligned are written
        # to the output file.
        # The reads whose truncated end was not aligned are kept for the next round.
        remaining_reads = filter_bamfile(temp_alignment, iter_out[-1],
                                         min_qual)

        n += 20
        first_round = False

    # one last round without trimming
    logger.info("Trying to map unaligned reads at full length ({0}bp).".format(
        int(read_len)))

    truncated_reads = truncate_reads(
        tmp_dir,
        infile=uncomp_path,
        unaligned_set=remaining_reads,
        trunc_len=n,
        first_round=first_round,
    )
    if aligner == "minimap2" or aligner == "Minimap2":
        cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(
            fa=ref, cpus=n_cpu, fq=truncated_reads)
    elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA":
        cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format(idx=index,
                                                         cpus=n_cpu,
                                                         fq=truncated_reads)
    else:
        cmd = ("bowtie2 -x {idx} -p {cpus} --quiet "
               "--very-sensitive {fq}").format(idx=index,
                                               cpus=n_cpu,
                                               fq=truncated_reads)
    map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
    # Keep reads sorted by name
    sort_process = sp.Popen(
        "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(
            cpus=n_cpu, bam=temp_alignment),
        shell=True,
        stdin=map_process.stdout,
    )
    out, err = sort_process.communicate()
    iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
    remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)

    # Report unaligned reads as well
    iter_out += [join(tmp_dir, "unaligned.bam")]
    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
    unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam)
    for r in temp_bam:
        # Do not write supplementary alignments (keeping 1 alignment/read)
        if r.query_name in remaining_reads and not r.is_supplementary:
            unmapped.write(r)
    unmapped.close()
    temp_bam.close()

    # Merge all aligned reads and unmapped reads into a single bam
    ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out)
    logger.info("{0} reads aligned / {1} total reads.".format(
        int(total_reads - len(remaining_reads)), int(total_reads)))

    return 0
Example #12
def filter_events(
    in_dat,
    out_filtered,
    thr_uncut,
    thr_loop,
    plot_events=False,
    fig_path=None,
    prefix=None,
):
    """Filter events (loops, uncuts and weirds)

    Filter out spurious intrachromosomal Hi-C pairs from the input file. +-
    pairs with reads closer than or at the uncut threshold, and -+ pairs with
    reads closer than or at the loop threshold, are excluded from the output
    file. -- and ++ pairs with both mates on the same fragment are also
    discarded. All others are written.

    Parameters
    ----------
    in_dat : str
        Path to the input .pairs file containing Hi-C pairs.
    out_filtered : str
        Path to the output filtered .pairs file.
    thr_uncut : int
        Minimum number of restriction sites between reads to keep an
        intrachromosomal +- pair.
    thr_loop : int
        Minimum number of restriction sites between reads to keep an
        intrachromosomal -+ pair.
    plot_events : bool
        If True, a plot showing the proportion of each type of event will be
        shown after filtering.
    fig_path : str
        Path where the figure will be saved. If None, figure is displayed
        interactively.
    prefix : str
        If the library has a name, it will be shown on plots.
    """
    n_uncuts = 0
    n_loops = 0
    n_weirds = 0
    lrange_intra = 0
    lrange_inter = 0

    # open the files for reading and writing
    with open(in_dat, "r") as pairs, open(out_filtered, "w") as filtered:
        for line in pairs:  # iterate over each line
            # Copy header lines to output
            if line.startswith("#"):
                filtered.write(line)
                continue

            p = process_read_pair(line)
            line_to_write = ("\t".join(
                map(
                    str,
                    (
                        p["readID"],
                        p["chr1"],
                        p["pos1"],
                        p["chr2"],
                        p["pos2"],
                        p["strand1"],
                        p["strand2"],
                        p["frag1"],
                        p["frag2"],
                    ),
                )) + "\n")
            if p["chr1"] == p["chr2"]:
                # Do not report ++ and -- pairs on the same fragment (impossible)
                if p["frag1"] == p["frag2"] and p["strand1"] == p["strand2"]:
                    n_weirds += 1
                elif p["nsites"] <= thr_loop and p["type"] == "-+":
                    n_loops += 1
                elif p["nsites"] <= thr_uncut and p["type"] == "+-":
                    n_uncuts += 1
                else:
                    lrange_intra += 1
                    filtered.write(line_to_write)

            if p["chr1"] != p["chr2"]:
                lrange_inter += 1
                filtered.write(line_to_write)

    if lrange_inter > 0:
        ratio_inter = round(
            100 * lrange_inter / float(lrange_intra + lrange_inter), 2)
    else:
        ratio_inter = 0

    # Log quick summary of operation results
    kept = lrange_intra + lrange_inter
    discarded = n_loops + n_uncuts + n_weirds
    total = kept + discarded
    logger.info("Proportion of inter contacts: {0}% (intra: {1}, "
                "inter: {2})".format(ratio_inter, lrange_intra, lrange_inter))
    logger.info(
        "{0} pairs discarded: Loops: {1}, Uncuts: {2}, Weirds: {3}".format(
            discarded, n_loops, n_uncuts, n_weirds))
    logger.info("{0} pairs kept ({1}%)".format(
        kept, round(100 * kept / (kept + discarded), 2)))

    # Visualize summary if requested by user
    if plot_events:
        try:
            # Plot: make a square figure and axes to plot a pie chart:
            plt.figure(2, figsize=(6, 6))
            # The slices will be ordered and plotted counter-clockwise.
            fracs = [n_uncuts, n_loops, n_weirds, lrange_intra, lrange_inter]
            # Format labels to include event names and proportion
            labels = list(
                map(
                    lambda x: (x[0] + ": %.2f%%") % (100 * x[1] / total),
                    [
                        ("Uncuts", n_uncuts),
                        ("Loops", n_loops),
                        ("Weirds", n_weirds),
                        ("3D intra", lrange_intra),
                        ("3D inter", lrange_inter),
                    ],
                ))
            colors = ["salmon", "lightskyblue", "yellow", "palegreen", "plum"]
            patches, _ = plt.pie(fracs, colors=colors, startangle=90)
            plt.legend(
                patches,
                labels,
                loc='upper left',
                bbox_to_anchor=(-0.1, 1.),
            )
            if prefix:
                plt.title(
                    "Distribution of library events in {}".format(prefix),
                    bbox={
                        "facecolor": "1.0",
                        "pad": 5
                    },
                )
            plt.text(
                0.3,
                1.15,
                "Threshold Uncuts = " + str(thr_uncut),
                fontdict=None,
            )
            plt.text(
                0.3,
                1.05,
                "Threshold Loops = " + str(thr_loop),
                fontdict=None,
            )

            plt.text(
                -1.5,
                -1.2,
                "Total number of reads = " + str(total),
                fontdict=None,
            )
            plt.text(
                -1.5,
                -1.3,
                "Ratio inter/(intra+inter) = " + str(ratio_inter) + "%",
                fontdict=None,
            )
            percentage = round(100 * float(kept) / total)
            plt.text(
                -1.5,
                -1.4,
                "Selected reads = {0}%".format(percentage),
                fontdict=None,
            )
            if fig_path:
                plt.savefig(fig_path)
            else:
                plt.show()
            plt.clf()
        except Exception:
            logger.error(
                "Unable to show plots. Is an X server running? (This can "
                "happen on Windows.) Skipping figure generation.")
Ejemplo n.º 13
0
def get_thresholds(in_dat,
                   interactive=False,
                   plot_events=False,
                   fig_path=None,
                   prefix=None):
    """Guess distance threshold for event filtering

    Analyse the events in the first million Hi-C pairs of the library, plot
    the occurrences of each event type according to the number of restriction
    fragments separating the reads, and ask the user interactively for the
    minimum thresholds for uncuts and loops.

    Parameters
    ----------
    in_dat : str
        Path to the .pairs file containing Hi-C pairs.
    interactive : bool
        If True, plots are displayed and thresholds are requested
        interactively.
    plot_events : bool
        Whether to show the plot of events by distance.
    fig_path : str
        Path where the figure will be saved. If None, the figure will be
        displayed interactively.
    prefix : str
        If the library has a name, it will be shown on plots.
    
    Returns
    -------
    tuple of int
        The thresholds for uncuts ("+-") and loops ("-+") events, either
        entered by the user or estimated from the data.
    """
    thr_uncut = None
    thr_loop = None
    max_sites = 50
    # Map of event -> legend name of event for intrachromosomal pairs.
    legend = {
        "++": "++ (weird)",
        "--": "-- (weird)",
        "+-": "+- (uncuts)",
        "-+": "-+ (loops)",
    }
    colors = {"++": "#222222", "+-": "r", "--": "#666666", "-+": "tab:orange"}
    n_events = {event: np.zeros(max_sites) for event in legend}
    i = 0
    # Open the file for reading (only the first 1,000,000 pairs are used)
    with open(in_dat, "r") as pairs:
        for line in pairs:
            # Skip header lines
            if line.startswith("#"):
                continue
            i += 1
            # Only use the first million pairs to estimate thresholds
            if i > 1000000:
                break
            # Process Hi-C pair into a dictionary
            p = process_read_pair(line)
            # Type of event and number of restriction site between reads
            etype = p["type"]
            nsites = p["nsites"]
            # Count number of events for intrachrom pairs
            if etype != "inter" and nsites < max_sites:
                n_events[etype][nsites] += 1

    def plot_event(n_events, legend, name):
        """Plot the frequency of a given event types over distance."""
        plt.xlim([-0.5, 15])
        plt.plot(
            range(n_events[name].shape[0]),
            n_events[name],
            "o-",
            label=legend[name],
            linewidth=2.0,
            c=colors[name],
        )

    if interactive:
        # Plot the counts of each event type:
        try:
            plt.figure(0)
            for event in legend:
                plot_event(n_events, legend, event)
            plt.grid()
            plt.xlabel("Number of restriction fragment(s)")
            plt.ylabel("Number of events")
            plt.yscale("log")
            plt.legend()
            plt.show(block=False)

        except Exception:
            logger.error(
                "Unable to show plots, skipping figure generation. Is an X "
                "server running? (This can happen on Windows.) Try running "
                "without the interactive option.")

        # Ask the user for appropriate thresholds
        print(
            "Please enter the number of restriction fragments separating "
            "reads in a Hi-C pair below or at which loops and "
            "uncuts events will be excluded\n",
            file=sys.stderr,
        )
        thr_uncut = int(input("Enter threshold for the uncuts events (+-):"))
        thr_loop = int(input("Enter threshold for the loops events (-+):"))
        try:
            plt.clf()
        except Exception:
            pass
    else:
        # Estimate thresholds from data
        for event in n_events:
            fixed = n_events[event]
            fixed[fixed == 0] = 1
            n_events[event] = fixed

        all_events = np.log(np.array(list(n_events.values())))
        # Compute the median log-occurrence at each number of restriction sites
        event_med = np.median(all_events, axis=0)
        # Compute MAD, to have a robust estimator of the expected deviation
        # from median at long distances
        mad = np.median(abs(all_events - event_med))
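        # 0.67449 is approximately the 75th percentile of the standard
        # normal distribution: dividing the MAD by it yields a consistent
        # estimate of the standard deviation under a normality assumption.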
        exp_stdev = mad / 0.67449
        # Iterate over sites, from the furthest down to frag+2
        for site in range(max_sites - 1, 1, -1):
            # For uncuts and loops, keep the last (closest) site where the
            # deviation from other events <= expected_stdev
            if (abs(np.log(n_events["+-"][site]) - event_med[site]) <=
                    exp_stdev):
                thr_uncut = site
            if (abs(np.log(n_events["-+"][site]) - event_med[site]) <=
                    exp_stdev):
                thr_loop = site
        if thr_uncut is None or thr_loop is None:
            raise ValueError(
                "The threshold for loops or uncut could not be estimated. "
                "Please try running with -i to investigate the problem.")
        logger.info("Filtering with thresholds: uncuts={0} loops={1}".format(
            thr_uncut, thr_loop))
        if plot_events:
            try:
                plt.figure(1)
                plt.xlim([-0.5, 15])
                # Draw colored lines for events to discard
                plt.plot(
                    range(0, thr_uncut + 1),
                    n_events["+-"][:thr_uncut + 1],
                    "o-",
                    c=colors["+-"],
                    label=legend["+-"],
                )
                plt.plot(
                    range(0, thr_loop + 1),
                    n_events["-+"][:thr_loop + 1],
                    "o-",
                    c=colors["-+"],
                    label=legend["-+"],
                )
                plt.plot(
                    range(0, 2),
                    n_events["--"][:2],
                    "o-",
                    c=colors["--"],
                    label=legend["--"],
                )
                plt.plot(
                    range(0, 2),
                    n_events["++"][:2],
                    "o-",
                    c=colors["++"],
                    label=legend["++"],
                )
                # Draw black lines for events to keep
                plt.plot(
                    range(thr_uncut, n_events["+-"].shape[0]),
                    n_events["+-"][thr_uncut:],
                    "o-",
                    range(thr_loop, n_events["-+"].shape[0]),
                    n_events["-+"][thr_loop:],
                    "o-",
                    range(1, n_events["--"].shape[0]),
                    n_events["--"][1:],
                    "o-",
                    range(1, n_events["++"].shape[0]),
                    n_events["++"][1:],
                    "o-",
                    label="kept",
                    linewidth=2.0,
                    c="g",
                )
                plt.grid()
                plt.xlabel("Number of restriction site(s)")
                plt.ylabel("Number of events")
                plt.yscale("log")
                # Remove duplicate "kept" entries in legend
                handles, labels = plt.gca().get_legend_handles_labels()
                by_label = OrderedDict(zip(labels, handles))
                plt.legend(by_label.values(), by_label.keys())
                # Show uncut and loop threshold as vertical lines
                plt.axvline(x=thr_loop, color=colors["-+"])
                plt.axvline(x=thr_uncut, color=colors["+-"])

                if prefix:
                    plt.title(
                        "Library events by distance in {}".format(prefix))
                plt.tight_layout()
                if fig_path:
                    plt.savefig(fig_path)
                else:
                    plt.show(block=False)
                # plt.clf()

            except Exception:
                logger.error(
                    "Unable to show plots, skipping figure generation. Is "
                    "an X server running? (This can happen on Windows.) "
                    "Try running without the plot option.")
    return thr_uncut, thr_loop
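
A minimal end-to-end sketch chaining the two examples; filter_events is the hypothetical name assumed in the previous example and the paths are placeholders:

# Estimate thresholds on the library, then filter with them.
thr_uncut, thr_loop = get_thresholds(
    "library.pairs",
    interactive=False,
    plot_events=True,
    fig_path="library_thresholds.png",
    prefix="library",
)
filter_events("library.pairs", "library_filtered.pairs", thr_uncut, thr_loop)

To see how the non-interactive estimation behaves, here is a self-contained sketch of the same median/MAD logic on synthetic counts; the synthetic model and every name below are illustrative only:

import numpy as np

rng = np.random.default_rng(0)
max_sites = 50
d = np.arange(max_sites) + 1.0  # distance in restriction fragments
base = 1e5 * d ** -1.5          # common power-law tail shared by all events
n_events = {
    "++": base * rng.uniform(0.9, 1.1, max_sites),
    "--": base * rng.uniform(0.9, 1.1, max_sites),
    # Uncuts and loops are inflated at short range, as in a real library
    "+-": base + 5e5 * np.exp(-d / 2.0),
    "-+": base + 2e5 * np.exp(-d / 3.0),
}

all_events = np.log(np.array(list(n_events.values())))
event_med = np.median(all_events, axis=0)     # median log-count per distance
mad = np.median(abs(all_events - event_med))  # robust spread estimate
exp_stdev = mad / 0.67449                     # MAD -> sigma under normality

thr_uncut = thr_loop = None
for site in range(max_sites - 1, 1, -1):
    # Keep the closest distance at which the event still behaves like the
    # other event types.
    if abs(np.log(n_events["+-"][site]) - event_med[site]) <= exp_stdev:
        thr_uncut = site
    if abs(np.log(n_events["-+"][site]) - event_med[site]) <= exp_stdev:
        thr_loop = site
print("uncuts threshold:", thr_uncut, "loops threshold:", thr_loop)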