Example #1
def extract_subnetworks(
    partition_file,
    network_file,
    output_dir,
    max_cores=DEFAULT_MAX_CORES,
    max_size_matrix=DEFAULT_MAX_SIZE_MATRIX,
    saturation_threshold=DEFAULT_SATURATION_THRESHOLD,
):
    """Extract bin subnetworks from the main network

    Identify bins, extract their subnetworks, draw the adjacency matrices,
    and save everything in the specified output directory.

    Parameters
    ----------
    partition_file : file, str or pathlib.Path
        The file containing, for each chunk, the communities it was
        assigned to at each iteration.
    network_file : file, str or pathlib.Path
        The file containing the network in sparse (edge list) format.
    output_dir : str or pathlib.Path
        The output directory to write the subnetworks into.
    max_cores : int, optional
        The maximum number of bins to extract. Default is 100.
    max_size_matrix : int, optional
        When rendering contact maps for each bin, the maximum size for the
        matrix. Default is 2000.
    saturation_threshold : float, optional
        When rendering contact maps for each bin, the percentile value over
        which the color map should be saturated. Default is 80.
    """

    logger.info("Loading partition...")
    data_chunks = np.loadtxt(partition_file, usecols=(1, ), dtype=np.int32)

    logger.info("Loading network...")
    network = np.loadtxt(network_file, dtype=np.int32)
    cores = data_chunks

    core_network = np.copy(network)

    core_network[:, 0] = cores[network[:, 0]]
    core_network[:, 1] = cores[network[:, 1]]

    n = np.amax(cores) + 1
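    # n is one more than the largest bin label; the loop below iterates over
    # bins 1 .. n - 1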

    def extract(network_to_keep, filename):

        subnetwork = np.copy(network[network_to_keep])
        subnetwork[:, 0] -= 1
        subnetwork[:, 1] -= 1

        np.savetxt(filename, subnetwork, fmt="%i")

        return subnetwork

    def draw(subnetwork, filename):

        try:
            # Numpy array format
            row = subnetwork[:, 0]
            col = subnetwork[:, 1]
            data = subnetwork[:, 2]
        except TypeError:
            # Scipy sparse format
            row = subnetwork.row
            col = subnetwork.col
            data = subnetwork.data

        row_indices = stats.rankdata(np.concatenate((row, col)),
                                     method="dense")
        col_indices = stats.rankdata(np.concatenate((col, row)),
                                     method="dense")
        data = np.concatenate((data, data))

        # print("Row length: {}, col length: {}, data length: {}"
        #       "".format(len(row_indices), len(col_indices), len(data)))

        unique_row = np.unique(row)
        unique_col = np.unique(col)

        # print("Network shape: {},{}".format(len(unique_row),
        #                                     len(unique_col)))

        size = len(np.unique(np.concatenate((unique_row, unique_col)))) + 1
        # print("Size of matrix to draw: {}".format(size))

        try:
            sparse_subnet = sparse.coo_matrix(
                (data, (row_indices, col_indices)), shape=(size, size))
            binning_factor = (size // max_size_matrix) + 1
            binned_subnet = hcs.bin_sparse(sparse_subnet,
                                           subsampling_factor=binning_factor)
            dense_subnet = binned_subnet.todense()

            diagonal = np.diag(np.diag(dense_subnet))
            normed_subnet = hcs.normalize_dense(dense_subnet - diagonal)

            vmax = np.percentile(normed_subnet, saturation_threshold)

            spaceless_pdf_plot_maker(normed_subnet, filename, vmax=vmax)

        except MemoryError:
            logger.warning(
                "Warning, couldn't save matrix due to memory issues")

    def extract_and_draw(network_to_keep, filename_text, filename_image):

        subnetwork = extract(network_to_keep, filename=filename_text)
        draw(subnetwork, filename=filename_image)

    #   Extract the subnetwork of each chosen core and draw it as a 2D contact map
    global_network_indices_list = []

    for i in range(1, n):

        if i > max_cores:
            break

        # print("Bin {}:".format(i))
        network_to_keep_1 = core_network[:, 0] == i
        network_to_keep_2 = core_network[:, 1] == i

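        # Elementwise AND: keep only the edges whose both endpoints were
        # assigned to bin i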
        network_to_keep = network_to_keep_1 * network_to_keep_2

        nonzero_indices, = np.nonzero(network_to_keep)
        global_network_indices_list += nonzero_indices.tolist()

        subnetwork_file = os.path.join(output_dir,
                                       "subnetwork_core_{}.dat".format(i))

        image_name = os.path.join(output_dir, "core_{}.eps".format(i))

        extract_and_draw(
            network_to_keep=network_to_keep,
            filename_text=subnetwork_file,
            filename_image=image_name,
        )
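
A minimal calling sketch for the function above. The module name, file paths, and directory layout are hypothetical placeholders, not part of the original example; only the keyword values (100, 2000, 80) come from the docstring defaults.

import os
from my_pipeline import extract_subnetworks  # hypothetical module name

os.makedirs("out/subnetworks", exist_ok=True)
extract_subnetworks(
    partition_file="out/partition.txt",  # per-chunk community assignments
    network_file="out/network.txt",      # sparse edge list: id1 id2 weight
    output_dir="out/subnetworks",
    max_cores=100,                       # extract at most 100 bins
    max_size_matrix=2000,                # bin contact maps down to ~2000x2000
    saturation_threshold=80,             # saturate the color map at the 80th percentile
)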
Example #2
def alignment_to_contacts(
    sam_merged,
    assembly,
    output_dir,
    output_file_network=DEFAULT_NETWORK_FILE_NAME,
    output_file_chunk_data=DEFAULT_CHUNK_DATA_FILE_NAME,
    parameters=DEFAULT_PARAMETERS,
):
    """Generates a network file (in edgelist form) from an
    alignment in sam or bam format. Contigs are virtually split into
    'chunks' of nearly fixed size (by default between 500 and 1000 bp)
    to reduce size bias. The chunks are the network nodes and the edges
    are the contact counts.

    The network is written in strict bare-bones form so that it can be quickly
    reused and imported into other applications. Verbose information about
    every single node in the network is written to a 'chunk data' file, by
    default called 'idx_contig_hit_size_cov.txt'.

    Parameters
    ----------
    sam_merged : file, str or pathlib.Path
        The alignment file in SAM/BAM format to be processed.
    assembly : file, str or pathlib.Path
        The initial assembly acting as the alignment file's reference genome.
    output_dir : str or pathlib.Path
        The output directory to write the network and chunk data into.
    output_file_network : str or pathlib.Path, optional
        The specific file name for the output network file. Default is
        network.txt
    output_file_chunk_data : str or pathlib.Path, optional
        The specific file name for the output chunk data file. Default is
        idx_contig_hit_size_cov.txt
    parameters : dict, optional
        A dictionary of parameters for converting the alignment file into a
        network. These are:
        - size_chunk_threshold: the size (in bp) under which chunks are
          discarded. Default is 500.
        - mapq_threshold: the mapping quality under which alignments are
          discarded. Default is 10.
        - chunk_size: the target chunk size (in bp), except for smaller
          contigs or tail ends. Default is 1000.
        - read_size: the size of the reads used for mapping. Default is 65.
        - self_contacts: whether to count alignments between a chunk and
          itself. Default is False.
        - normalized: whether to normalize contacts by coverage. Default is
          False.

    Returns
    -------
    chunk_complete_data : dict
        A dictionary where the keys are chunks in (contig, position) form and
        the values are their id, name, total contact count, size and coverage.
    all_contacts : dict
        A counter dictionary where the keys are chunk pairs and the values are
        their contact count.
    """

    all_contacts = collections.Counter()
    all_chunks = collections.Counter()

    #   Initialize parameters
    chunk_size = int(parameters["chunk_size"])
    mapq_threshold = int(parameters["mapq_threshold"])
    size_chunk_threshold = int(parameters["size_chunk_threshold"])
    read_size = int(parameters["read_size"])
    self_contacts = parameters["self_contacts"]
    normalized = parameters["normalized"]

    logger.info("Establishing chunk list...")
    chunk_complete_data = dict()

    #   Get all information about all chunks from all contigs
    #   (this gets updated at the end)
    global_id = 1
    for record in SeqIO.parse(assembly, "fasta"):
        length = len(record.seq)

        n_chunks = length // chunk_size
        n_chunks += (length % chunk_size) >= size_chunk_threshold
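        # e.g. with chunk_size=1000 and size_chunk_threshold=500, a 2700 bp
        # contig yields two full 1000 bp chunks plus a 700 bp tail chunk
        # (n_chunks = 3), while a 2300 bp contig yields only two chunks
        # because its 300 bp tail is discarded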

        for i in range(n_chunks):

            if (i + 1) * chunk_size <= length:
                size = chunk_size
            else:
                size = length % chunk_size

            chunk_name = "{}_{}".format(record.id, i)
            chunk_complete_data[chunk_name] = {
                "id": global_id,
                "hit": 0,
                "size": size,
                "coverage": 0,
            }
            global_id += 1

    logger.info("Opening alignment files...")

    current_read = None

    # Read the BAM file to detect contacts.
    with pysam.AlignmentFile(sam_merged, "rb") as alignment_merged_handle:

        names = alignment_merged_handle.references
        lengths = alignment_merged_handle.lengths
        names_and_lengths = {
            name: length
            for name, length in zip(names, lengths)
        }

        logger.info("Reading contacts...")

        # Since the BAM file is supposed to be sorted and interleaved, paired
        # reads should always appear one right after the other (the exact
        # order doesn't matter since the network is symmetric, so we simply
        # treat the first one as 'forward' and the second one as 'reverse').

        # We keep iterating until two consecutive reads share the same name,
        # discarding those that don't.

        while "Reading forward and reverse alignments alternatively":
            try:
                my_read = next(alignment_merged_handle)
                if current_read is None:
                    # First read
                    current_read = my_read
                    continue

                elif current_read.query_name != my_read.query_name:

                    # print("{}_{}".format(current_read, my_read))
                    current_read = my_read
                    continue

                read_forward, read_reverse = current_read, my_read

            except StopIteration:
                break

            # Get a bunch of info about the alignments to pass the tests below
            read_name_forward = read_forward.query_name
            read_name_reverse = read_reverse.query_name

            flag_forward, flag_reverse = read_forward.flag, read_reverse.flag

            try:
                assert read_name_forward == read_name_reverse
            except AssertionError:
                logger.error(
                    "Reads don't have the same name: "
                    "%s and %s",
                    read_name_forward,
                    read_name_reverse,
                )
                raise

            # Check whether the flag has the 4 bit set (the third binary
            # digit from the right); 4 means 'unmapped' in the SAM spec
            def is_unmapped(flag):
                return np.base_repr(flag, padding=3)[-3] == "1"

            if is_unmapped(flag_forward) or is_unmapped(flag_reverse):
                # print("Detected unmapped read on one end, skipping")
                continue

            contig_name_forward = read_forward.reference_name
            contig_name_reverse = read_reverse.reference_name

            len_contig_for = names_and_lengths[contig_name_forward]
            len_contig_rev = names_and_lengths[contig_name_reverse]

            position_forward = read_forward.reference_start
            position_reverse = read_reverse.reference_start

            mapq_forward = read_forward.mapping_quality
            mapq_reverse = read_reverse.mapping_quality

            # Some more tests: checking for size, map quality, map status etc.
            mapq_test = min(mapq_forward, mapq_reverse) > mapq_threshold

            min_length = min(len_contig_for, len_contig_rev)
            length_test = min_length > size_chunk_threshold

            # Trickiest test:
            #
            #
            #                contig
            #    pos1                          pos2
            #     ^                             ^
            # |-------|-------|-------|-------|---|
            # <-------><------><------><------><-->            <->
            #   chunk   chunk                  tail   size_chunk_threshold
            #
            # Test is passed if tail >= size_chunk_threshold (pos2)
            # or if the position is a non-tail chunk (pos1)
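            # e.g. with chunk_size=1000 and size_chunk_threshold=500, on a
            # 2700 bp contig a hit at position 1500 lies in a full 1000 bp
            # chunk and passes, while a hit at position 2100 lies in the
            # 700 bp tail and passes only because 700 >= 500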

            if position_forward < chunk_size * (len_contig_for // chunk_size):
                current_chunk_forward_size = chunk_size
            else:
                current_chunk_forward_size = len_contig_for % chunk_size

            if position_reverse < chunk_size * (len_contig_rev // chunk_size):
                current_chunk_reverse_size = chunk_size
            else:
                current_chunk_reverse_size = len_contig_rev % chunk_size

            min_chunk_size = min(current_chunk_forward_size,
                                 current_chunk_reverse_size)

            chunk_test = min_chunk_size >= size_chunk_threshold

            if mapq_test and length_test and chunk_test:

                chunk_forward = position_forward // chunk_size
                chunk_reverse = position_reverse // chunk_size

                chunk_name_forward = "{}_{}".format(contig_name_forward,
                                                    chunk_forward)
                chunk_name_reverse = "{}_{}".format(contig_name_reverse,
                                                    chunk_reverse)

                if self_contacts or chunk_name_forward != chunk_name_reverse:

                    contact = tuple(
                        sorted((chunk_name_forward, chunk_name_reverse)))

                    all_contacts[contact] += 1

                    chunk_key_forward = (
                        chunk_name_forward,
                        current_chunk_forward_size,
                    )
                    all_chunks[chunk_key_forward] += 1

                    chunk_key_reverse = (
                        chunk_name_reverse,
                        current_chunk_reverse_size,
                    )
                    all_chunks[chunk_key_reverse] += 1

    logger.info("Writing chunk data...")

    # Now we can update the chunk dictionary
    # with the info we gathered from the BAM file

    output_chunk_data_path = os.path.join(output_dir, output_file_chunk_data)

    with open(output_chunk_data_path, "w") as chunk_data_file_handle:

        for name in sorted(chunk_complete_data.keys()):

            chunk_data = chunk_complete_data[name]
            size = chunk_data["size"]
            chunk = (name, chunk_data["size"])
            hit = all_chunks[chunk]
            coverage = hit * read_size * 1.0 / size
            try:
                chunk_complete_data[name]["hit"] = hit
                chunk_complete_data[name]["coverage"] = coverage
            except KeyError:
                logger.error("A mismatch was detected between the reference "
                             "genome and the genome used for the alignment "
                             "file, some sequence names were not found")
                raise

            idx = chunk_complete_data[name]["id"]
            line = "{}\t{}\t{}\t{}\t{}\n".format(idx, name, hit, size,
                                                 coverage)
            chunk_data_file_handle.write(line)

    # Lastly, generate the network proper

    logger.info("Writing network...")

    output_network_path = os.path.join(output_dir, output_file_network)

    with open(output_network_path, "w") as network_file_handle:

        for chunks in sorted(all_contacts.keys()):

            chunk_name1, chunk_name2 = chunks
            contact_count = all_contacts[chunks]

            if normalized:
                coverage1 = chunk_complete_data[chunk_name1]["coverage"]
                coverage2 = chunk_complete_data[chunk_name2]["coverage"]
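                # Scale the raw count by the geometric mean of the two
                # chunks' coverages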
                mean_coverage = np.sqrt(coverage1 * coverage2)
                effective_count = contact_count * 1.0 / mean_coverage
            else:
                effective_count = contact_count

            try:
                idx1 = chunk_complete_data[chunk_name1]["id"]
                idx2 = chunk_complete_data[chunk_name2]["id"]
                line = "{}\t{}\t{}\n".format(idx1, idx2, effective_count)
                network_file_handle.write(line)
            except KeyError as e:
                logger.warning("Mismatch detected: %s", e)

    return chunk_complete_data, all_contacts
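
A minimal calling sketch for this function. The input paths and the module name are hypothetical placeholders; the parameters dict simply restates the defaults listed in the docstring.

from my_pipeline import alignment_to_contacts  # hypothetical module name

parameters = {
    "chunk_size": 1000,
    "size_chunk_threshold": 500,
    "mapq_threshold": 10,
    "read_size": 65,
    "self_contacts": False,
    "normalized": False,
}
chunk_data, contacts = alignment_to_contacts(
    sam_merged="alignment/merged.bam",  # BAM with mate pairs grouped by name
    assembly="assembly/contigs.fasta",  # reference used for the alignment
    output_dir="out",
    parameters=parameters,
)
# 'out/network.txt' will hold the edge list and
# 'out/idx_contig_hit_size_cov.txt' the per-chunk data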