Beispiel #1
0
def mp_build_scaffolds(paths, gapsize, n_proc):
    '''Build scaffolds from all paths using a pool of worker processes.

    Every path is handed to process_scaffold in a worker process. Falsy
    results (e.g. None) are discarded, and the remaining results are
    numbered consecutively, so scaffold names follow the order of the
    non-empty results rather than the index of the originating path.

    Args:
        paths (list): Paths to turn into scaffolds, one task per path.
        gapsize: Not used by this function.
            NOTE(review): presumably process_scaffold obtains the gap size
            some other way - confirm.
        n_proc (int): Number of worker processes.

    Returns:
        tuple: (scaffold_sequences, scaffold_correspondences, all_edges,
            n_gaps, n_merges, bed). The three dicts are keyed by
            "scaffold_<n>". all_edges is always an empty list here; it is
            kept so the return shape matches build_scaffolds.
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    all_edges = []
    n_gaps, n_merges = 0, 0
    n_paths = len(paths)
    misc.printstatus("Number of paths: " + str(n_paths))
    bed = {}  # To collect bed coordinates

    # imap yields results in input order as they complete, which gives an
    # exact per-path progress count without relying on private attributes
    # of the async result object. The with-block guarantees that the worker
    # processes are cleaned up, also when a worker raises.
    mp_output = []
    with multiprocessing.Pool(n_proc) as pool:
        finished = enumerate(pool.imap(process_scaffold, paths), 1)
        for n_done, output in finished:
            misc.printstatusFlush("[ SCAFFOLDING ]\t" +
                                  misc.reportProgress(n_done, n_paths))
            # Drop Nones (and other empty results)
            if output:
                mp_output.append(output)

    misc.printstatus("[ SCAFFOLDING ]\t" +
                     misc.reportProgress(n_paths, n_paths))

    # Unpack multiprocessing data results
    for idx, dat in enumerate(mp_output):
        name = "scaffold_" + str(idx)
        scaffold_sequences[name] = dat[0]
        scaffold_correspondences[name] = dat[1]
        n_gaps += dat[2]
        n_merges += dat[3]
        bed[name] = dat[4]

    return scaffold_sequences, scaffold_correspondences, all_edges, n_gaps, n_merges, bed
Beispiel #2
0
def fillJunctions(backbone_graph, GEMlist, barcode_factor):
    '''Fill the Linkgraph junctions with short contigs.

    For every junction of every path, the junction's barcodes are compared
    to the barcodes of each entry in GEMlist. Entries sharing barcodes with
    the junction are passed to graph_building.calcOutliers, and the
    resulting outliers (with the last character of their name stripped) are
    appended to the junction's connections.

    Args:
        backbone_graph (Linkgraph): Graph whose paths attribute holds lists
            of junctions.
        GEMlist (dict): Window name -> barcodes of that window.
        barcode_factor (int): Passed on to graph_building.calcOutliers.

    Returns:
        list: list of paths with junctions filled.
    '''
    filled_junction_paths = []
    paths = backbone_graph.paths
    n_paths = len(paths)

    # Iterate over paths and every junction in the path
    # Create a barcode comparison of the junction and all small contigs
    for idx, path in enumerate(paths):
        # Report progress for every path
        misc.printstatusFlush("[ PATH FILLING ]\t" +
                              misc.reportProgress(idx + 1, n_paths))

        filled_path = []

        for junction in path:
            shared = {
                tig: graph_building.compareGEMlibs(junction.barcodes, barcodes)
                for tig, barcodes in GEMlist.items()
            }
            # Without anything to compare against there can be no matches.
            if not shared:
                continue

            fracs = pd.Series(list(shared.values()), index=list(shared.keys()))
            fracs = fracs[fracs > 0]

            # NOTE(review): a junction without any shared barcodes is left
            # out of the filled path altogether - confirm this is intended
            # rather than keeping the junction unchanged.
            if len(fracs) > 0:
                outliers = graph_building.calcOutliers(fracs, barcode_factor)

                # Outlier names are window names; strip the trailing side
                # character to get the contig name for the connections.
                new_connections = [o[:-1] for o in list(outliers.index)]
                filled_path.append(
                    graph_building.Junction(
                        junction.start, junction.target,
                        junction.connections + new_connections))

        filled_junction_paths.append(filled_path)

    # Final status line; skipped when there was nothing to process.
    if n_paths:
        misc.printstatus("[ PATH FILLING ]\t" +
                         misc.reportProgress(n_paths, n_paths))

    return filled_junction_paths
Beispiel #3
0
def main(input_bam, contig_dict, region_size=20000, mapq=60, bc_quant=2):
    '''Collect the barcodes aligned to each window of the given contigs.

    Opens input_bam into the module-level samfile handle, generates windows
    with getWindows and collects barcodes for each of them with collectGEMs.
    Only windows with more than 100 barcodes are kept. The alignment file is
    closed again before returning, also if an error occurs.

    Args:
        input_bam (str): Path to the BAM file to read.
        contig_dict (dict): Contigs to collect barcodes for; passed to
            getWindows.
        region_size (int): Passed to getWindows.
        mapq (int): Passed to collectGEMs.
        bc_quant (int): Passed to collectGEMs.

    Returns:
        dict: Window name -> barcodes collected from that window.
    '''
    # samfile is global because it is shared with other functions in the
    # module. NOTE(review): presumably read by collectGEMs - confirm.
    global samfile
    samfile = pysam.AlignmentFile(input_bam, "rb")
    GEMlist = {}  # Inappropriately named "list"
    n_contigs = len(contig_dict)

    def expected_windows(window_name):
        # Windows ending in "a" come one per contig, others two per contig.
        return n_contigs if window_name[-1] == "a" else n_contigs * 2

    try:
        # First step is to collect all barcodes (passing -q cutoff) that are
        # aligned to each contigs first and last regions (-l)
        misc.printstatus(
            "Starting barcode collection. Found {0} contigs.".format(n_contigs))

        # Generate windows
        windows = getWindows(region_size, contig_dict)

        # Iterate over windows to collect barcodes sets
        region = None
        for idx, window in enumerate(windows):
            # Unpack variables, for readability
            region, start, end = window[0], window[1], window[2]
            contig = region[:-1]

            # Print progress every 20 windows. Number of windows is
            # dependent on if running on backbone or on small contigs.
            if idx % 20 == 0:
                misc.printstatusFlush(
                    "[ BARCODE COLLECTION ]\t" +
                    misc.reportProgress(idx, expected_windows(region)))

            # Collect barcodes from the window
            GEMs = collectGEMs((contig, start, end), mapq, bc_quant)

            # If more than 100 barcodes in list, use it
            if len(GEMs) > 100:
                GEMlist[region] = GEMs

        # Final status line; skipped when no window was processed, since
        # the window type is then unknown.
        if region is not None:
            total = expected_windows(region)
            misc.printstatus("[ BARCODE COLLECTION ]\t" +
                             misc.reportProgress(total, total))
    finally:
        samfile.close()

    return GEMlist
Beispiel #4
0
def build_scaffolds(paths, gapsize):
    '''Build one scaffold per path, sequentially.

    Each path is overlapped with combine_paths and, if that yields a filled
    path, merged into a scaffold sequence with mergeSeq. Scaffolds are named
    "scaffold_<idx>" where idx is the position of the path in paths, so
    numbers of paths that produced no scaffold are skipped.

    Args:
        paths (list): Paths to build scaffolds from.
        gapsize: Passed on to mergeSeq.

    Returns:
        tuple: (scaffold_sequences, scaffold_correspondences, all_edges,
            n_gaps, n_merges, bed). The three dicts are keyed by scaffold
            name; all_edges collects the edges returned by combine_paths.
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    all_edges = []
    n_gaps, n_merges = 0, 0
    n_paths = len(paths)
    misc.printstatus("Number of paths: " + str(n_paths))
    bed = {}  # To collect bed coordinates

    for idx, path in enumerate(paths):
        misc.printstatusFlush("[ SCAFFOLDING ]\t" +
                              misc.reportProgress(idx + 1, n_paths))

        # Start overlapping
        filled_path, edges = combine_paths(path)

        # It is possible that there is no filled_path, in the case that the
        # path had a single junction which had a None at junction.start or
        # junction.target and no overlaps were found. In this case, continue.
        if not filled_path:
            continue

        all_edges.extend(edges)

        # Create scaffold
        scaffold_sequence, included, ng, nm, bed_coords = mergeSeq(
            filled_path, gapsize)
        name = "scaffold_" + str(idx)
        scaffold_sequences[name] = scaffold_sequence
        scaffold_correspondences[name] = included
        bed[name] = bed_coords
        n_gaps += ng
        n_merges += nm

    # Final status line; skipped when there was nothing to process.
    if n_paths:
        misc.printstatus("[ SCAFFOLDING ]\t" +
                         misc.reportProgress(n_paths, n_paths))

    return scaffold_sequences, scaffold_correspondences, all_edges, n_gaps, n_merges, bed
Beispiel #5
0
def pairwise_comparisons(GEMlist):
    '''
    Performs all pairwise comparisons between windows in GEMlist.

    The comparison is treated as symmetric: for each window only the
    comparisons to itself and to the windows after it are computed, and the
    values are written to both the row and the column of that window.

    Args:
        GEMlist (dict): Window name -> barcodes of that window.

    Returns:
        GEMcomparison (pd.DataFrame): Square frame indexed by window name on
            both axes, holding the value of compareGEMlibs for each pair.
    '''
    regions = list(GEMlist)
    n_regions = len(regions)

    # Compare the barcodes in every region to all other regions
    GEMcomparison = pd.DataFrame(np.zeros((n_regions, n_regions)),
                                 index=regions,
                                 columns=regions)

    # Iterate over rows in GEMcomparison
    # Index to keep track of position so we can skip calculating some fractions
    # twice
    idx = 0
    for idx, region1 in enumerate(regions):
        lib1 = GEMlist[region1]

        # Report progress every 20 windows
        if idx % 20 == 0:
            misc.printstatusFlush("[ BARCODE COMPARISON ]\t" +
                                  misc.reportProgress(idx + 1, n_regions))

        fractions = [
            compareGEMlibs(lib1, GEMlist[region2])
            for region2 in regions[idx:]
        ]

        # Assign through a single positional indexer. Chained indexing
        # (frame.loc[row][idx:] = ...) writes to an intermediate object and
        # is not guaranteed to update the frame itself.
        GEMcomparison.iloc[idx, idx:] = fractions  # Row values from idx
        GEMcomparison.iloc[idx:, idx] = fractions  # Column values from idx

    misc.printstatus("[ BARCODE COMPARISON ]\t" +
                     misc.reportProgress(idx + 1, n_regions))

    return GEMcomparison
Beispiel #6
0
def makeEdges(GEMcomparison, barcode_factor, min_barcode_fraction):
    '''Create edges from the GEMcomparison dataframe.

    As a side effect the complete comparison table is written to
    "fractions.txt" in the current working directory, tab separated, with
    one header line of window names followed by one line per window.

    Args:
        GEMcomparison (pd.DataFrame): All-against-all comparison of the
            windows' barcodes.
        barcode_factor (int): Factor for calculating outliers.
        min_barcode_fraction (float): Minimum fraction of shared barcodes to create
            an edge in the linkgraph.
    Returns:
        list: Edges inferred from the fractions of shared barcodes, as
            (window, connected window, fraction) tuples.
    '''

    misc.printstatus("Number of windows: " + str(len(GEMcomparison.keys())))
    edges = []
    n_windows = len(GEMcomparison)

    with open("fractions.txt", "w") as out:
        # Header line; every field is followed by a tab, including the last
        out.write("".join("{}\t".format(name)
                          for name in GEMcomparison.index))
        out.write("\n")

        # Iterate over rows in GEMcomparison
        for idx, (region, fractions) in enumerate(GEMcomparison.iterrows()):
            contig = region[:-1]

            out.write(region + "\t")
            out.write("".join("{}\t".format(value) for value in fractions))
            out.write("\n")

            # Report progress every 100 windows
            if idx % 100 == 0:
                misc.printstatusFlush("[ BARCODE LINKING ]\t" +
                                      misc.reportProgress(idx, n_windows))

            # Ignore comparisons to the same contig and calculate outliers
            # In low coverage datasets the amount of 0's might cloud any
            # actual signal
            fractions = fractions.drop(labels=[contig + "s", contig + "e"],
                                       errors="ignore")
            fractions = fractions[fractions > 0]
            if len(fractions) == 0:
                continue

            minor_outliers = calcOutliers(fractions, barcode_factor)
            minor_outliers = minor_outliers[
                minor_outliers > min_barcode_fraction]

            # Series.items() rather than iteritems(), which newer pandas
            # versions no longer provide.
            for connected_window, fraction in minor_outliers.items():
                edges.append((region, connected_window, fraction))

        misc.printstatus("[ BARCODE LINKING ]\t" +
                         misc.reportProgress(n_windows, n_windows))

        return edges
Beispiel #7
0
def trimSequences(paths, mincov):
    '''Trim away low quality regions of input sequences

    Description:
        Because de novo assembled contigs often end in low quality regions
        that are of too poor sequence to find good overlaps between, we want to
        trim input contigs of regions where reads don't map. Only trim regions
        where there is a potential overlap, i.e. NOT at the start of the first
        contig and end of the last contig in a path.

    Args:
        paths (list): list of lists. Each nested list contains ordered
            graph_building.Junction objects.
        mincov (int): Trim contig ends with lower average coverage than this
            value
    Returns:
        dict: trimmed_fasta_coords. Keys: input contig headers, values:
            start and end coordinates to keep, in addition to True or False
            for start and end if they were trimmed or not.
    '''
    # trimmed_fasta_coords is a dict with coords to keep from original fasta
    # Format: {contig: [start_coord, end_coord, bool, bool]}
    # Start by filling with old coords, which will then be changed
    trimmed_fasta_coords = {
        ctg: [0, fastafile.lengths[ref_idx], False, False]
        for ref_idx, ctg in enumerate(fastafile.references)
    }

    # Side character -> (index of the coordinate, index of its trimmed flag)
    side_slots = {"s": (0, 2), "e": (1, 3)}

    def trim_window(window):
        # Trim the contig side named by window ("<contig>s" or "<contig>e")
        # unless that side has been trimmed before. Does nothing for None
        # and for windows with any other side character.
        if window is None:
            return
        tig, side = window[:-1], window[-1]
        if side not in side_slots:
            return
        coord_slot, flag_slot = side_slots[side]
        if trimmed_fasta_coords[tig][flag_slot]:
            return
        updated = list(trimmed_fasta_coords[tig])
        # The value from trimFasta is added to the coordinate as is.
        # NOTE(review): presumably trimFasta returns a negative offset for
        # the "e" side - confirm.
        updated[coord_slot] += trimFasta(trimmed_fasta_coords, window, mincov)
        updated[flag_slot] = True
        trimmed_fasta_coords[tig] = updated

    # Then find new coordinates for all sequences to merge
    n_paths = len(paths)
    for idx, path in enumerate(paths):
        # Report progress every 5 paths
        if idx % 5 == 0:
            misc.printstatusFlush("[ TRIMMING ]\t" +
                                  misc.reportProgress(idx + 1, n_paths))

        for junction in path:
            # Trim the sides of contigs where a junction is formed,
            # and don't trim places where there are no junctions.
            trim_window(junction.start)
            trim_window(junction.target)

            # Also trim everything in connections
            for conn in junction.connections:
                coords = trimmed_fasta_coords[conn]
                if coords[2] and coords[3]:
                    continue
                # NOTE(review): both sides are trimmed here even if one of
                # them is already flagged as trimmed - confirm that
                # trimFasta is safe to apply to an already trimmed side.
                trimmed_fasta_coords[conn] = [
                    coords[0] + trimFasta(trimmed_fasta_coords, conn + "s", mincov),
                    coords[1] + trimFasta(trimmed_fasta_coords, conn + "e", mincov),
                    True,
                    True,
                ]

    # Final status line, based on the number of paths rather than on a loop
    # index; skipped when there was nothing to process.
    if n_paths:
        misc.printstatus("[ TRIMMING ]\t" +
                         misc.reportProgress(n_paths, n_paths))

    return trimmed_fasta_coords