Exemple #1
0
def resolve(input_path, output_path, test_utils, genome, is_sc, is_careful):
    """Run the rectangle-resolving pipeline and write the resulting contigs.

    Loads the graph and paired-info files found under ``input_path``, builds
    a RectangleSet, takes its bigraph at threshold 0.0, removes tips and
    loops from it, and writes ``rectangles.log``, several FASTA files and
    ``last_graph`` under ``output_path``.

    Args:
        input_path: directory containing the ``late_pair_info_counted*`` and
            ``distance_estimation*`` input files.
        output_path: directory that receives the log, FASTA files and graph.
        test_utils: forwarded to RectangleSet and check_diags.check.
        genome: when truthy, check_diags.check is run with it and its output
            is written to ``check_log.txt``.
        is_sc: forwarded to save_fasta and fasta_for_long_contigs.
        is_careful: forwarded to save_fasta and fasta_for_long_contigs.

    Returns:
        None. Returns early, after printing a message and before the logger
        is set up, when ``median - RL`` from the config is not positive.
    """
    grp_filename = os.path.join(input_path, 'late_pair_info_counted.grp')
    sqn_filename = os.path.join(input_path, 'late_pair_info_counted.sqn')
    cvr_filename = os.path.join(input_path, 'late_pair_info_counted.cvr')
    first_prd_filename = os.path.join(input_path,
                                      'late_pair_info_counted_0.prd')

    if experimental.filter != experimental.Filter.spades:
        prd_filename = first_prd_filename
    else:
        prd_filename = os.path.join(input_path,
                                    'distance_estimation_0_cl.prd')

    # Only the pathsets filter reads the .pst file; bind the name
    # unconditionally so it is never undefined.
    pst_filename = None
    if experimental.filter == experimental.Filter.pathsets:
        pst_filename = os.path.join(input_path, 'distance_estimation.pst')

    inf_filename = os.path.join(input_path,
                                'late_pair_info_counted_est_params.info')
    log_filename = os.path.join(output_path, 'rectangles.log')
    config = saveparser.config(inf_filename)

    d = config['median'] - config['RL']

    if d <= 0:
        # d <= 0 means the median insert size does not exceed the read
        # length. Single-argument print behaves the same as a statement
        # (Python 2) and as a function (Python 3).
        print("Insert size %s is not greater than read length %s, "
              "can't do anything" % (config['median'], config['RL']))
        return

    makelogger(log_filename)
    logger = logging.getLogger('rectangles')

    logger.info("Rectangle Resolving %s..." % input_path)
    logger.info("d = %d..." % d)

    #################################
    # PARSE INITIAL DE BRUIJN GRAPH #
    #################################

    ingraph = Graph()
    ingraph.load(grp_filename, sqn_filename, cvr_filename)
    ingraph.check()
    logger.info("init rectangles set")
    rs = RectangleSet(ingraph, d, test_utils, prd_filename,
                      first_prd_filename, config)
    if experimental.filter == experimental.Filter.pathsets:
        rs.pathsets(pst_filename)
    else:
        logger.info("begin filter")
        rs.filter(prd_filename, config)
    logger.info("  RectangleSet built.")

    threshold = 0.0
    logger.info("  Checking threshold %f..." % threshold)
    maxbgraph = rs.bgraph(threshold)
    save_fasta(maxbgraph, output_path, is_sc, 'begin_rectangles.fasta',
               is_careful)
    logger.info("outputed begin rectangles")

    maxbgraph.check_tips(ingraph.K)
    save_fasta(maxbgraph, output_path, is_sc, 'delete_tips.fasta', is_careful)
    logger.info("outputed delete tips")

    edges_before_loop = maxbgraph.delete_loops(ingraph.K, 1000, 10)
    save_fasta(maxbgraph, output_path, is_sc,
               "delete_tips_delete_loops_1000.fasta", is_careful)
    logger.info("outputed delete loops")

    edges_before_loop_DG = ingraph.find_loops(10, 1000, rs)
    logger.info("find DG 1000 loops")
    # Drop the loops that delete_loops already reported, so only the
    # remaining ones are passed to delete_missing_loops.
    for eid in set(edges_before_loop_DG.keys()) & edges_before_loop:
        del edges_before_loop_DG[eid]

    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 1000, 10)
    logger.info("delete missing loops")
    save_fasta(maxbgraph, output_path, is_sc,
               'delete_tips_delete_all_loops_1000.fasta', is_careful)

    # Second pass with different find_loops / delete_missing_loops limits.
    edges_before_loop_DG = ingraph.find_loops(4, 10000, rs)
    for eid in set(edges_before_loop_DG.keys()) & edges_before_loop:
        del edges_before_loop_DG[eid]

    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 10000, 10)
    outgraph = save_fasta(maxbgraph, output_path, is_sc,
                          "after_deleting_big_loops.fasta", is_careful)

    additional_paired_info = dict()
    should_connect = maxbgraph.edges_expand(5000)
    should_connect_by_first_pair_info = maxbgraph.use_scaffold_paired_info(
        2 * maxbgraph.d, rs.prd_for_scaffold)

    for (e1id, e2id) in should_connect_by_first_pair_info:
        conj_e1id = maxbgraph.es[e1id].conj.eid
        # Register a pair only when neither e1, its conjugate, nor e2 has
        # an entry yet; the conjugate entry is stored in reversed order.
        if (e1id not in additional_paired_info
                and conj_e1id not in additional_paired_info
                and e2id not in additional_paired_info):
            additional_paired_info[e1id] = [
                maxbgraph.es[e1id], maxbgraph.es[e2id]
            ]
            additional_paired_info[conj_e1id] = [
                maxbgraph.es[e2id].conj, maxbgraph.es[e1id].conj
            ]

    # Context managers make sure every output file is flushed and closed.
    extend_path = os.path.join(output_path, "rectangles_extend.fasta")
    with open(extend_path, "w") as extend_file:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, is_careful, extend_file,
            should_connect, additional_paired_info)

    before_scaffold_path = os.path.join(
        output_path, "rectangles_extend_before_scaffold.fasta")
    with open(before_scaffold_path, "w") as before_scaffold_file:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, is_careful, before_scaffold_file,
            should_connect, dict())

    outgraph.save(os.path.join(output_path, "last_graph"))

    if genome:
        # The original read `maxgraph.K`, but `maxgraph` is never defined in
        # this function (NameError). Use the K of the input graph, which is
        # the value passed to every other step above.
        # NOTE(review): confirm ingraph.K is the K check_diags.check expects.
        check_log_path = os.path.join(output_path, "check_log.txt")
        with open(check_log_path, "w") as check_log:
            check_diags.check(genome, maxbgraph, ingraph.K, check_log,
                              test_utils)
Exemple #2
0
def resolve(input_path, output_path, test_utils, genome, is_sc):
    """Run the rectangle-resolving pipeline and write the resulting contigs.

    Creates ``output_path`` if needed, loads the graph and paired-info files
    found under ``input_path``, builds a RectangleSet, picks a bigraph among
    the candidate thresholds, removes tips and loops from it, and writes
    ``rectangles.log``, several FASTA files and ``last_graph`` under
    ``output_path``.

    Args:
        input_path: directory containing the ``late_pair_info_counted*`` and
            ``distance_*`` input files.
        output_path: directory that receives the log, FASTA files and graph.
        test_utils: forwarded to RectangleSet and check_diags.check.
        genome: when truthy, check_diags.check is run with it and its output
            is written to ``check_log.txt``.
        is_sc: forwarded to bgraph.project and fasta_for_long_contigs.

    Returns:
        None. Returns early, after logging an error, when no threshold
        yields a bigraph with diagonals.
    """
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    grp_filename = os.path.join(input_path, 'late_pair_info_counted.grp')
    sqn_filename = os.path.join(input_path, 'late_pair_info_counted.sqn')
    cvr_filename = os.path.join(input_path, 'late_pair_info_counted.cvr')
    first_prd_filename = os.path.join(input_path,
                                      'late_pair_info_counted.prd')
    if experimental.filter != experimental.Filter.spades:
        prd_filename = os.path.join(input_path, 'late_pair_info_counted.prd')
    else:
        prd_filename = os.path.join(input_path, 'distance_filling_cl.prd')
    if experimental.filter == experimental.Filter.pathsets:
        pst_filename = os.path.join(input_path, 'distance_estimation.pst')
    else:
        pst_filename = None
    inf_filename = os.path.join(input_path,
                                'late_pair_info_counted_est_params.info')
    log_filename = os.path.join(output_path, 'rectangles.log')
    config = saveparser.config(inf_filename)
    d = config.median - config.RL

    makelogger(log_filename)
    logger = logging.getLogger('rectangles')

    logger.info("Rectangle Resolving %s..." % input_path)
    logger.info("d = %d..." % d)

    #################################
    # PARSE INITIAL DE BRUIJN GRAPH #
    #################################

    ingraph = Graph()
    ingraph.load(grp_filename, sqn_filename, cvr_filename)
    ingraph.check()
    edges_before_loop_DG = ingraph.find_loops(10, 1000)
    # NOTE(review): maxN50 is never updated below, so the final
    # "Best N50" log line always reports 0.
    maxN50 = 0
    maxgraph = None
    maxbgraph = None
    maxthreshold = 0

    rs = RectangleSet(ingraph, d, test_utils, prd_filename,
                      first_prd_filename, config)
    if experimental.filter == experimental.Filter.pathsets:
        rs.pathsets(pst_filename)
    else:
        rs.filter(prd_filename, config)
    logger.info("  RectangleSet built.")

    if experimental.filter == experimental.Filter.spades:
        thresholds = [0.0]  # everything supported by paired info
    elif experimental.filter == experimental.Filter.pathsets:
        thresholds = [-1]  # everything from pathsets file
    else:
        thresholds = rs.percentiles()
    logger.info("  Checking thresholds %s..." % thresholds)

    # NOTE(review): the last threshold visited that has diagonals wins, and
    # set iteration order is arbitrary -- confirm this is the intended
    # selection rule.
    for threshold in set(thresholds):
        logger.info("  Checking threshold %f..." % threshold)
        bgraph = rs.bgraph(threshold)
        if not bgraph.diagonals:
            continue
        bgraph.build_missing_rectangles(ingraph.K, rs)
        bgraph.condense()
        maxgraph = bgraph.project(output_path, is_sc)
        maxbgraph = bgraph
        maxthreshold = threshold

    if maxbgraph is None:
        # Without this guard the next statement fails with an opaque
        # AttributeError on None.
        logger.error("No threshold produced a bigraph with diagonals, "
                     "can't do anything")
        return

    def write_fasta(graph, name):
        # Write `graph` as FASTA to `name` under output_path, closing the file.
        with open(os.path.join(output_path, name), 'w') as fasta_file:
            graph.fasta(fasta_file)

    write_fasta(maxgraph, 'begin_rectangles.fasta')
    maxbgraph.save(output_path, ingraph.K)

    maxbgraph.check_tips(ingraph.K)
    outgraph = maxbgraph.project(output_path, is_sc)
    write_fasta(outgraph, 'delete_tips.fasta')

    edges_before_loop = maxbgraph.delete_loops(ingraph.K, 1000, 10)
    maxbgraph.condense()
    outgraph = maxbgraph.project(output_path, is_sc)
    write_fasta(outgraph, "delete_tips_delete_loops_1000.fasta")

    # Drop the loops that delete_loops already reported, so only the
    # remaining ones are passed to delete_missing_loops.
    to_del = set(eid for eid in edges_before_loop_DG
                 if eid in edges_before_loop)
    # Single-argument print gives the same output on Python 2 and 3.
    print("to_del %d" % len(to_del))
    for eid in to_del:
        del edges_before_loop_DG[eid]
    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 1000, 10)
    maxbgraph.condense()
    outgraph = maxbgraph.project(output_path, is_sc)
    write_fasta(outgraph, 'delete_tips_delete_all_loops_1000.fasta')

    # Second pass with different find_loops / delete_missing_loops limits.
    # The original had an extra `... or maxbgraph.delete_missing_loops(
    # ingraph.K, 10000, 10)` fallback here; it passed three arguments where
    # every other call passes four and ran only when find_loops returned
    # nothing, so it has been removed.
    edges_before_loop_DG = ingraph.find_loops(4, 10000)
    to_del = set(eid for eid in edges_before_loop_DG
                 if eid in edges_before_loop)
    print("to_del %d" % len(to_del))
    for eid in to_del:
        del edges_before_loop_DG[eid]
    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 10000, 10)
    maxbgraph.condense()
    outgraph = maxbgraph.project(output_path, is_sc)
    write_fasta(outgraph, "after_deleting_big_loops.fasta")

    additional_paired_info = dict()
    should_connect = maxbgraph.edges_expand(5000)
    should_connect_by_first_pair_info = maxbgraph.use_scaffold_paired_info(
        2 * maxbgraph.d, rs.additional_prd)
    for (e1id, e2id) in should_connect_by_first_pair_info:
        conj_e1id = maxbgraph.es[e1id].conj.eid
        # Register a pair only when neither e1, its conjugate, nor e2 has
        # an entry yet; the conjugate entry is stored in reversed order.
        if (e1id not in additional_paired_info
                and conj_e1id not in additional_paired_info
                and e2id not in additional_paired_info):
            additional_paired_info[e1id] = [
                maxbgraph.es[e1id], maxbgraph.es[e2id]
            ]
            additional_paired_info[conj_e1id] = [
                maxbgraph.es[e2id].conj, maxbgraph.es[e1id].conj
            ]

    extend_path = os.path.join(output_path, "rectangles_extend.fasta")
    with open(extend_path, "w") as extend_file:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, extend_file, should_connect,
            additional_paired_info)

    before_scaffold_path = os.path.join(
        output_path, "rectangles_extend_before_scaffold.fasta")
    with open(before_scaffold_path, "w") as before_scaffold_file:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, before_scaffold_file,
            should_connect, dict())

    # NOTE(review): hard-coded edge ids look like leftover debugging output
    # for one particular data set -- confirm whether this is still wanted.
    maxbgraph.print_about_edges([20586, 23014, 23806, 19630, 23350],
                                ingraph.K)
    outgraph.save(os.path.join(output_path, "last_graph"))

    if genome:
        check_log_path = os.path.join(output_path, "check_log.txt")
        with open(check_log_path, "w") as check_log:
            check_diags.check(genome, maxbgraph, maxgraph.K, check_log,
                              test_utils)

    # %f rather than %d: thresholds may be fractional (see the per-threshold
    # log line above) and %d would silently truncate them.
    logger.info("Best Threshold = %f" % maxthreshold)
    logger.info("Best N50 = %d" % maxN50)
Exemple #3
0
def resolve(input_path, output_path, test_utils, genome, is_sc, is_careful):
    """Resolve repeats with rectangles and write the resulting contigs.

    Reads the graph and paired-info files from ``input_path``, builds a
    RectangleSet, takes its bigraph at threshold 0.0, cleans tips and loops
    out of it, and writes ``rectangles.log``, several FASTA files and
    ``last_graph`` to ``output_path``.

    Args:
        input_path: directory containing the ``late_pair_info_counted*`` and
            ``distance_estimation*`` input files.
        output_path: directory that receives the log, FASTA files and graph.
        test_utils: forwarded to RectangleSet and check_diags.check.
        genome: when truthy, check_diags.check is run with it and its output
            is written to ``check_log.txt``.
        is_sc: forwarded to save_fasta and fasta_for_long_contigs.
        is_careful: forwarded to save_fasta and fasta_for_long_contigs.

    Returns:
        None. Returns early, after printing a message and before the logger
        is set up, when ``median - RL`` from the config is not positive.
    """
    grp_filename = os.path.join(input_path, 'late_pair_info_counted.grp')
    sqn_filename = os.path.join(input_path, 'late_pair_info_counted.sqn')
    cvr_filename = os.path.join(input_path, 'late_pair_info_counted.cvr')
    first_prd_filename = os.path.join(input_path,
                                      'late_pair_info_counted_0.prd')

    if experimental.filter != experimental.Filter.spades:
        prd_filename = first_prd_filename
    else:
        prd_filename = os.path.join(input_path,
                                    'distance_estimation_0_cl.prd')

    # The .pst file is only read by the pathsets filter; bind the name
    # unconditionally so it is never undefined.
    pst_filename = None
    if experimental.filter == experimental.Filter.pathsets:
        pst_filename = os.path.join(input_path, 'distance_estimation.pst')

    inf_filename = os.path.join(input_path,
                                'late_pair_info_counted_est_params.info')
    log_filename = os.path.join(output_path, 'rectangles.log')
    config = saveparser.config(inf_filename)

    d = config['median'] - config['RL']

    if d <= 0:
        # d <= 0 means the median insert size does not exceed the read
        # length. Single-argument print behaves the same as a statement
        # (Python 2) and as a function (Python 3).
        print("Insert size %s is not greater than read length %s, "
              "can't do anything" % (config['median'], config['RL']))
        return

    makelogger(log_filename)
    logger = logging.getLogger('rectangles')

    logger.info("Rectangle Resolving %s..." % input_path)
    logger.info("d = %d..." % d)

    #################################
    # PARSE INITIAL DE BRUIJN GRAPH #
    #################################

    ingraph = Graph()
    ingraph.load(grp_filename, sqn_filename, cvr_filename)
    ingraph.check()
    logger.info("init rectangles set")
    rs = RectangleSet(ingraph, d, test_utils, prd_filename,
                      first_prd_filename, config)
    if experimental.filter == experimental.Filter.pathsets:
        rs.pathsets(pst_filename)
    else:
        logger.info("begin filter")
        rs.filter(prd_filename, config)
    logger.info("  RectangleSet built.")

    threshold = 0.0
    logger.info("  Checking threshold %f..." % threshold)
    maxbgraph = rs.bgraph(threshold)
    save_fasta(maxbgraph, output_path, is_sc, 'begin_rectangles.fasta',
               is_careful)
    logger.info("outputed begin rectangles")

    maxbgraph.check_tips(ingraph.K)
    save_fasta(maxbgraph, output_path, is_sc, 'delete_tips.fasta', is_careful)
    logger.info("outputed delete tips")

    edges_before_loop = maxbgraph.delete_loops(ingraph.K, 1000, 10)
    save_fasta(maxbgraph, output_path, is_sc,
               "delete_tips_delete_loops_1000.fasta", is_careful)
    logger.info("outputed delete loops")

    edges_before_loop_DG = ingraph.find_loops(10, 1000, rs)
    logger.info("find DG 1000 loops")
    # Skip the loops delete_loops already reported; only the remaining
    # ones are handed to delete_missing_loops.
    already_handled = set(edges_before_loop_DG.keys()) & edges_before_loop
    for eid in already_handled:
        del edges_before_loop_DG[eid]

    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 1000, 10)
    logger.info("delete missing loops")
    save_fasta(maxbgraph, output_path, is_sc,
               'delete_tips_delete_all_loops_1000.fasta', is_careful)

    # Second pass with different find_loops / delete_missing_loops limits.
    edges_before_loop_DG = ingraph.find_loops(4, 10000, rs)
    already_handled = set(edges_before_loop_DG.keys()) & edges_before_loop
    for eid in already_handled:
        del edges_before_loop_DG[eid]

    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 10000, 10)
    outgraph = save_fasta(maxbgraph, output_path, is_sc,
                          "after_deleting_big_loops.fasta", is_careful)

    additional_paired_info = dict()
    should_connect = maxbgraph.edges_expand(5000)
    should_connect_by_first_pair_info = maxbgraph.use_scaffold_paired_info(
        2 * maxbgraph.d, rs.prd_for_scaffold)

    for (e1id, e2id) in should_connect_by_first_pair_info:
        conj_e1id = maxbgraph.es[e1id].conj.eid
        # Register a pair only when neither e1, its conjugate, nor e2 has
        # an entry yet; the conjugate entry is stored in reversed order.
        if (e1id not in additional_paired_info
                and conj_e1id not in additional_paired_info
                and e2id not in additional_paired_info):
            additional_paired_info[e1id] = [
                maxbgraph.es[e1id], maxbgraph.es[e2id]
            ]
            additional_paired_info[conj_e1id] = [
                maxbgraph.es[e2id].conj, maxbgraph.es[e1id].conj
            ]

    # Context managers make sure every output file is flushed and closed.
    extend_path = os.path.join(output_path, "rectangles_extend.fasta")
    with open(extend_path, "w") as extend_file:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, is_careful, extend_file,
            should_connect, additional_paired_info)

    before_scaffold_path = os.path.join(
        output_path, "rectangles_extend_before_scaffold.fasta")
    with open(before_scaffold_path, "w") as before_scaffold_file:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, is_careful, before_scaffold_file,
            should_connect, dict())

    outgraph.save(os.path.join(output_path, "last_graph"))

    if genome:
        # The original read `maxgraph.K`, but `maxgraph` is never defined in
        # this function (NameError). Use the K of the input graph, which is
        # the value passed to every other step above.
        # NOTE(review): confirm ingraph.K is the K check_diags.check expects.
        check_log_path = os.path.join(output_path, "check_log.txt")
        with open(check_log_path, "w") as check_log:
            check_diags.check(genome, maxbgraph, ingraph.K, check_log,
                              test_utils)