コード例 #1
0
ファイル: rrr.py プロジェクト: ablab/rectangles
def _forget_resolved_loops(loop_edges, resolved_edges):
    """Delete from ``loop_edges`` (in place) every key also found in ``resolved_edges``."""
    to_del = [eid for eid in loop_edges if eid in resolved_edges]
    # Single-argument form prints the same text under Python 2 and Python 3.
    print("to_del %d" % len(to_del))
    for eid in to_del:
        del loop_edges[eid]


def _write_fasta(graph, output_path, filename):
    """Write ``graph.fasta`` output to ``output_path/filename`` and close the file."""
    with open(os.path.join(output_path, filename), 'w') as handle:
        graph.fasta(handle)


def resolve(input_path, output_path, test_utils, genome, is_sc):
    """Run the rectangle-resolving pipeline for one input directory.

    Loads the graph and paired-info files from ``input_path``, builds a
    ``RectangleSet``, projects a bgraph for each candidate threshold, then
    removes tips and loops step by step, writing a FASTA snapshot into
    ``output_path`` after each stage.

    Args:
        input_path: directory holding the ``late_pair_info_counted.*`` files
            (and, depending on ``experimental.filter``,
            ``distance_filling_cl.prd`` or ``distance_estimation.pst``).
        output_path: directory for logs, graphs and FASTA files; created
            with ``os.mkdir`` when it does not exist.
        test_utils: passed through to ``RectangleSet`` and
            ``check_diags.check``.
        genome: when truthy, passed to ``check_diags.check`` and a
            ``check_log.txt`` is written.
        is_sc: passed through to ``project`` and ``fasta_for_long_contigs``.

    Raises:
        RuntimeError: if no threshold produced a bgraph with diagonals, so
            there is nothing to resolve.
    """
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    use_spades = experimental.filter == experimental.Filter.spades
    use_pathsets = experimental.filter == experimental.Filter.pathsets

    grp_filename = os.path.join(input_path, 'late_pair_info_counted.grp')
    sqn_filename = os.path.join(input_path, 'late_pair_info_counted.sqn')
    cvr_filename = os.path.join(input_path, 'late_pair_info_counted.cvr')
    first_prd_filename = os.path.join(input_path, 'late_pair_info_counted.prd')
    # The spades filter reads a different paired-info file than the others.
    prd_filename = os.path.join(
        input_path,
        'distance_filling_cl.prd' if use_spades else 'late_pair_info_counted.prd')
    pst_filename = None
    if use_pathsets:
        pst_filename = os.path.join(input_path, 'distance_estimation.pst')
    inf_filename = os.path.join(input_path, 'late_pair_info_counted_est_params.info')
    log_filename = os.path.join(output_path, 'rectangles.log')
    config = saveparser.config(inf_filename)
    d = config.median - config.RL

    makelogger(log_filename)
    logger = logging.getLogger('rectangles')

    logger.info("Rectangle Resolving %s...", input_path)
    logger.info("d = %d...", d)

    #################################
    # PARSE INITIAL DE BRUIJN GRAPH #
    #################################

    ingraph = Graph()
    ingraph.load(grp_filename, sqn_filename, cvr_filename)
    ingraph.check()
    edges_before_loop_DG = ingraph.find_loops(10, 1000)
    # NOTE(review): max_n50 is never updated in this function, so the final
    # "Best N50" log line always reports 0.
    max_n50 = 0
    maxgraph = None
    maxbgraph = None
    maxthreshold = 0

    rs = RectangleSet(ingraph, d, test_utils, prd_filename, first_prd_filename, config)
    if use_pathsets:
        rs.pathsets(pst_filename)
    else:
        rs.filter(prd_filename, config)
    logger.info("  RectangleSet built.")
    if use_spades:
        thresholds = [0.0]  # everything supported by paired info
    elif use_pathsets:
        thresholds = [-1]  # everything from pathsets file
    else:
        thresholds = rs.percentiles()
    logger.info("  Checking thresholds %s...", thresholds)
    # NOTE(review): no candidate comparison happens here; the last threshold
    # (in set iteration order) that yields diagonals is the one kept.
    for threshold in set(thresholds):
        logger.info("  Checking threshold %f...", threshold)
        bgraph = rs.bgraph(threshold)
        if not bgraph.diagonals:
            continue
        bgraph.build_missing_rectangles(ingraph.K, rs)
        bgraph.condense()
        maxgraph = bgraph.project(output_path, is_sc)
        maxbgraph = bgraph
        maxthreshold = threshold

    if maxgraph is None:
        # Without this guard the next line fails with an opaque
        # AttributeError on None.
        logger.error("  No threshold produced a graph with diagonals.")
        raise RuntimeError(
            "rectangle resolving failed: no threshold produced a graph "
            "with diagonals for %s" % input_path)

    _write_fasta(maxgraph, output_path, 'begin_rectangles.fasta')
    maxbgraph.save(output_path, ingraph.K)
    maxbgraph.check_tips(ingraph.K)
    outgraph = maxbgraph.project(output_path, is_sc)
    _write_fasta(outgraph, output_path, 'delete_tips.fasta')

    # First pass: loops found with find_loops(10, 1000).
    edges_before_loop = maxbgraph.delete_loops(ingraph.K, 1000, 10)
    maxbgraph.condense()
    outgraph = maxbgraph.project(output_path, is_sc)
    _write_fasta(outgraph, output_path, "delete_tips_delete_loops_1000.fasta")
    _forget_resolved_loops(edges_before_loop_DG, edges_before_loop)
    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 1000, 10)
    maxbgraph.condense()
    outgraph = maxbgraph.project(output_path, is_sc)
    _write_fasta(outgraph, output_path, 'delete_tips_delete_all_loops_1000.fasta')

    # Second pass: loops found with find_loops(4, 10000).
    # The original fell back to delete_missing_loops(K, 10000, 10) when no
    # loops were found; that call omitted the edge-map argument used by the
    # other call sites, so an empty result is now simply kept as-is.
    edges_before_loop_DG = ingraph.find_loops(4, 10000)
    _forget_resolved_loops(edges_before_loop_DG, edges_before_loop)
    maxbgraph.delete_missing_loops(edges_before_loop_DG, ingraph.K, 10000, 10)
    maxbgraph.condense()
    outgraph = maxbgraph.project(output_path, is_sc)
    _write_fasta(outgraph, output_path, "after_deleting_big_loops.fasta")

    additional_paired_info = dict()
    should_connect = maxbgraph.edges_expand(5000)
    should_connect_by_first_pair_info = maxbgraph.use_scaffold_paired_info(
        2 * maxbgraph.d, rs.additional_prd)
    edges = maxbgraph.es
    for (e1id, e2id) in should_connect_by_first_pair_info:
        conj_eid = edges[e1id].conj.eid
        already_used = (e1id in additional_paired_info
                        or conj_eid in additional_paired_info
                        or e2id in additional_paired_info)
        if already_used:
            continue
        # Register the pair for the edge and the mirrored pair for its conjugate.
        additional_paired_info[e1id] = [edges[e1id], edges[e2id]]
        additional_paired_info[conj_eid] = [edges[e2id].conj, edges[e1id].conj]

    with open(os.path.join(output_path, "rectangles_extend.fasta"), "w") as handle:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, handle, should_connect,
            additional_paired_info)
    with open(os.path.join(output_path, "rectangles_extend_before_scaffold.fasta"), "w") as handle:
        outgraph.fasta_for_long_contigs(
            ingraph.K, maxbgraph.d, is_sc, handle, should_connect, dict())

    # NOTE(review): hard-coded edge ids; looks like a debugging leftover.
    maxbgraph.print_about_edges([20586, 23014, 23806, 19630, 23350], ingraph.K)
    outgraph.save(os.path.join(output_path, "last_graph"))
    if genome:
        with open(os.path.join(output_path, "check_log.txt"), "w") as handle:
            check_diags.check(genome, maxbgraph, maxgraph.K, handle, test_utils)

    # %f matches the per-threshold log line above; %d truncated fractional
    # thresholds.
    logger.info("Best Threshold = %f", maxthreshold)
    logger.info("Best N50 = %d", max_n50)