Exemple #1
0
def mauve_pw_align(ref, query, dirs):
    """Set up and perform a pairwise alignment with Mauve."""
    # set outputs
    mauve_outfile = dirs['mauve']+ref.name+"_"+query.name+".mauve"
    segfile = dirs['aln_segs']+ref.name+"_"+query.name+"_segs.txt"
    # check for existing alignment
    if path.exists(segfile):
        print "already done"
    else:
        # prep segments file
        open(segfile, 'w').write('')
        # purge any pre-existing sslist files
        sslist_files = from_dir(dirs['seqfiles'], re.compile(r'.*\.sslist.*'))
        for sslist in sslist_files:
            try: os.remove(dirs['seqfiles']+sslist)
            except Exception: raise
        # do Mauve alignment
        file_list = [ref.gbk, query.gbk]
        align_mauve(file_list, mauve_outfile)
        try:
            # parse Mauve output (without initial clumping)
            coords = mauver_load2_k0(mauve_outfile+".backbone", 0)
            print "\nSegment results:", len(coords), '->',
            # chop segments that are too long
            chop_array = chop_rows(coords, max_size, chop_mode)
            print len(chop_array), 'segments <', max_size, 'bp'
            # make detailed pairwise alignments of the segments
            print "Aligning segments ..."
            ref_rec = load_genbank(ref.gbk)
            query_rec = load_genbank(query.gbk)
            id = iter_align(chop_array, ref_rec, query_rec,
                            dirs['aln_segs'], segfile)
            print "Results:", id, "% id. overall"
        except IOError:
            print "\nERROR: Mauve alignment failed"
Exemple #2
0
def mauve_pw_align(ref, query, r_root_dir, g_root_dir, dirs, run, max_size,
                   chop_mode, mauve_exec, mtype):
    """Set up and perform a pairwise alignment with Mauve."""
    aln_dir = r_root_dir + run + dirs['aln_segs']
    mauve_dir = r_root_dir + run + dirs['mauve']
    # set outputs
    mauve_outfile = mauve_dir + ref.name + "_" + query.name + ".mauve"
    segfile = aln_dir + ref.name + "_" + query.name + "_segs.txt"
    # check for existing alignment
    if path.exists(segfile):
        print "already done"
    else:
        # prep segments file
        open(segfile, 'w').write('')
        # purge any pre-existing sslist files
        sslist_files = from_dir(g_root_dir, re.compile(r'.*\.sslist.*'))
        for sslist in sslist_files:
            try:
                os.remove(g_root_dir + sslist)
            except Exception:
                raise
        # do Mauve alignment
        file_list = [ref.gbk, query.gbk]
        align_mauve(file_list, mauve_outfile, mauve_exec)
        try:
            # parse Mauve output (without initial clumping)
            coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype)
            print "\nSegment results:", len(coords), '->',
            # chop segments that are too long
            chop_array = chop_rows(coords, max_size, chop_mode, mtype)
            print len(chop_array), 'segments <', max_size, 'bp'
            # make detailed pairwise alignments of the segments
            print "Aligning segments ..."
            ref_rec = load_genbank(ref.gbk)
            query_rec = load_genbank(query.gbk)
            id = iter_align(chop_array, ref_rec, query_rec, aln_dir, segfile)
            print "Results:", id, "% id. overall"
        except IOError:
            print "\nERROR: Mauve alignment failed"
            raise
Exemple #3
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes,
                    run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/"
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root + g_name + "/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root + g_name + "/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas"
        scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1,
                                 dtype=[('ctg', 'i4'), ('start', 'i4'),
                                        ('end', 'i4'), ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir + ctg_num + ".mauve"
            bb_file = mauve_file + ".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(
                    anchors_array, 0,
                    (ctg_num, anchor_seg['start'], anchor_seg['end'],
                     anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1  # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir + g_name + "_" + str(
                        ctg_num) + ".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1:  # flip record
                        record = record.reverse_complement(id=True,
                                                           name=True,
                                                           annotations=True,
                                                           description=True)
                    ctg_list.append(record)
                else:  # workaround for having 0 value leftover from stub
                    pass  # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try:
                    ctg_num = match.group(1)
                except Exception:
                    ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name + " scaffold from " + ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            print ""
Exemple #4
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator,
                    genomes, run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root+g_name+"/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas"
        scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'),
                                           ('start', 'i4'),
                                           ('end', 'i4'),
                                           ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir+ctg_num+".mauve"
            bb_file = mauve_file+".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(anchors_array, 0,
                                          (ctg_num,
                                           anchor_seg['start'],
                                           anchor_seg['end'],
                                           anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1 # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1: # flip record
                        record = record.reverse_complement(id=True, name=True,
                            annotations=True, description=True)
                    ctg_list.append(record)
                else: # workaround for having 0 value leftover from stub
                    pass # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try: ctg_num = match.group(1)
                except Exception: ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name+" scaffold from "+ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            print ""
Exemple #5
0
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                  mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/contigs/"
    q_ctgs_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root + g_name + "/"
        mauve_dir = mauve_root + g_name + "/"
        aln_segs_root = segments_root + g_name + "/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir + item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir + ctg_num + ".mauve"
                aln_segs_dir = aln_segs_root + ctg_num + "/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir + ctg_num + "_" + ref_n + "_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile + ".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""
Exemple #6
0
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                     max_size, chop_mode, mtype, mauve_exec):
    """Align constructs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/constructs/"
    segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/constructs/"
    scaff_root = run_root + run_dirs['scaffolds_dir'] + ref_n + "/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(
        ["\n\n# Align scaffold constructs to reference @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        scaff_gbk = scaff_root + g_name + "_" + ref_n + "_scaffold.gbk"
        file_list = (ref_ctg_file, scaff_gbk)
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root + g_name + "/"
        aln_segs_dir = segments_root + g_name + "/"
        ensure_dir([mauve_dir, aln_segs_dir])
        mauve_outfile = mauve_dir + g_name + "_" + ref_n + ".mauve"
        segfile = aln_segs_dir + g_name + "_" + ref_n + "_segs.txt"
        # abort if the reference file is not found
        try:
            open(ref_ctg_file, 'r')
        except IOError:
            msg = "ERROR: Reference file not found"
            print msg
            run_ref.log(msg)
            raise
        # abort if there is no scaffold construct
        try:
            open(scaff_gbk, 'r')
        except IOError:
            msg = "WARNING: No scaffold construct to align"
            print msg
            run_ref.log(msg)
        else:
            # prep segments file
            open(segfile, 'w').write('')
            # purge any pre-existing sslist file
            sslist_file = scaff_gbk + ".sslist"
            if os.path.isfile(sslist_file):
                try:
                    os.remove(sslist_file)
                except Exception:
                    raise
            # do Mauve alignment
            align_mauve(file_list, mauve_outfile, mauve_exec)
            try:
                # parse Mauve output (without initial clumping)
                coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype)
                print len(coords), '->',
                logstring = "".join(["\t", str(len(coords))])
                run_ref.log(logstring)
                # chop segments that are too long
                chop_array = chop_rows(coords, max_size, chop_mode, mtype)
                print len(chop_array), 'segments <', max_size, 'bp',
                logstring = "".join(["\t", str(len(chop_array))])
                run_ref.log(logstring)
                # make detailed pairwise alignments of the segments
                ref_rec = load_genbank(ref_ctg_file)
                query_rec = load_genbank(scaff_gbk)
                id = iter_align(chop_array, ref_rec, query_rec, aln_segs_dir,
                                segfile)
                print "@", id, "% id. overall"
                logstring = "".join(["\t", str(id)])
                run_ref.log(logstring)
            except IOError:
                msg = "\nERROR: Mauve alignment failed"
                run_ref.log(msg)
                print msg
Exemple #7
0
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs,
                  genomes, mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/"
    q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root+g_name+"/"
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_root = segments_root+g_name+"/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir+item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir+ctg_num+".mauve"
                aln_segs_dir = aln_segs_root+ctg_num+"/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile+".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""
Exemple #8
0
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs,
                     genomes, max_size, chop_mode, mtype, mauve_exec):
    """Align constructs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/constructs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/constructs/"
    scaff_root = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align scaffold constructs to reference @",
                         timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        scaff_gbk = scaff_root+g_name+"_"+ref_n+"_scaffold.gbk"
        file_list = (ref_ctg_file, scaff_gbk)
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_dir = segments_root+g_name+"/"
        ensure_dir([mauve_dir, aln_segs_dir])
        mauve_outfile = mauve_dir+g_name+"_"+ref_n+".mauve"
        segfile = aln_segs_dir+g_name+"_"+ref_n+"_segs.txt"
        # abort if the reference file is not found
        try: open(ref_ctg_file, 'r')
        except IOError:
            msg = "ERROR: Reference file not found"
            print msg
            run_ref.log(msg)
            raise
        # abort if there is no scaffold construct
        try: open(scaff_gbk, 'r')
        except IOError:
            msg = "WARNING: No scaffold construct to align"
            print msg
            run_ref.log(msg)
        else:
            # prep segments file
            open(segfile, 'w').write('')
            # purge any pre-existing sslist file
            sslist_file = scaff_gbk+".sslist"
            if os.path.isfile(sslist_file):
                try: os.remove(sslist_file)
                except Exception: raise
            # do Mauve alignment
            align_mauve(file_list, mauve_outfile, mauve_exec)
            try:
                # parse Mauve output (without initial clumping)
                coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype)
                print len(coords), '->',
                logstring = "".join(["\t", str(len(coords))])
                run_ref.log(logstring)
                # chop segments that are too long
                chop_array = chop_rows(coords, max_size, chop_mode, mtype)
                print len(chop_array), 'segments <', max_size, 'bp',
                logstring = "".join(["\t", str(len(chop_array))])
                run_ref.log(logstring)
                # make detailed pairwise alignments of the segments
                ref_rec = load_genbank(ref_ctg_file)
                query_rec = load_genbank(scaff_gbk)
                id = iter_align(chop_array, ref_rec, query_rec,
                                aln_segs_dir, segfile)
                print "@", id, "% id. overall"
                logstring = "".join(["\t", str(id)])
                run_ref.log(logstring)
            except IOError:
                msg = "\nERROR: Mauve alignment failed"
                run_ref.log(msg)
                print msg