Exemple #1
0
def mauve_pw_align(ref, query, dirs):
    """Set up and perform a pairwise alignment with Mauve.

    ref, query -- objects exposing .name and .gbk (GenBank file path)
    dirs -- dict of directory paths ('mauve', 'aln_segs', 'seqfiles')

    Skips all work when the segments file already exists.
    NOTE(review): reads module-level globals max_size and chop_mode
    (not parameters here, unlike the other mauve_pw_align variant in
    this file) -- confirm they are defined at module scope.
    """
    # set outputs
    mauve_outfile = dirs['mauve']+ref.name+"_"+query.name+".mauve"
    segfile = dirs['aln_segs']+ref.name+"_"+query.name+"_segs.txt"
    # check for existing alignment
    if path.exists(segfile):
        print "already done"
    else:
        # prep segments file (create/truncate to empty)
        open(segfile, 'w').write('')
        # purge any pre-existing sslist files
        sslist_files = from_dir(dirs['seqfiles'], re.compile(r'.*\.sslist.*'))
        for sslist in sslist_files:
            try: os.remove(dirs['seqfiles']+sslist)
            except Exception: raise
        # do Mauve alignment
        file_list = [ref.gbk, query.gbk]
        align_mauve(file_list, mauve_outfile)
        try:
            # parse Mauve output (without initial clumping)
            coords = mauver_load2_k0(mauve_outfile+".backbone", 0)
            print "\nSegment results:", len(coords), '->',
            # chop segments that are too long
            chop_array = chop_rows(coords, max_size, chop_mode)
            print len(chop_array), 'segments <', max_size, 'bp'
            # make detailed pairwise alignments of the segments
            print "Aligning segments ..."
            ref_rec = load_genbank(ref.gbk)
            query_rec = load_genbank(query.gbk)
            id = iter_align(chop_array, ref_rec, query_rec,
                            dirs['aln_segs'], segfile)
            print "Results:", id, "% id. overall"
        except IOError:
            print "\nERROR: Mauve alignment failed"
Exemple #2
0
def annot_ref(ref_name, ctg_fas, prot_db_name, fixed_dirs, project_id,
              blast_prefs):
    """Annotate reference contig (predict ORFs and assign function).

    ref_name -- reference name used to derive all file paths
    ctg_fas -- contig Fasta file (passed twice to annot_ctg)
    prot_db_name -- protein DB filename inside fixed_dirs['ref_dbs_dir']
    blast_prefs -- BLAST preferences forwarded to annot_ctg

    Returns the annotated SeqRecord, or reloads the existing one when the
    annotated GenBank file is already present.
    """
    # locate the COG database
    prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name
    # set inputs and outputs
    g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir']+ref_name+"/"
    ctg_cds_root = fixed_dirs['ctg_cds_dir']+ref_name+"/"
    ctg_prot_root = fixed_dirs['ctg_prot_dir']+ref_name+"/"
    ctg_blast_root = fixed_dirs['ctg_blast_dir']+ref_name+"/"
    annot_trn_root = fixed_dirs['annot_trn_dir']
    ensure_dir([g_gbk_ctgs_root, ctg_cds_root, ctg_prot_root,
                ctg_blast_root, annot_trn_root])
    trn_file = annot_trn_root+ref_name+"_annot.trn"
    g_ctg_gbk = g_gbk_ctgs_root+ref_name+"_1.gbk"
    annot_gbk = ctg_cds_root+ref_name+"_1_cds.gbk"
    annot_aa = ctg_prot_root+ref_name+"_1_aa.fas"
    blast_out = ctg_blast_root+ref_name+"_1.xml"
    # drop a zero-byte leftover BLAST output (os.stat(...)[6] is st_size)
    # so the annotation step can be redone cleanly
    if path.exists(blast_out) and os.stat(blast_out)[6]==0:
        os.remove(blast_out)
    if not path.exists(g_ctg_gbk):
        l_tag_base = ref_name+"_1"
        record = annot_ctg(ctg_fas, ctg_fas, annot_gbk,
                           annot_aa, trn_file, prot_db,
                           blast_out, l_tag_base, blast_prefs)
        record.description = ref_name+"_re-annotated"
        record.name = ref_name+"_1"
        record.dbxrefs = ["Project: "+project_id+"/"+ref_name
                          +"-like backbones"]
        record.seq.alphabet = generic_dna
        write_genbank(g_ctg_gbk, record)
    else:
        # already annotated on a previous run: just reload it
        record = load_genbank(g_ctg_gbk)
    return record
Exemple #3
0
def ContigDraw(cName, in_file, out_file):
    """Draw sequence map of a single contig to file.

    cName -- contig display name
    in_file -- GenBank file of the contig
    out_file -- destination of the rendered canvas

    NOTE(review): layout relies on module-level globals (cm, hmar, pNsize,
    u, dBL, vmar, ck_vsp, scX, incrT, incrN, dip, dop) not visible here --
    confirm they are defined where this is imported.
    """
    # load contig record
    seq_record = load_genbank(in_file)
    ctg_length = len(seq_record.seq)
    features = seq_record.features
    feat_cnt = len(features)
    # calculate main canvas dimensions
    print "\tcalculating canvas dimensions"
    if ctg_length < 25000:
        # fixed canvas width for short contigs
        hCan = 32*cm
    else:
        hCan = hmar*2 + pNsize + ctg_length*u
    vCan = dBL + vmar + feat_cnt*ck_vsp
    transX = hmar + pNsize
    transY = dBL + vmar/2 + feat_cnt*ck_vsp
    # set up main canvas
    canvas_main = Canvasser(hCan, vCan, transX, transY, out_file)
    print "\tdrawing contig baselines and features"
    # draw contig baseline and features
    BaseDraw(canvas_main, cName, ctg_length, features)
    # draw scale
    SeqScale(canvas_main, scX, incrT, incrN, dip, dop )
    # write to file and finalize the figure
    canvas_main.showPage()
    canvas_main.save()
    print "OK"
Exemple #4
0
def gbk2fas(gbk_file):
    """Convert a GenBank file to FastA format.

    gbk_file -- path to the input GenBank file; the FastA output is
    written alongside it with the '.gbk' extension replaced by '.fas'
    (or '.fas' appended when the name carries no '.gbk').

    Returns the path of the FastA file that was written.
    """
    record = load_genbank(gbk_file)
    ext_index = gbk_file.find('.gbk')
    if ext_index < 0:
        # no '.gbk' in the name: append the new extension instead of
        # silently truncating the last character (find() returns -1)
        fas_file = gbk_file+'.fas'
    else:
        fas_file = gbk_file[:ext_index]+'.fas'
    write_fasta(fas_file, record)
    return fas_file
Exemple #5
0
 def get_segs_from_feats(self, feat_type):
     """Append one segment dict to self.segs per feature of feat_type.

     Each dict carries 'coords' (start, end), 'strand', a 'name' of the
     form <feat_type>_<n> and a 'note' of the form <start>_<end>.
     """
     record = load_genbank(self.gbk)
     matching = [feature for feature in record.features
                 if feature.type == feat_type]
     for index, feature in enumerate(matching, start=1):
         start = int(str(feature.location.start))
         end = int(str(feature.location.end))
         segment = {'coords': (start, end),
                    'strand': feature.strand,
                    'name': feat_type+'_'+str(index),
                    'note': str(start)+'_'+str(end)}
         self.segs.append(segment)
Exemple #6
0
def mauve_pw_align(ref, query, r_root_dir, g_root_dir, dirs, run, max_size,
                   chop_mode, mauve_exec, mtype):
    """Set up and perform a pairwise alignment with Mauve."""
    aln_dir = r_root_dir + run + dirs['aln_segs']
    mauve_dir = r_root_dir + run + dirs['mauve']
    # set outputs
    mauve_outfile = mauve_dir + ref.name + "_" + query.name + ".mauve"
    segfile = aln_dir + ref.name + "_" + query.name + "_segs.txt"
    # check for existing alignment
    if path.exists(segfile):
        print "already done"
    else:
        # prep segments file
        open(segfile, 'w').write('')
        # purge any pre-existing sslist files
        sslist_files = from_dir(g_root_dir, re.compile(r'.*\.sslist.*'))
        for sslist in sslist_files:
            try:
                os.remove(g_root_dir + sslist)
            except Exception:
                raise
        # do Mauve alignment
        file_list = [ref.gbk, query.gbk]
        align_mauve(file_list, mauve_outfile, mauve_exec)
        try:
            # parse Mauve output (without initial clumping)
            coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype)
            print "\nSegment results:", len(coords), '->',
            # chop segments that are too long
            chop_array = chop_rows(coords, max_size, chop_mode, mtype)
            print len(chop_array), 'segments <', max_size, 'bp'
            # make detailed pairwise alignments of the segments
            print "Aligning segments ..."
            ref_rec = load_genbank(ref.gbk)
            query_rec = load_genbank(query.gbk)
            id = iter_align(chop_array, ref_rec, query_rec, aln_dir, segfile)
            print "Results:", id, "% id. overall"
        except IOError:
            print "\nERROR: Mauve alignment failed"
            raise
Exemple #7
0
def map_cst_aln(run_ref, ref_gbk, genome, scaff_gbk, segs_root, maps_root,
                segtype, min_size, fct_flags, fct_colors, idpt):
    """Generate map of construct aligned to reference.

    genome -- dict with 'name' and optionally 'offset' (pair of ints);
    a missing 'offset' key disables offsetting entirely.
    Skips mapping with a warning when scaff_gbk cannot be opened;
    load/map failures are logged via run_ref.log and printed.
    """
    # set inputs and outputs
    g_name = genome['name']
    ref_ctg_n = run_ref.name
    seg_file = segs_root + g_name + "/" + g_name + "_" + ref_ctg_n + "_segs.txt"
    map_file = maps_root + g_name + "_vs_" + ref_ctg_n + ".pdf"
    # start mapping
    try:
        # probe for the scaffold file (NB: the handle is never closed)
        open(scaff_gbk)
    except IOError:
        print "WARNING: No scaffold construct to map"
    else:
        try:
            # load segments TODO: add idp-based clumping
            segdata = np.loadtxt(seg_file, skiprows=1, dtype=segtype)
        except IOError:
            msg = "\nERROR: could not load segments data"
            run_ref.log(msg)
            print msg
        except StopIteration:
            msg = "\nERROR: could not make map"
            run_ref.log(msg)
            print msg
        else:
            # offset coordinates where desired
            try:
                g_offset = genome['offset']
                if g_offset[0] != 0 or g_offset[1] != 0:
                    q_len = len(load_genbank(scaff_gbk).seq)
                    segdata = offset_q2r_coords(segdata, q_len, g_offset,
                                                segtype)
                # determine whether to flip the query sequence (negative offset)
                if g_offset[1] < 0:
                    q_invert = True
                else:
                    q_invert = False
            except KeyError:
                # no offset configured for this genome
                g_offset = (0, 0)
                q_invert = False
            # generate graphical map
            pairwise_draw(ref_ctg_n, g_name, ref_gbk, scaff_gbk, segdata,
                          map_file, q_invert, g_offset, 'dual', 'dual', 'm',
                          'fct', 'fct', min_size, fct_flags, fct_colors, idpt)
Exemple #8
0
 def get_segs_from_feats(self, feat_type):
     """Collect features of feat_type from self.gbk into self.segs.

     Each appended dict holds 'coords' (start, end), 'strand', a 'name'
     of the form <feat_type>_<n> and a 'note' of the form <start>_<end>.
     """
     feats = [
         feat for feat in load_genbank(self.gbk).features
         if feat.type == feat_type
     ]
     counter = 0
     for feat in feats:  # TODO: there must be a better way to do this !!!
         counter += 1
         # location bounds stringified first to strip fuzzy-position markers
         a = int(str(feat.location.start))
         b = int(str(feat.location.end))
         feat_id = feat_type + '_' + str(counter)
         seg = {
             'coords': (a, b),
             'strand': feat.strand,
             'name': feat_id,
             'note': str(a) + '_' + str(b)
         }
         self.segs.append(seg)
Exemple #9
0
def map_cst_aln(run_ref, ref_gbk, genome, scaff_gbk, segs_root, maps_root,
                segtype, min_size, fct_flags, fct_colors, idpt):
    """Generate map of construct aligned to reference."""
    # set inputs and outputs
    g_name = genome['name']
    ref_ctg_n = run_ref.name
    seg_file = segs_root+g_name+"/"+g_name+"_"+ref_ctg_n+"_segs.txt"
    map_file = maps_root+g_name+"_vs_"+ref_ctg_n+".pdf"
    # start mapping
    try: open(scaff_gbk)
    except IOError:
        print "WARNING: No scaffold construct to map"
    else:
        try:
            # load segments TODO: add idp-based clumping
            segdata = np.loadtxt(seg_file, skiprows=1, dtype=segtype)
        except IOError:
                msg = "\nERROR: could not load segments data"
                run_ref.log(msg)
                print msg
        except StopIteration:
                msg = "\nERROR: could not make map"
                run_ref.log(msg)
                print msg
        else:
            # offset coordinates where desired
            try:
                g_offset = genome['offset']
                if g_offset[0] != 0 or g_offset[1] != 0:
                    q_len = len(load_genbank(scaff_gbk).seq)
                    segdata = offset_q2r_coords(segdata, q_len, g_offset,
                                                segtype)
                # determine whether to flip the query sequence (negative offset)
                if g_offset[1] < 0:
                    q_invert = True
                else:
                    q_invert = False
            except KeyError:
            	g_offset = (0,0)
                q_invert = False
            # generate graphical map
            pairwise_draw(ref_ctg_n, g_name, ref_gbk, scaff_gbk, segdata,
                         map_file, q_invert, g_offset, 'dual', 'dual', 'm',
                         'fct', 'fct', min_size, fct_flags, fct_colors, idpt)
Exemple #10
0
def contig_draw(cName, in_file, out_file, annot_mode, key, fct_flags,
                fct_colors):
    """Draw sequence map of a single contig to file.

    annot_mode -- 'all' counts every CDS feature; any other value counts
    only CDS whose qualifier `key` differs from 'no match'.

    NOTE(review): canvas maths depend on module-level globals (u, hmar,
    pNsize, dBL, vmar, ck_vsp, doLdn, incrT, incrN, dip, dop) and on
    Python 2 division semantics in annot_cnt / 2 -- confirm before any
    Python 3 port.
    """
    # load contig record
    seq_record = load_genbank(in_file)
    ctg_len = len(seq_record.seq)
    feats = seq_record.features
    cds = [
        feature for feature in feats
        if feature.type == 'CDS' or feature.type == 'cds'
    ]
    if annot_mode == 'all':
        annot_cds = [len(cds)]
    else:
        try:
            annot_cds = [
                1 for feature in cds
                if feature.qualifiers.get(key)[0] != 'no match'
            ]
        except TypeError:
            # .get(key) returned None (qualifier missing) -> count nothing
            annot_cds = []
    annot_cnt = sum(annot_cds)
    # calculate main canvas dimensions
    if ctg_len * u < 2000:
        # enforce a minimum drawing width
        seq_len = 2000
    else:
        seq_len = ctg_len * u
    hCan = hmar * 2 + pNsize + seq_len
    vCan = dBL + vmar * 4 + (annot_cnt / 2) * ck_vsp
    transX = hmar + pNsize
    transY = dBL + vmar * 2 + (annot_cnt / 2) * ck_vsp
    ctg_Y = vmar
    # set up main canvas
    canvas = canvasser(hCan, vCan, transX, transY, out_file)
    # draw contig baseline and features
    base_draw(canvas, cName, ctg_len, feats, key, -doLdn, ctg_Y, 0, 'single',
              annot_cnt, None, None, seq_len, annot_mode, fct_flags,
              fct_colors)
    # draw scale
    seq_scale(canvas, (ctg_len * u) - pNsize, incrT, incrN, dip, dop)
    # write to file and finalize the figure
    canvas.showPage()
    canvas.save()
Exemple #11
0
def contig_draw(cName, in_file, out_file, annot_mode, key, fct_flags,
                fct_colors):
    """Draw sequence map of a single contig to file.

    In 'all' annot_mode every CDS feature is counted; otherwise only CDS
    whose qualifier `key` differs from 'no match' contribute to the
    annotation count that sizes the canvas.
    """
    # load contig record
    record = load_genbank(in_file)
    contig_len = len(record.seq)
    all_feats = record.features
    cds_feats = [f for f in all_feats
                 if f.type == 'CDS' or f.type == 'cds']
    # count annotated CDS
    if annot_mode == 'all':
        annot_cnt = len(cds_feats)
    else:
        try:
            annot_cnt = sum(1 for f in cds_feats
                            if f.qualifiers.get(key)[0] != 'no match')
        except TypeError:
            # qualifier missing for some CDS -> count nothing
            annot_cnt = 0
    # calculate main canvas dimensions
    scaled = contig_len*u
    if scaled < 2000:
        seq_len = 2000
    else:
        seq_len = scaled
    hCan = hmar*2 + pNsize + seq_len
    vCan = dBL + vmar*4 + (annot_cnt/2)*ck_vsp
    transX = hmar + pNsize
    transY = dBL + vmar*2 + (annot_cnt/2)*ck_vsp
    ctg_Y = vmar
    # set up main canvas
    page = canvasser(hCan, vCan, transX, transY, out_file)
    # draw contig baseline and features
    base_draw(page, cName, contig_len, all_feats, key, -doLdn, ctg_Y, 0,
              'single', annot_cnt, None, None, seq_len, annot_mode,
              fct_flags, fct_colors)
    # draw scale
    seq_scale(page, scaled-pNsize, incrT, incrN, dip, dop)
    # write to file and finalize the figure
    page.showPage()
    page.save()
Exemple #12
0
def map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size,
                 fct_flags, fct_colors, idpt): 
    """Generate map of reference contig with segment details.

    This provides a comparison of the original reference and the
    re-annotated version.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ori_file = run_ref.file
    ref_maps_root = run_root+run_dirs['ref_map_dir']
    ensure_dir([ref_maps_root])
    gbk_file = run_root+run_dirs['ref_gbk_dir']+ref_n+"_re-annot.gbk"
    map_file = ref_maps_root+ref_n+"_ref.pdf"
    # start mapping
    try:
        # make mock segment, full-length with 100% id
        record = load_genbank(gbk_file)
        length = len(record.seq)
        segdata = [[1, length, 1, length, 100]]
        # deactivate offsetting
        g_offset = (0,0)
        q_invert = False
        # generate graphical map
        pairwise_draw(ref_n+"_ra", ref_n+"_ori", gbk_file, ori_file,
                     segdata, map_file, q_invert, g_offset, 'dual', 'dual',
                     'm', 'fct', 'product', min_size, fct_flags,
                     fct_colors, idpt)
    except IOError:
        msg = "\nERROR: could not load segments data"
        run_ref.log(msg)
        print msg
    except StopIteration:
        msg = "\nERROR: could not make map"
        run_ref.log(msg)
        print msg
Exemple #13
0
def map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size, fct_flags,
                 fct_colors, idpt):
    """Generate map of reference contig with segment details.

    This provides a comparison of the original reference and the
    re-annotated version.

    run_ref -- reference object exposing .name, .file and .log()
    Draws the re-annotated GenBank against the original input file using
    a single mock full-length segment at 100% identity; failures are
    logged via run_ref.log and printed.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ori_file = run_ref.file
    ref_maps_root = run_root + run_dirs['ref_map_dir']
    ensure_dir([ref_maps_root])
    gbk_file = run_root + run_dirs['ref_gbk_dir'] + ref_n + "_re-annot.gbk"
    map_file = ref_maps_root + ref_n + "_ref.pdf"
    # start mapping
    try:
        # make mock segment, full-length with 100% id
        record = load_genbank(gbk_file)
        length = len(record.seq)
        segdata = [[1, length, 1, length, 100]]
        # deactivate offsetting
        g_offset = (0, 0)
        q_invert = False
        # generate graphical map
        pairwise_draw(ref_n + "_ra", ref_n + "_ori", gbk_file, ori_file,
                      segdata, map_file, q_invert, g_offset, 'dual', 'dual',
                      'm', 'fct', 'product', min_size, fct_flags, fct_colors,
                      idpt)
    except IOError:
        msg = "\nERROR: could not load segments data"
        run_ref.log(msg)
        print msg
    except StopIteration:
        msg = "\nERROR: could not make map"
        run_ref.log(msg)
        print msg
Exemple #14
0
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs,
                run_id, timestamp, prot_db_name, project_id):
    """Re-annotate contig and extract reference segments using coordinates.

    ref -- dict describing the reference ('name', 'file', 'input',
    'seg_mode', 'capture', plus mode-specific keys 'chop_size', 'segs'
    or 'feat_type')
    ref_annot_flag -- when true, re-annotate the contig; otherwise the
    original input is used directly (valid only for GenBank input)

    Returns the initialized Reference object with its segments populated.
    """
    # set inputs and outputs
    run_root = r_root_dir+run_id+"/"
    ref_name = ref['name']
    in_file = fixed_dirs['ori_g_dir']+ref['file']
    seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/"
    gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/"
    if ref_annot_flag:
        ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk"
    else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        ref_gbk = in_file
    ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas"
    genome_fas = gen_fas_root+ref_name+"_1.fas"
    report_root = run_root+run_dirs['reports']+ref_name+"/"
    ref_log = report_root+run_id+"_"+ref_name+"_log.txt"
    ensure_dir([seg_out_root, report_root, gen_fas_root])
    print " ", ref_name, "...",
    # initialize run_ref object
    run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'],
                        ref['capture'], ref_fas, ref_gbk, seg_out_root,
                        ref_log)
    # initialize reference log
    cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"]
    open(ref_log, 'w').write(" ".join(cl_header))
    # open record and ensure we have a fasta in the right place
    if not path.exists(ref_fas):
        if run_ref.input == 'fas':
            copyfile(in_file, ref_fas)
        elif run_ref.input == 'gbk':
            record = load_genbank(in_file)
            record.id = ref_name
            write_fasta(ref_fas, record)
        else:
            msg = "ERROR: Input not recognized for "+ref_name
            run_ref.log(msg)
            raise Exception(msg)
    # make a BLAST DB
    make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs)
    copyfile(ref_fas, genome_fas)
    # re-annotate ref contig
    # NOTE(review): annot_ref is called with 5 args here while the
    # annot_ref visible in this file also takes blast_prefs -- confirm
    # which signature is actually imported at this call site
    if ref_annot_flag:
        record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs,
                           project_id)
    else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        record = load_genbank(in_file)
    # load or generate segment definitions
    if run_ref.seg_mode == 'chop':
        run_ref.get_segs_from_chop(len(record.seq), ref['chop_size'])
    elif run_ref.seg_mode == 'list':
        run_ref.get_segs_from_list(ref['segs'])
    elif run_ref.seg_mode == 'feats':
        run_ref.get_segs_from_feats(ref['feat_type'])
    # extract segment sequences
    rec_annot = run_ref.extract_segs_seqs(record, seg_out_root)
    # write re-annotated reference sequence to file
    write_genbank(ref_gbk, rec_annot)
    # report results
    logstring = " ".join([str(len(run_ref.segs)), "segments"])
    print logstring
    run_ref.log(logstring)
    return run_ref
Exemple #15
0
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds):
    """Unpack genome files.

    Here, unpacking means extracting data and producing specific files to
    standardize how the information is made available to downstream analysis.
    Depending on the input file format, different unpacking methods are
    invoked. In all cases, this ensures that for each genome, there is a
    multifasta file of the contigs all together as well as a separate Genbank
    file for each contig.

    Supported input file formats are the following:
    - mfas: Basic whole genome sequence in multifasta file of contigs. This
    can be used to process a finished genome in a single Fasta file as well.
    - cgbk: All contigs concatenated in a single GenBank file (Genoscope,
    French WGS). This can be used to process a finished genome in a single
    GanBank file as well.
    # TODO: provide support for other possible input formats

    Unpacking 'cgbk' genomes involves an initial step to detect occurrences
    of the sequence separator and collect the start and stop coordinates of
    each contig. Each pair of coordinates can then be used to extract the
    contig sequence and create a SeqRecord for that contig, which SeqIO
    normally does when it unpacks multifasta files.

    """
    # set up inputs
    infile = genome['file'] #TODO: make GUI input loader (upstream)
    inpath = fixed_dirs['ori_g_dir']+infile
    g_name = genome['name']
    print " ", g_name, "...",
    # prep output destinations
    mfas_dir = fixed_dirs['mfas_contigs_dir']
    fas_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
    ensure_dir([mfas_dir, fas_dir])
    mfas_file = mfas_dir+g_name+"_contigs.fas"
    records = []
    # select unpacking method
    if genome['input'] is 'fas':
        try: path.exists(inpath) is True
        except ValueError: raise Exception("Bad input file path")
        genome_recs = load_multifasta(inpath)
        # generate GenBank files
        counter = 0
        for rec in genome_recs:
            counter +=1
            ctg_num = str(counter)
            new_id = g_name+"_"+ctg_num  # workaround for long ids
            new_seq = rec.seq
            new_seq.alphabet = generic_dna
            new_rec = SeqRecord(seq=new_seq, id=new_id)
            records.append(new_rec)  # for multifasta output
            fas_file = fas_dir+new_id+".fas"
            write_fasta(fas_file, new_rec)
    elif genome['input'] is 'gbk':
        # load in genome data
        genome_rec = load_genbank(inpath)
        g_string = genome_rec.seq
        # find split coordinates
        coord_pairs = multisplit_finder(g_string, separator)
        # split record
        counter = 0
        for (start, stop) in coord_pairs:
            counter +=1
            ctg_num = str(counter)
            new_record = genome_rec[start:stop]
            new_record.id = g_name+"_"+ctg_num
            records.append(new_record)  # for multifasta output
            fas_file = fas_dir+g_name+"_"+ctg_num+".fas"
            write_fasta(fas_file, new_record)
    else:
        xmsg = "Input file format "+genome['input']+" unspecified/unsupported"
        raise Exception(xmsg)
    print counter, "contigs"
    # write master file
    write_fasta(mfas_file, records)
    # pass records to stats logger
    ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
Exemple #16
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator,
                    genomes, run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    run_ref -- reference object exposing .name and .log()
    prox_D -- proximity value passed to mauver_load2_k0 for clumping
    separator -- spacer sequence written between contigs in the scaffold
    mode -- 'exclude' or 'select': how each genome[mode] ID list is used
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root+g_name+"/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas"
        scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        # stub row; real anchors are inserted at index 0 as they are found
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'),
                                           ('start', 'i4'),
                                           ('end', 'i4'),
                                           ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        # parenthesized IDs in console output mean "skipped"
                        if int(ctg_num) in genome[mode]:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir+ctg_num+".mauve"
            bb_file = mauve_file+".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(anchors_array, 0,
                                          (ctg_num,
                                           anchor_seg['start'],
                                           anchor_seg['end'],
                                           anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1 # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1: # flip record
                        record = record.reverse_complement(id=True, name=True,
                            annotations=True, description=True)
                    ctg_list.append(record)
                else: # workaround for having 0 value leftover from stub
                    pass # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                # record each contig's span in the scaffold as a feature
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try: ctg_num = match.group(1)
                except Exception: ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name+" scaffold from "+ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            except ValueError:
                # id too long for GenBank LOCUS line: truncate and retry
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            print ""
Exemple #17
0
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                  mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/contigs/"
    q_ctgs_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root + g_name + "/"
        mauve_dir = mauve_root + g_name + "/"
        aln_segs_root = segments_root + g_name + "/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir + item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir + ctg_num + ".mauve"
                aln_segs_dir = aln_segs_root + ctg_num + "/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir + ctg_num + "_" + ref_n + "_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile + ".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""
Exemple #18
0
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                     max_size, chop_mode, mtype, mauve_exec):
    """Align constructs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/constructs/"
    segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/constructs/"
    scaff_root = run_root + run_dirs['scaffolds_dir'] + ref_n + "/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(
        ["\n\n# Align scaffold constructs to reference @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        scaff_gbk = scaff_root + g_name + "_" + ref_n + "_scaffold.gbk"
        file_list = (ref_ctg_file, scaff_gbk)
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root + g_name + "/"
        aln_segs_dir = segments_root + g_name + "/"
        ensure_dir([mauve_dir, aln_segs_dir])
        mauve_outfile = mauve_dir + g_name + "_" + ref_n + ".mauve"
        segfile = aln_segs_dir + g_name + "_" + ref_n + "_segs.txt"
        # abort if the reference file is not found
        try:
            open(ref_ctg_file, 'r')
        except IOError:
            msg = "ERROR: Reference file not found"
            print msg
            run_ref.log(msg)
            raise
        # abort if there is no scaffold construct
        try:
            open(scaff_gbk, 'r')
        except IOError:
            msg = "WARNING: No scaffold construct to align"
            print msg
            run_ref.log(msg)
        else:
            # prep segments file
            open(segfile, 'w').write('')
            # purge any pre-existing sslist file
            sslist_file = scaff_gbk + ".sslist"
            if os.path.isfile(sslist_file):
                try:
                    os.remove(sslist_file)
                except Exception:
                    raise
            # do Mauve alignment
            align_mauve(file_list, mauve_outfile, mauve_exec)
            try:
                # parse Mauve output (without initial clumping)
                coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype)
                print len(coords), '->',
                logstring = "".join(["\t", str(len(coords))])
                run_ref.log(logstring)
                # chop segments that are too long
                chop_array = chop_rows(coords, max_size, chop_mode, mtype)
                print len(chop_array), 'segments <', max_size, 'bp',
                logstring = "".join(["\t", str(len(chop_array))])
                run_ref.log(logstring)
                # make detailed pairwise alignments of the segments
                ref_rec = load_genbank(ref_ctg_file)
                query_rec = load_genbank(scaff_gbk)
                id = iter_align(chop_array, ref_rec, query_rec, aln_segs_dir,
                                segfile)
                print "@", id, "% id. overall"
                logstring = "".join(["\t", str(id)])
                run_ref.log(logstring)
            except IOError:
                msg = "\nERROR: Mauve alignment failed"
                run_ref.log(msg)
                print msg
Exemple #19
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes,
                    run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    to determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/"
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root + g_name + "/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root + g_name + "/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas"
        scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        # all-zeros stub row; real anchors get inserted in front of it below
        anchors_array = np.zeros(1,
                                 dtype=[('ctg', 'i4'), ('start', 'i4'),
                                        ('end', 'i4'), ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    # contigs listed in genome['exclude'] are skipped;
                    # parenthesized IDs in console output mark skipped contigs
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        # no exclusion list in the config: keep every contig
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

                elif mode == "select":
                    # only contigs listed in genome['select'] are kept
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        # no selection list in the config: keep every contig
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir + ctg_num + ".mauve"
            bb_file = mauve_file + ".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                # prepend the anchor row; the zero stub stays at the tail
                anchors_array = np.insert(
                    anchors_array, 0,
                    (ctg_num, anchor_seg['start'], anchor_seg['end'],
                     anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1  # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir + g_name + "_" + str(
                        ctg_num) + ".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1:  # flip record
                        record = record.reverse_complement(id=True,
                                                           name=True,
                                                           annotations=True,
                                                           description=True)
                    ctg_list.append(record)
                else:  # workaround for having 0 value leftover from stub
                    pass  # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            # assemble one record: contigs joined by the separator "bumper",
            # each contig span annotated as a 'contig' feature
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                # recover the contig number from the record id suffix
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try:
                    ctg_num = match.group(1)
                except Exception:
                    ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name + " scaffold from " + ref_n
            # NOTE(review): the [:-100] trim assumes the separator bumper is
            # 100 bp long — confirm against the configured separator
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            except ValueError:
                # retry with a truncated id if the full name is rejected
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            print ""
Exemple #20
0
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs,
                     genomes, max_size, chop_mode, mtype, mauve_exec):
    """Align constructs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/constructs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/constructs/"
    scaff_root = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align scaffold constructs to reference @",
                         timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        scaff_gbk = scaff_root+g_name+"_"+ref_n+"_scaffold.gbk"
        file_list = (ref_ctg_file, scaff_gbk)
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_dir = segments_root+g_name+"/"
        ensure_dir([mauve_dir, aln_segs_dir])
        mauve_outfile = mauve_dir+g_name+"_"+ref_n+".mauve"
        segfile = aln_segs_dir+g_name+"_"+ref_n+"_segs.txt"
        # abort if the reference file is not found
        try: open(ref_ctg_file, 'r')
        except IOError:
            msg = "ERROR: Reference file not found"
            print msg
            run_ref.log(msg)
            raise
        # abort if there is no scaffold construct
        try: open(scaff_gbk, 'r')
        except IOError:
            msg = "WARNING: No scaffold construct to align"
            print msg
            run_ref.log(msg)
        else:
            # prep segments file
            open(segfile, 'w').write('')
            # purge any pre-existing sslist file
            sslist_file = scaff_gbk+".sslist"
            if os.path.isfile(sslist_file):
                try: os.remove(sslist_file)
                except Exception: raise
            # do Mauve alignment
            align_mauve(file_list, mauve_outfile, mauve_exec)
            try:
                # parse Mauve output (without initial clumping)
                coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype)
                print len(coords), '->',
                logstring = "".join(["\t", str(len(coords))])
                run_ref.log(logstring)
                # chop segments that are too long
                chop_array = chop_rows(coords, max_size, chop_mode, mtype)
                print len(chop_array), 'segments <', max_size, 'bp',
                logstring = "".join(["\t", str(len(chop_array))])
                run_ref.log(logstring)
                # make detailed pairwise alignments of the segments
                ref_rec = load_genbank(ref_ctg_file)
                query_rec = load_genbank(scaff_gbk)
                id = iter_align(chop_array, ref_rec, query_rec,
                                aln_segs_dir, segfile)
                print "@", id, "% id. overall"
                logstring = "".join(["\t", str(id)])
                run_ref.log(logstring)
            except IOError:
                msg = "\nERROR: Mauve alignment failed"
                run_ref.log(msg)
                print msg
Exemple #21
0
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs,
                  genomes, mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/"
    q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root+g_name+"/"
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_root = segments_root+g_name+"/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir+item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir+ctg_num+".mauve"
                aln_segs_dir = aln_segs_root+ctg_num+"/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile+".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""
Exemple #22
0
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id,
                timestamp, prot_db_name, project_id):
    """Re-annotate contig and extract reference segments using coordinates."""
    # set inputs and outputs
    run_root = r_root_dir + run_id + "/"
    ref_name = ref['name']
    in_file = fixed_dirs['ori_g_dir'] + ref['file']
    seg_out_root = run_root + run_dirs['ref_seg_dir'] + ref_name + "/"
    gen_fas_root = fixed_dirs['fas_contigs_dir'] + ref_name + "/"
    if ref_annot_flag:
        ref_gbk = run_root + run_dirs[
            'ref_gbk_dir'] + ref_name + "_re-annot.gbk"
    else:  ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        ref_gbk = in_file
    ref_fas = run_root + run_dirs['ref_fas_dir'] + ref_name + ".fas"
    genome_fas = gen_fas_root + ref_name + "_1.fas"
    report_root = run_root + run_dirs['reports'] + ref_name + "/"
    ref_log = report_root + run_id + "_" + ref_name + "_log.txt"
    ensure_dir([seg_out_root, report_root, gen_fas_root])
    print " ", ref_name, "...",
    # initialize run_ref object
    run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'],
                        ref['capture'], ref_fas, ref_gbk, seg_out_root,
                        ref_log)
    # initialize reference log
    cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"]
    open(ref_log, 'w').write(" ".join(cl_header))
    # open record and ensure we have a fasta in the right place
    if not path.exists(ref_fas):
        if run_ref.input == 'fas':
            copyfile(in_file, ref_fas)
        elif run_ref.input == 'gbk':
            record = load_genbank(in_file)
            record.id = ref_name
            write_fasta(ref_fas, record)
        else:
            msg = "ERROR: Input not recognized for " + ref_name
            run_ref.log(msg)
            raise Exception(msg)
    # make a BLAST DB
    make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs)
    copyfile(ref_fas, genome_fas)
    # re-annotate ref contig
    if ref_annot_flag:
        record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs,
                           project_id)
    else:  ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        record = load_genbank(in_file)
    # load or generate segment definitions
    if run_ref.seg_mode == 'chop':
        run_ref.get_segs_from_chop(len(record.seq), ref['chop_size'])
    elif run_ref.seg_mode == 'list':
        run_ref.get_segs_from_list(ref['segs'])
    elif run_ref.seg_mode == 'feats':
        run_ref.get_segs_from_feats(ref['feat_type'])
    # extract segment sequences
    rec_annot = run_ref.extract_segs_seqs(record, seg_out_root)
    # write re-annotated reference sequence to file
    write_genbank(ref_gbk, rec_annot)
    # report results
    logstring = " ".join([str(len(run_ref.segs)), "segments"])
    print logstring
    run_ref.log(logstring)
    return run_ref
Exemple #23
0
def pairwise_draw(ref, query, segs, map_file, mode1, mode2, annot_mode,
                  key1, key2, idpt, fct_flags, fct_colors, min_size):
    """Draw pairwise alignment map with similarity shading.

    Loads the reference and query GenBank records, sizes the canvas from
    the longer sequence and the number of annotated CDS features, draws
    both baselines with their features, then shades each alignment
    segment by its percent identity (splitting segments whose
    coordinates indicate a wrap). Writes the finished figure to
    map_file.
    """
    # load ref record
    ref_record = load_genbank(ref.gbk)
    ref_feat = ref_record.features
    ref_cds = [feature for feature in ref_feat
               if feature.type == 'CDS' or feature.type == 'cds']
    if annot_mode != 'all':
        # count only informatively-annotated CDS; a missing qualifier makes
        # .get(key1) return None, whose subscript raises TypeError -> count 0
        try:
            ref_annot_cds = [1 for cds in ref_cds
                             if cds.qualifiers.get(key1)[0] !=
                                'hypothetical protein' and \
                                cds.qualifiers.get(key1)[0] !=
                                'no match']
        except TypeError:
            ref_annot_cds = []
        ref_annot_cnt = sum(ref_annot_cds)
    else:
        ref_annot_cnt = len(ref_cds)
    # load query record
    query_record = load_genbank(query.gbk)
    if query.invert:
        query_record = query_record.reverse_complement()
    q_feat = query_record.features
    query_cds = [feature for feature in q_feat
                 if feature.type == 'CDS' or feature.type == 'cds']
    if annot_mode != 'all':
        # same counting rule as for the reference
        try:
            query_annot_cds = [1 for cds in query_cds
                               if cds.qualifiers.get(key2)[0] !=
                                'hypothetical protein' and \
                                cds.qualifiers.get(key2)[0] !=
                                'no match']
        except TypeError:
            query_annot_cds = []
        query_annot_cnt = sum(query_annot_cds)
    else:
        query_annot_cnt = len(query_cds)
    # calculate main canvas dimensions - horizontal
    if ref.len+ref.nudge > query.len:
        ctg_len = ref.len+ref.nudge
    else:
        ctg_len = query.len
    if ctg_len*u < 2000:
        seq_len = 2000  # enforce a minimum drawable width
    else:
        seq_len = ctg_len*u
    hCan = hmar*2 + pNsize + seq_len
    # calculate main canvas dimensions - vertical
    if mode1 == 'single' and mode2 == 'n':
        annot_cnt = ref_annot_cnt
        annot_len = annot_cnt/2
    else:
        annot_cnt = max(ref_annot_cnt, query_annot_cnt)
        annot_len = annot_cnt
    vCan = dBL + vmar*6 + annot_len*ck_vsp
    transX = hmar + pNsize
    transY = dBL + vmar*1.8 + annot_len*ck_vsp
    ref_Y = vmar*2.8
    query_Y = vmar
    # set up main canvas
    m_canvas = canvasser(hCan, vCan, transX, transY, map_file)
    # draw scale
    seq_scale(m_canvas, (ctg_len*u)-pNsize, 0, incrT, incrN, dip, dop )
    # draw shading legend
    heatkey(m_canvas, -pNsize, -pNsize/2, idpt)
    # draw ref baseline and features
    base_draw(m_canvas, ref, ref_feat, key1, doLup, ref_Y, 0,
              mode1, annot_cnt, seq_len, annot_mode, 'top', fct_flags, fct_colors)
    # draw query baseline and features
    base_draw(m_canvas, query, q_feat, key2, -doLdn, query_Y, seq_len/2,
              mode2, annot_cnt, seq_len, annot_mode, 'low', fct_flags, fct_colors)
    # draw pairwise similarity shading (the original no-op
    # `try ... except TypeError: raise` wrapper was removed: it only
    # re-raised the exception unchanged)
    for xa, xb, xc, xd, idp in segs:
        # evaluate color shading category
        sh_color = HexColor(simcolor(idp, idpt))
        # check for split
        if abs(xa) > abs(xb) or abs(xc) > abs(xd):
            new_segpairs = shade_split(xa, xb, xc, xd, ref, query)
            for xa1, xb1, xc1, xd1 in new_segpairs:
                # draw shading
                shadowfax(m_canvas, xa1, xb1, xc1, xd1, ref_Y, query_Y, sh_color, min_size)
        else:
            # draw shading
            shadowfax(m_canvas, xa, xb, xc, xd, ref_Y, query_Y, sh_color, min_size)
    # write to file and finalize the figure
    m_canvas.showPage()
    m_canvas.save()
Exemple #24
0
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds):
    """Unpack genome files.

    Here, unpacking means extracting data and producing specific files to
    standardize how the information is made available to downstream analysis.
    Depending on the input file format, different unpacking methods are
    invoked. In all cases, this ensures that for each genome, there is a
    multifasta file of the contigs all together as well as a separate Genbank
    file for each contig.

    Supported input file formats are the following:
    - mfas: Basic whole genome sequence in multifasta file of contigs. This
    can be used to process a finished genome in a single Fasta file as well.
    - cgbk: All contigs concatenated in a single GenBank file (Genoscope,
    French WGS). This can be used to process a finished genome in a single
    GanBank file as well.
    # TODO: provide support for other possible input formats

    Unpacking 'cgbk' genomes involves an initial step to detect occurrences
    of the sequence separator and collect the start and stop coordinates of
    each contig. Each pair of coordinates can then be used to extract the
    contig sequence and create a SeqRecord for that contig, which SeqIO
    normally does when it unpacks multifasta files.

    """
    # set up inputs
    infile = genome['file']  #TODO: make GUI input loader (upstream)
    inpath = fixed_dirs['ori_g_dir'] + infile
    g_name = genome['name']
    print " ", g_name, "...",
    # prep output destinations
    mfas_dir = fixed_dirs['mfas_contigs_dir']
    fas_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/"
    ensure_dir([mfas_dir, fas_dir])
    mfas_file = mfas_dir + g_name + "_contigs.fas"
    records = []
    # select unpacking method
    if genome['input'] is 'fas':
        try:
            path.exists(inpath) is True
        except ValueError:
            raise Exception("Bad input file path")
        genome_recs = load_multifasta(inpath)
        # generate GenBank files
        counter = 0
        for rec in genome_recs:
            counter += 1
            ctg_num = str(counter)
            new_id = g_name + "_" + ctg_num  # workaround for long ids
            new_seq = rec.seq
            new_seq.alphabet = generic_dna
            new_rec = SeqRecord(seq=new_seq, id=new_id)
            records.append(new_rec)  # for multifasta output
            fas_file = fas_dir + new_id + ".fas"
            write_fasta(fas_file, new_rec)
    elif genome['input'] is 'gbk':
        # load in genome data
        genome_rec = load_genbank(inpath)
        g_string = genome_rec.seq
        # find split coordinates
        coord_pairs = multisplit_finder(g_string, separator)
        # split record
        counter = 0
        for (start, stop) in coord_pairs:
            counter += 1
            ctg_num = str(counter)
            new_record = genome_rec[start:stop]
            new_record.id = g_name + "_" + ctg_num
            records.append(new_record)  # for multifasta output
            fas_file = fas_dir + g_name + "_" + ctg_num + ".fas"
            write_fasta(fas_file, new_record)
    else:
        xmsg = "Input file format " + genome[
            'input'] + " unspecified/unsupported"
        raise Exception(xmsg)
    print counter, "contigs"
    # write master file
    write_fasta(mfas_file, records)
    # pass records to stats logger
    ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
Exemple #25
0
def multi_draw(g_pairs, segdata_list, mapfile, idpt, fct_flags, fct_colors, min_size):
    """Draw multiple alignment map with similarity shading."""
    print "Lookin\' good!"
    # compile info
    lengths = [g_pairs[0][0].len+g_pairs[0][0].nudge]
    g_to_draw = [g_pairs[0][0]]
    for ref, query in g_pairs:
        lengths.append(query.len+query.nudge)
        g_to_draw.append(query)
    max_len = max(lengths)
    # calculate main canvas dimensions - horizontal
    if max_len*u < 2000:
        seq_len = 2000
    else:
        seq_len = max_len*u
    hCan = hmar*4 + pNsize + seq_len
    # calculate main canvas dimensions - vertical
    vCan = dBL*len(g_pairs) + vmar*4
    transX = hmar + pNsize
    transY = dBL*len(g_pairs)
    init_Y = vmar*2
    # set up main canvas
    m_canvas = canvasser(hCan, vCan, transX, transY, mapfile)
    # draw scale (max_len*u)-pNsize, hmar
    seq_scale(m_canvas, 2*hCan/3, -vmar*2, incrT, incrN, dip, dop)
    # draw shading legend
    heatkey(m_canvas, hCan-hmar*5, init_Y+vmar, idpt)
    # draw ref baseline and features
    counter = 0
    for genome in g_to_draw:
        g_record = load_genbank(genome.gbk)
        g_feat = g_record.features
        g_cds = [feature for feature in g_feat
                   if feature.type == 'CDS' or feature.type == 'cds']
        ref_Y = init_Y-dBL*counter
        base_draw(m_canvas, genome, g_cds, '', doLup, ref_Y, 0, 'n', 0,
                  seq_len, 'n', 'n', fct_flags, fct_colors)
        counter +=1
    counter = 0
    for ref, query in g_pairs:
        ref_Y = init_Y-dBL*counter
        query_Y = init_Y-dBL*(counter+1)
        # draw pairwise similarity shading
        try:                                  # TODO: adapt Y
            for xa, xb, xc, xd, idp in segdata_list[counter]:
                # evaluate color shading category
                sh_color = HexColor(simcolor(idp, idpt))
                # check for split
                if abs(xa) > abs(xb) or abs(xc) > abs(xd):
                    new_segpairs = shade_split(xa, xb, xc, xd, ref, query)
                    for xa1, xb1, xc1, xd1 in new_segpairs:
                        # draw shading
                        shadowfax(m_canvas, xa1, xb1, xc1, xd1, ref_Y,
                                  query_Y, sh_color, min_size)
                else:
                    # draw shading
                    shadowfax(m_canvas, xa, xb, xc, xd, ref_Y, query_Y,
                              sh_color, min_size)
            counter +=1
        except TypeError:
            pass
    # write to file and finalize the figure
    m_canvas.showPage()
    m_canvas.save()