Ejemplo n.º 1
0
def annot_ref(ref_name, ctg_fas, prot_db_name, fixed_dirs, project_id,
              blast_prefs):
    """Annotate the reference contig, reusing a stored GenBank file if any.

    When the reference GenBank file is already present under the
    'gbk_contigs_dir' tree it is simply loaded. Otherwise the contig is
    passed to ``annot_ctg``, the resulting record is labelled and written
    to that location.

    Args:
        ref_name: name of the reference, used for directory and file names.
        ctg_fas: path of the reference FastA file (handed to ``annot_ctg``
            as both of its first two arguments).
        prot_db_name: name of the protein database, relative to
            ``fixed_dirs['ref_dbs_dir']``.
        fixed_dirs: mapping of the fixed directory paths.
        project_id: project identifier written into the record's dbxrefs.
        blast_prefs: passed through to ``annot_ctg``.

    Returns:
        The annotated (or loaded) record.
    """
    # protein database used for the functional assignment
    cog_db = fixed_dirs['ref_dbs_dir'] + prot_db_name
    # per-reference output directories
    ref_subdir = ref_name + "/"
    gbk_dir = fixed_dirs['gbk_contigs_dir'] + ref_subdir
    cds_dir = fixed_dirs['ctg_cds_dir'] + ref_subdir
    prot_dir = fixed_dirs['ctg_prot_dir'] + ref_subdir
    blast_dir = fixed_dirs['ctg_blast_dir'] + ref_subdir
    trn_dir = fixed_dirs['annot_trn_dir']
    ensure_dir([gbk_dir, cds_dir, prot_dir, blast_dir, trn_dir])
    # the reference is handled as a single contig numbered 1
    tag = ref_name + "_1"
    training_file = trn_dir + ref_name + "_annot.trn"
    stored_gbk = gbk_dir + tag + ".gbk"
    cds_gbk = cds_dir + tag + "_cds.gbk"
    prot_fas = prot_dir + tag + "_aa.fas"
    blast_xml = blast_dir + tag + ".xml"
    # a zero-byte BLAST report is deleted before going any further
    if path.exists(blast_xml) and os.stat(blast_xml).st_size == 0:
        os.remove(blast_xml)
    # nothing to compute when the annotated record is already on disk
    if path.exists(stored_gbk):
        return load_genbank(stored_gbk)
    record = annot_ctg(ctg_fas, ctg_fas, cds_gbk, prot_fas, training_file,
                       cog_db, blast_xml, tag, blast_prefs)
    record.description = ref_name + "_re-annotated"
    record.name = tag
    record.dbxrefs = ["Project: " + project_id + "/" + ref_name
                      + "-like backbones"]
    record.seq.alphabet = generic_dna
    write_genbank(stored_gbk, record)
    return record
Ejemplo n.º 2
0
def fas2gbk(fas_file):
    """Convert a FastA file to Genbank format.

    The output path is the input path with everything from the first
    '.fas' of the file name onwards replaced by '.gbk'. Only the file name
    is inspected, so a '.fas' occurring in a directory name is left alone.
    When the file name contains no '.fas', its extension (if any) is
    replaced instead of silently chopping off the last character.

    Args:
        fas_file: path of the FastA file to convert.

    Returns:
        The path of the GenBank file that was written.
    """
    import os.path  # local import: only needed to derive the output name

    record = load_fasta(fas_file)
    base_name = os.path.basename(fas_file)
    # keep the directory part exactly as given by the caller
    folder = fas_file[:len(fas_file) - len(base_name)]
    cut = base_name.find('.fas')
    if cut > 0:
        stem = base_name[:cut]
    else:
        # no '.fas' marker (or a bare '.fas' name): fall back to splitext
        stem = os.path.splitext(base_name)[0]
    gbk_file = folder + stem + '.gbk'
    record.seq.alphabet = generic_dna
    write_genbank(gbk_file, record)
    return gbk_file
Ejemplo n.º 3
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes,
                    run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/"
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root + g_name + "/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root + g_name + "/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas"
        scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1,
                                 dtype=[('ctg', 'i4'), ('start', 'i4'),
                                        ('end', 'i4'), ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir + ctg_num + ".mauve"
            bb_file = mauve_file + ".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(
                    anchors_array, 0,
                    (ctg_num, anchor_seg['start'], anchor_seg['end'],
                     anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1  # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir + g_name + "_" + str(
                        ctg_num) + ".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1:  # flip record
                        record = record.reverse_complement(id=True,
                                                           name=True,
                                                           annotations=True,
                                                           description=True)
                    ctg_list.append(record)
                else:  # workaround for having 0 value leftover from stub
                    pass  # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try:
                    ctg_num = match.group(1)
                except Exception:
                    ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name + " scaffold from " + ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            print ""
Ejemplo n.º 4
0
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id,
                timestamp, prot_db_name, project_id):
    """Re-annotate contig and extract reference segments using coordinates."""
    # set inputs and outputs
    run_root = r_root_dir + run_id + "/"
    ref_name = ref['name']
    in_file = fixed_dirs['ori_g_dir'] + ref['file']
    seg_out_root = run_root + run_dirs['ref_seg_dir'] + ref_name + "/"
    gen_fas_root = fixed_dirs['fas_contigs_dir'] + ref_name + "/"
    if ref_annot_flag:
        ref_gbk = run_root + run_dirs[
            'ref_gbk_dir'] + ref_name + "_re-annot.gbk"
    else:  ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        ref_gbk = in_file
    ref_fas = run_root + run_dirs['ref_fas_dir'] + ref_name + ".fas"
    genome_fas = gen_fas_root + ref_name + "_1.fas"
    report_root = run_root + run_dirs['reports'] + ref_name + "/"
    ref_log = report_root + run_id + "_" + ref_name + "_log.txt"
    ensure_dir([seg_out_root, report_root, gen_fas_root])
    print " ", ref_name, "...",
    # initialize run_ref object
    run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'],
                        ref['capture'], ref_fas, ref_gbk, seg_out_root,
                        ref_log)
    # initialize reference log
    cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"]
    open(ref_log, 'w').write(" ".join(cl_header))
    # open record and ensure we have a fasta in the right place
    if not path.exists(ref_fas):
        if run_ref.input == 'fas':
            copyfile(in_file, ref_fas)
        elif run_ref.input == 'gbk':
            record = load_genbank(in_file)
            record.id = ref_name
            write_fasta(ref_fas, record)
        else:
            msg = "ERROR: Input not recognized for " + ref_name
            run_ref.log(msg)
            raise Exception(msg)
    # make a BLAST DB
    make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs)
    copyfile(ref_fas, genome_fas)
    # re-annotate ref contig
    if ref_annot_flag:
        record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs,
                           project_id)
    else:  ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        record = load_genbank(in_file)
    # load or generate segment definitions
    if run_ref.seg_mode == 'chop':
        run_ref.get_segs_from_chop(len(record.seq), ref['chop_size'])
    elif run_ref.seg_mode == 'list':
        run_ref.get_segs_from_list(ref['segs'])
    elif run_ref.seg_mode == 'feats':
        run_ref.get_segs_from_feats(ref['feat_type'])
    # extract segment sequences
    rec_annot = run_ref.extract_segs_seqs(record, seg_out_root)
    # write re-annotated reference sequence to file
    write_genbank(ref_gbk, rec_annot)
    # report results
    logstring = " ".join([str(len(run_ref.segs)), "segments"])
    print logstring
    run_ref.log(logstring)
    return run_ref
Ejemplo n.º 5
0
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs,
                run_id, timestamp, prot_db_name, project_id):
    """Re-annotate contig and extract reference segments using coordinates."""
    # set inputs and outputs
    run_root = r_root_dir+run_id+"/"
    ref_name = ref['name']
    in_file = fixed_dirs['ori_g_dir']+ref['file']
    seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/"
    gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/"
    if ref_annot_flag:
        ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk"
    else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        ref_gbk = in_file
    ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas"
    genome_fas = gen_fas_root+ref_name+"_1.fas"
    report_root = run_root+run_dirs['reports']+ref_name+"/"
    ref_log = report_root+run_id+"_"+ref_name+"_log.txt"
    ensure_dir([seg_out_root, report_root, gen_fas_root])
    print " ", ref_name, "...",
    # initialize run_ref object
    run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'],
                        ref['capture'], ref_fas, ref_gbk, seg_out_root,
                        ref_log)
    # initialize reference log
    cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"]
    open(ref_log, 'w').write(" ".join(cl_header))
    # open record and ensure we have a fasta in the right place
    if not path.exists(ref_fas):
        if run_ref.input == 'fas':
            copyfile(in_file, ref_fas)
        elif run_ref.input == 'gbk':
            record = load_genbank(in_file)
            record.id = ref_name
            write_fasta(ref_fas, record)
        else:
            msg = "ERROR: Input not recognized for "+ref_name
            run_ref.log(msg)
            raise Exception(msg)
    # make a BLAST DB
    make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs)
    copyfile(ref_fas, genome_fas)
    # re-annotate ref contig
    if ref_annot_flag:
        record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs,
                           project_id)
    else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        record = load_genbank(in_file)
    # load or generate segment definitions
    if run_ref.seg_mode == 'chop':
        run_ref.get_segs_from_chop(len(record.seq), ref['chop_size'])
    elif run_ref.seg_mode == 'list':
        run_ref.get_segs_from_list(ref['segs'])
    elif run_ref.seg_mode == 'feats':
        run_ref.get_segs_from_feats(ref['feat_type'])
    # extract segment sequences
    rec_annot = run_ref.extract_segs_seqs(record, seg_out_root)
    # write re-annotated reference sequence to file
    write_genbank(ref_gbk, rec_annot)
    # report results
    logstring = " ".join([str(len(run_ref.segs)), "segments"])
    print logstring
    run_ref.log(logstring)
    return run_ref
Ejemplo n.º 6
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator,
                    genomes, run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root+g_name+"/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas"
        scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'),
                                           ('start', 'i4'),
                                           ('end', 'i4'),
                                           ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir+ctg_num+".mauve"
            bb_file = mauve_file+".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(anchors_array, 0,
                                          (ctg_num,
                                           anchor_seg['start'],
                                           anchor_seg['end'],
                                           anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1 # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1: # flip record
                        record = record.reverse_complement(id=True, name=True,
                            annotations=True, description=True)
                    ctg_list.append(record)
                else: # workaround for having 0 value leftover from stub
                    pass # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try: ctg_num = match.group(1)
                except Exception: ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name+" scaffold from "+ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            print ""
Ejemplo n.º 7
0
def annot_genome_contigs(run_ref, prot_db_name, fixed_dirs, r_root_dir,
                         run_id, run_dirs, genomes, project_id, timestamp,
                         blast_prefs): 
    """Annotate genome contigs (predict ORFs and assign function)."""
    # locate the COG database
    prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name
    # TODO: add other DB / pfams?
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    fas_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ctg_cds_root = fixed_dirs['ctg_cds_dir']
    ctg_prot_root = fixed_dirs['ctg_prot_dir']
    ctg_blast_root = fixed_dirs['ctg_blast_dir']
    g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir']
    r_gbk_ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    annot_trn_root = fixed_dirs['annot_trn_dir']
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Annotate genome contigs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        fas_ctgs_dir = fas_ctgs_root+g_name+"/"
        g_file = fixed_dirs['ori_g_dir']+genome['file']
        print '\t', g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set output files
        training_file = annot_trn_root+g_name+"_annot.trn"
        # set output dirs
        ctg_cds_dir = ctg_cds_root+g_name+"/"
        ctg_prot_dir = ctg_prot_root+g_name+"/"
        ctg_blast_dir = ctg_blast_root+g_name+"/"
        g_gbk_ctgs_dir = g_gbk_ctgs_root+g_name+"/"
        r_gbk_ctgs_dir = r_gbk_ctgs_root+g_name+"/"
        ensure_dir([ctg_cds_dir, ctg_prot_dir, ctg_blast_dir,
                    g_gbk_ctgs_dir, r_gbk_ctgs_dir])
        # list fasta files in matches directory
        dir_contents = listdir(fas_ctgs_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                ctg_fas = fas_ctgs_dir+item
                g_ctg_gbk = g_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk"
                r_ctg_gbk = r_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk"
                annot_gbk = ctg_cds_dir+g_name+"_"+ctg_num+"_cds.gbk"
                annot_aa = ctg_prot_dir+g_name+"_"+ctg_num+"_aa.fas"
                blast_out = ctg_blast_dir+g_name+"_"+ctg_num+".xml"
                if path.exists(blast_out) and os.stat(blast_out)[6]==0:
                    os.remove(blast_out)
                if not path.exists(r_ctg_gbk):
                    if not path.exists(g_ctg_gbk):
                        l_tag_base = g_name+"_"+ctg_num
                        record = annot_ctg(g_file, ctg_fas, annot_gbk,
                                           annot_aa, training_file, prot_db,
                                           blast_out, l_tag_base, blast_prefs)
                        record.description = g_name+"_"+ctg_num
                        record.name = g_name+"_"+ctg_num
                        record.dbxrefs = ["Project: "+project_id+"/"+ref_n
                                          +"-like backbones"]
                        record.seq.alphabet = generic_dna
                        write_genbank(g_ctg_gbk, record)
                    copyfile(g_ctg_gbk, r_ctg_gbk)
        print ""
Ejemplo n.º 8
0
def batch_contig_annot(dataset):
    """Extract and annotate contigs."""
    # identify dataset contig file
    contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa'
    # locate the COG database
    cog_db = dirs['blast_db_dir']+'Cog_LE/Cog'
    # make the training file
    training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn'
    #train_prodigal(contigs_file, training_file)
    # set output dirs
    fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/'
    gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/'
    aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/'
    blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/'
    solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/'
    maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/'
    ensure_dir(fas_out_dir)
    ensure_dir(gbk_out_dir)
    ensure_dir(aa_out_dir)
    ensure_dir(blast_out_dir)
    ensure_dir(solid_out_dir)
    # set phage hit collector
    contig_hits = {}
    sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                   +dataset['f_nick']+'_kw_hits.html'
    all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                    +dataset['f_nick']+'_all_hits.html'
    sp_hit_list_handle = open(sp_hit_list, 'w')
    all_hit_list_handle = open(all_hit_list, 'w')
    sp_hit_list_handle.write("<ul>")
    all_hit_list_handle.write("<ul>")
    # load all contigs
    contigs_list = load_multifasta(contigs_file)
    # cycle through contigs
    ctg_count = 0
    gene_count = 0
    for contig in contigs_list:
        ctg_count +=1
        # use regex to acquire relevant record ID info
        pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)')
        match = pattern.match(contig.id)
        nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3)
        contig.id = nick
        fasta_out = fas_out_dir+nick+'.fas'
        # write record to file
        write_fasta(fasta_out, contig)
        # create contig entry in dict
        contig_hits[nick] = []
        # run the annotation
        annot_gbk = gbk_out_dir+nick+'.gbk'
        annot_aa = aa_out_dir+nick+'.fas'
        #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file)
        # blast the amino acids against COG
        print '\tblasting', dataset['f_nick'], nick
        blast_out = blast_out_dir+nick+'.xml'
        if path.isfile(blast_out):
            print "\t\talready blasted"
        else:
            local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs)
        # collect best hits
        rec_cogs = collect_cogs(blast_out)
        map_file = maps_out_dir+nick+'.pdf'
        # consolidate annotated genbank file
        record = load_fasta(fasta_out)
        aa_defs = load_multifasta(annot_aa)
        features = []
        counter = 1
        ctg_flag_1 = 0
        ctg_flag_2 = 0
        for protein in aa_defs:
            gene_count +=1
            # get feature details from description line
            # necessary because the prodigal output is not parser-friendly
            pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)')
            match = pattern.match(protein.description)
            start_pos = int(match.group(1))
            end_pos = int(match.group(2))
            strand_pos = int(match.group(3))
            feat_loc = FeatureLocation(start_pos, end_pos)
            annotation = rec_cogs['Query_'+str(counter)]
            if ctg_flag_1 is 0:
                all_hit_list_handle.write("</ul><br><a href='"
                                          +"../../../../"
                                          +map_file
                                          +"'>Contig "
                                          +nick+"</a><ul>")
                ctg_flag_1 = 1
            all_hit_list_handle.write("<li>"+str(counter)
                                            +'. '+annotation+"</li>")
            # detect phage content in annotation
            phi_pattern = re.compile(r".+(COG\d+).+"
                                      "(phage|capsid|muramidase|tail|"
                                      "replication|helicase|polymerase|"
                                      "integrase|recombinase"
                                      "suppressor|hydrolase|transposase).+",
                                     re.IGNORECASE)
            phi_match = phi_pattern.match(annotation)
            if phi_match:
                hit_flag = 'on'
                hit_dict = {'CDS': counter,
                            'annot': annotation,
                            'COGs': phi_match.group}
                contig_hits[nick].append(hit_dict)
                # write out to summary file
                if ctg_flag_2 is 0:
                    sp_hit_list_handle.write("</ul><br><a href='"
                                             +"../../../../"
                                             +map_file
                                             +"'>Contig "
                                             +nick+"</a><ul>")
                    ctg_flag_2 = 1
                sp_hit_list_handle.write("<li>"+str(counter)
                                          +'. '+annotation+"</li>")
            else:
                hit_flag = 'off'
            # consolidation feature annotations
            quals = {'note': protein.description,
                     'fct': annotation,
                     'flag': hit_flag}
            feature = SeqFeature(location=feat_loc,
                                 strand=strand_pos,
                                 id=protein.id,
                                 type='CDS',
                                 qualifiers=quals)
            features.append(feature)
            counter +=1
        record.features = features
        record.description = dataset['f_nick']+'_contig_'+nick
        record.name = nick
        record.dbxrefs = ['Project:np1']
        record.seq.alphabet = generic_dna
        gbk_out = solid_out_dir+nick+'.gbk'
        write_genbank(gbk_out, record)
        # generate graphical map
        ContigDraw(nick, gbk_out, map_file)
    sp_hit_list_handle.write("</ul>")
    all_hit_list_handle.write("</ul>")
    sp_hit_list_handle.close()
    all_hit_list_handle.close()
    print "\t", gene_count, "predicted genes in", ctg_count, "contigs"