コード例 #1
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Predict genes with GlimmerHMM, one run per sequence, and summarise them.

    Every record of ``fasta_fpath`` is written to its own FASTA file inside a
    fresh temporary directory created under ``tmp_dir`` and handed to the
    ``glimmerhmm`` executable found in ``tool_dir``.  The GFF outputs of the
    runs that exit with code 0 are merged into a single file, which is then
    parsed back into ``Gene`` objects.

    Args:
        tool_dir: Directory containing the ``glimmerhmm`` executable and the
            ``trained/arabidopsis`` directory passed to it with ``-d``.
        fasta_fpath: Path of the FASTA file to process.
        out_fpath: Prefix of the output files: ``_genes.gff`` (plus ``.gz``
            unless ``qconfig.no_gzip`` is set) and, when ``OUTPUT_FASTA`` is
            truthy, ``_genes.fasta``.
        gene_lengths: Collection of thresholds; it is iterated twice, so it
            must not be a one-shot iterator.
        err_path: File that receives, in append mode, stdout and stderr of
            every ``glimmerhmm`` run.
        tmp_dir: Parent directory of the temporary working directory.
        index: Value passed to ``qutils.index_to_str`` to build the log indent.

    Returns:
        A 6-tuple ``(out_gff_path, genes, unique_count, total, full_cnt,
        partial_cnt)`` where ``full_cnt`` / ``partial_cnt`` hold, for each
        threshold in ``gene_lengths``, the number of full / partial genes with
        ``end - start >= threshold``.  When no run succeeded, a 6-tuple of
        ``None`` values is returned so that callers can unpack both outcomes
        the same way.
    """
    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    def run(contig_path, tmp_path):
        # Both output streams of the tool are appended to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    try:
        contigs = {}
        gffs = []
        for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
            file_stem = str(seq_num)
            # Names are cut to qutils.MAX_CONTIG_NAME_GLIMMER characters
            # (presumably a GlimmerHMM limit -- TODO confirm); the same
            # truncated name is used as the key when looking sequences up.
            ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
            contig_path = os.path.join(base_dir, file_stem + '.fasta')
            gff_path = os.path.join(base_dir, file_stem + '.gff')

            write_fasta(contig_path, [(ind, seq)])
            if run(contig_path, gff_path) == 0:
                gffs.append(gff_path)
                contigs[ind] = seq

        if not gffs:
            # Same arity as the success path below.
            return None, None, None, None, None, None

        out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
        out_gff_path = merge_gffs(gffs, out_gff_fpath)

        unique = set()
        genes = []
        for contig, _gene_id, start, end, strand in parse_gff(out_gff_path):
            contig_seq = contigs[contig]
            gene_seq = contig_seq[start - 1:end]
            if strand != '+':
                gene_seq = rev_comp(gene_seq)
            unique.add(gene_seq)
            gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
            # A gene touching either end of its contig is counted as partial.
            gene.is_full = gene.start > 1 and gene.end < len(contig_seq)
            genes.append(gene)
        total = len(genes)

        full_cnt = [
            sum(1 for gene in genes
                if gene.is_full and gene.end - gene.start >= threshold)
            for threshold in gene_lengths]
        partial_cnt = [
            sum(1 for gene in genes
                if not gene.is_full and gene.end - gene.start >= threshold)
            for threshold in gene_lengths]

        if OUTPUT_FASTA:
            out_fasta_fpath = out_fpath + '_genes.fasta'
            add_genes_to_fasta(genes, out_fasta_fpath)

        return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
    finally:
        # The working directory is removed on every exit path (including the
        # early return and errors) unless debug mode asks to keep it.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
コード例 #2
0
ファイル: glimmer.py プロジェクト: student-t/quast
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Predict genes with GlimmerHMM, one run per sequence, and summarise them.

    Every record of ``fasta_fpath`` is written to its own FASTA file inside a
    fresh temporary directory created under ``tmp_dir`` and handed to the
    ``glimmerhmm`` executable found in ``tool_dir``.  The GFF outputs of the
    runs that exit with code 0 are merged into a single file, which is then
    parsed back into ``Gene`` objects.

    Args:
        tool_dir: Directory containing the ``glimmerhmm`` executable and the
            ``trained/arabidopsis`` directory passed to it with ``-d``.
        fasta_fpath: Path of the FASTA file to process.
        out_fpath: Prefix of the output files: ``_genes.gff`` (plus ``.gz``
            unless ``qconfig.no_gzip`` is set) and, when ``OUTPUT_FASTA`` is
            truthy, ``_genes.fasta``.
        gene_lengths: Collection of thresholds; it is iterated twice, so it
            must not be a one-shot iterator.
        err_path: File that receives, in append mode, stdout and stderr of
            every ``glimmerhmm`` run.
        tmp_dir: Parent directory of the temporary working directory.
        index: Value passed to ``qutils.index_to_str`` to build the log indent.

    Returns:
        A 6-tuple ``(out_gff_path, genes, unique_count, total, full_cnt,
        partial_cnt)`` where ``full_cnt`` / ``partial_cnt`` hold, for each
        threshold in ``gene_lengths``, the number of full / partial genes with
        ``end - start >= threshold``.  When no run succeeded, a 6-tuple of
        ``None`` values is returned so that callers can unpack both outcomes
        the same way.
    """
    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    def run(contig_path, tmp_path):
        # Both output streams of the tool are appended to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    try:
        contigs = {}
        gffs = []
        for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
            file_stem = str(seq_num)
            # Names are cut to qutils.MAX_CONTIG_NAME_GLIMMER characters
            # (presumably a GlimmerHMM limit -- TODO confirm); the same
            # truncated name is used as the key when looking sequences up.
            ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
            contig_path = os.path.join(base_dir, file_stem + '.fasta')
            gff_path = os.path.join(base_dir, file_stem + '.gff')

            write_fasta(contig_path, [(ind, seq)])
            if run(contig_path, gff_path) == 0:
                gffs.append(gff_path)
                contigs[ind] = seq

        if not gffs:
            # Same arity as the success path below.
            return None, None, None, None, None, None

        out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
        out_gff_path = merge_gffs(gffs, out_gff_fpath)

        unique = set()
        genes = []
        for contig, _gene_id, start, end, strand in parse_gff(out_gff_path):
            contig_seq = contigs[contig]
            gene_seq = contig_seq[start - 1:end]
            if strand != '+':
                gene_seq = rev_comp(gene_seq)
            unique.add(gene_seq)
            gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
            # A gene touching either end of its contig is counted as partial.
            gene.is_full = gene.start > 1 and gene.end < len(contig_seq)
            genes.append(gene)
        total = len(genes)

        full_cnt = [
            sum(1 for gene in genes
                if gene.is_full and gene.end - gene.start >= threshold)
            for threshold in gene_lengths]
        partial_cnt = [
            sum(1 for gene in genes
                if not gene.is_full and gene.end - gene.start >= threshold)
            for threshold in gene_lengths]

        if OUTPUT_FASTA:
            out_fasta_fpath = out_fpath + '_genes.fasta'
            add_genes_to_fasta(genes, out_fasta_fpath)

        return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
    finally:
        # The working directory is removed on every exit path (including the
        # early return and errors) unless debug mode asks to keep it.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
コード例 #3
0
ファイル: genemark.py プロジェクト: lucian-ilie/LASER
def parse_gmhmm_out(out_fpath):
    """Yield a Gene for each nucleotide record of a GeneMark.hmm output file.

    A record starts with a header line such as
    ``>gene_2|GeneMark.hmm|57_nt|+|1|57<TAB>>NODE_3_length_713_cov_1.25228``
    and ends at the next blank line or at the end of the file.  A length
    field containing ``nt`` marks a nucleotide record, one containing ``aa``
    a protein record.  Protein records are not yielded on their own: they
    are remembered and attached to the nucleotide record that has the same
    gene id and contig.  Nucleotide records with an empty sequence are
    skipped.

    Args:
        out_fpath: Path of the GeneMark.hmm output file.

    Yields:
        ``Gene`` objects built from contig, start, end, strand, seq and
        protein.  ``protein`` is ``''`` when no protein record for that gene
        was read earlier in the file.

    Raises:
        ValueError: If a header line does not split into the expected
            fields or its coordinates are not integers.
    """
    from itertools import chain

    reading_gene = False
    reading_protein = False
    # (gene id, contig id) -> protein sequence read so far and not yet used.
    proteins = {}
    with open(out_fpath) as f:
        # The synthetic trailing blank line terminates a last record that is
        # not followed by a blank line in the file, so it is not lost.
        for line in chain(f, ('\n',)):
            if line.startswith('>gene'):
                seq = []
                seq_id, contig_id = line.strip().split('\t')
                # >gene_2|GeneMark.hmm|57_nt|+|1|57<TAB>>NODE_3_length_713_cov_1.25228
                gene_id, _, seq_len, strand, left_index, right_index = seq_id.split('|')
                # Drop the leading '>' of both identifiers.
                key = (gene_id[1:], contig_id[1:])
                contig_id = contig_id[1:]
                if 'nt' in seq_len:
                    reading_gene = True
                elif 'aa' in seq_len:
                    reading_protein = True
            elif reading_gene or reading_protein:
                if not line.isspace():
                    seq.append(line.strip())
                    continue
                # Blank line: the current record is complete.
                record = ''.join(seq)
                seq = []
                is_gene = reading_gene
                reading_gene = reading_protein = False
                if not is_gene:
                    proteins[key] = record
                elif record:
                    # Pair the gene with its own protein rather than with
                    # whichever protein happened to be read last.
                    yield Gene(contig=contig_id, start=int(left_index), end=int(right_index),
                               strand=strand, seq=record, protein=proteins.pop(key, ''))
コード例 #4
0
ファイル: genemark.py プロジェクト: lucian-ilie/LASER
def parse_gtf_out(out_fpath):
    """Yield a Gene for every CDS feature line of a GTF file.

    Lines are split on whitespace.  Only lines whose third field (the GTF
    feature column) is exactly ``CDS`` are used, so other feature lines that
    merely contain the text "CDS" somewhere (for example in a contig name)
    are ignored.

    Args:
        out_fpath: Path of the GTF file.

    Yields:
        ``Gene`` objects with contig from field 1, start and end from fields
        4 and 5 (as integers) and strand from field 7.  The tenth
        whitespace-separated token is passed as ``seq`` (presumably the
        quoted ``gene_id`` value -- TODO confirm against the GTF producer).

    Raises:
        IndexError: If a CDS line has fewer than ten whitespace-separated
            fields.
        ValueError: If the start or end field of a CDS line is not an integer.
    """
    with open(out_fpath) as f:
        for line in f:
            fields = line.split()
            # Column 3 of a GTF line holds the feature type.
            if len(fields) < 3 or fields[2] != 'CDS':
                continue
            yield Gene(contig=fields[0], strand=fields[6],
                       start=int(fields[3]), end=int(fields[4]), seq=fields[9])
コード例 #5
0
ファイル: genemark.py プロジェクト: alartin/quast
def parse_gmhmm_out(out_fpath):
    """Parse a GeneMark.hmm output file into a list of Gene objects.

    A record starts with a header line such as
    ``>gene_2|GeneMark.hmm|57_nt|+|1|57<TAB>>NODE_3_length_713_cov_1.25228``
    and ends at the next blank line or at the end of the file.  A length
    field containing ``nt`` marks a nucleotide record, one containing ``aa``
    a protein record.  Records sharing a gene id are merged into one
    ``Gene``: the nucleotide record fills ``seq`` and the protein record
    fills ``protein``.

    Args:
        out_fpath: Path of the GeneMark.hmm output file.

    Returns:
        The genes in order of the first appearance of their gene id.

    Raises:
        ValueError: If a header line does not split into the expected
            fields or its coordinates are not integers.
    """
    from itertools import chain

    reading_gene = False
    reading_protein = False
    genes_by_id = OrderedDict()
    gene_id = None
    with open(out_fpath) as f:
        # The synthetic trailing blank line terminates a last record that is
        # not followed by a blank line in the file, so it is not lost.
        for line in chain(f, ('\n',)):
            if line.startswith('>gene'):
                seq = []
                seq_id, contig_id = line.strip().split('\t')
                # >gene_2|GeneMark.hmm|57_nt|+|1|57<TAB>>NODE_3_length_713_cov_1.25228
                gene_id, _, seq_len, strand, left_index, right_index = seq_id.split('|')
                # Drop the leading '>' of both identifiers.
                gene_id = gene_id[1:]
                contig_id = contig_id[1:]
                if 'nt' in seq_len:
                    reading_gene = True
                elif 'aa' in seq_len:
                    reading_protein = True
            elif reading_gene or reading_protein:
                if not line.isspace():
                    seq.append(line.strip())
                    continue
                # Blank line: the current record is complete.
                record = ''.join(seq)
                seq = []
                is_gene = reading_gene
                reading_gene = reading_protein = False

                left_index = int(left_index)
                right_index = int(right_index)
                gene = genes_by_id.get(gene_id)
                if gene is None:
                    gene = Gene(contig=contig_id, start=left_index, end=right_index, strand=strand)
                    genes_by_id[gene_id] = gene
                # An empty record leaves the gene's attributes untouched.
                if record:
                    if is_gene:
                        gene.seq = record
                    else:
                        gene.protein = record
    return list(genes_by_id.values())