def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM on every contig of a FASTA file and summarise the genes.

    Each contig is written to its own FASTA file inside a fresh temporary
    directory (created under ``tmp_dir``) and passed to the ``glimmerhmm``
    executable found in ``tool_dir``. The per-contig GFF files of the
    successful runs are merged into ``out_fpath + '_genes.gff'``.

    Args:
        tool_dir: directory containing the ``glimmerhmm`` executable and its
            ``trained/arabidopsis`` training directory.
        fasta_fpath: path of the input FASTA file.
        out_fpath: prefix for the output files (``_genes.gff`` and, when
            ``OUTPUT_FASTA`` is set, ``_genes.fasta``).
        gene_lengths: length thresholds; one counter is produced per entry.
        err_path: file that the tool's stdout and stderr are appended to.
        tmp_dir: parent directory for the temporary working directory.
        index: passed to ``qutils.index_to_str`` to build the log indent.

    Returns:
        ``(out_gff_path, unique_count, total, cnt)`` where ``unique_count`` is
        the number of distinct gene sequences, ``total`` the number of GFF
        records, and ``cnt[i]`` the number of records with
        ``end - start > gene_lengths[i]``. Returns
        ``(None, None, None, None)`` when no contig was processed
        successfully. The FASTA path is not part of the return value.
    """
    def run(contig_path, tmp_path):
        # Append everything the tool prints to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    try:
        for ind, seq in read_fasta(fasta_fpath):
            # Contig names become file names, so replace path-hostile
            # characters before building the paths.
            ind = re.sub('[/. ]', '_', ind)
            contig_path = os.path.join(base_dir, ind + '.fasta')
            gff_path = os.path.join(base_dir, ind + '.gff')

            write_fasta(contig_path, [(ind, seq)])
            if run(contig_path, gff_path) == 0:
                gffs.append(gff_path)
                contigs[ind] = seq

        if not gffs:
            return None, None, None, None

        out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
        unique, total = set(), 0
        genes = []
        cnt = [0] * len(gene_lengths)
        for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
            total += 1
            # NOTE(review): if parse_gff yields 1-based inclusive GFF
            # coordinates this slice is shifted by one base - confirm.
            gene_seq = contigs[contig][start:end + 1]
            if strand != '+':
                gene_seq = rev_comp(gene_seq)
            unique.add(gene_seq)
            # NOTE(review): assumes every record (not only the first
            # occurrence of a sequence) is stored and counted - confirm.
            genes.append((gene_id, gene_seq))
            for idx, gene_length in enumerate(gene_lengths):
                # A bool adds as 0/1, so this counts records above the threshold.
                cnt[idx] += end - start > gene_length

        if OUTPUT_FASTA:
            out_fasta_path = out_fpath + '_genes.fasta'
            write_fasta(out_fasta_path, genes)

        return out_gff_path, len(unique), total, cnt
    finally:
        # Remove the scratch directory on every exit path (including the
        # "nothing succeeded" return and exceptions) unless debugging.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM on every contig of a FASTA file and summarise the genes.

    Each contig is written to its own FASTA file inside a fresh temporary
    directory (created under ``tmp_dir``) and passed to the ``glimmerhmm``
    executable found in ``tool_dir``. The per-contig GFF files of the
    successful runs are merged into ``out_fpath + '_genes.gff'``.

    Args:
        tool_dir: directory containing the ``glimmerhmm`` executable and its
            ``trained/arabidopsis`` training directory.
        fasta_fpath: path of the input FASTA file.
        out_fpath: prefix for the output files (``_genes.gff`` and, when
            ``OUTPUT_FASTA`` is set, ``_genes.fasta``).
        gene_lengths: length thresholds; one counter is produced per entry.
        err_path: file that the tool's stdout and stderr are appended to.
        tmp_dir: parent directory for the temporary working directory.
        index: passed to ``qutils.index_to_str`` to build the log indent.

    Returns:
        ``(out_gff_path, unique_count, total, cnt)`` where ``unique_count`` is
        the number of distinct gene sequences, ``total`` the number of GFF
        records, and ``cnt[i]`` the number of records with
        ``end - start > gene_lengths[i]``. When no contig was processed
        successfully an error is logged and ``(None, None, None, None)`` is
        returned. The FASTA path is not part of the return value.
    """
    def run(contig_path, tmp_path):
        # Append everything the tool prints to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    try:
        for ind, seq in read_fasta(fasta_fpath):
            contig_path = os.path.join(base_dir, ind + '.fasta')
            gff_path = os.path.join(base_dir, ind + '.gff')

            write_fasta(contig_path, [(ind, seq)])
            if run(contig_path, gff_path) == 0:
                gffs.append(gff_path)
                contigs[ind] = seq

        if not gffs:
            # Format the label into the first sentence BEFORE appending the
            # hint: '%' binds tighter than '+', so formatting the
            # concatenation inline would apply '%' to the hint (which has no
            # placeholder) and raise TypeError instead of logging.
            message = ('Glimmer failed running Glimmer for %s. '
                       % qutils.label_from_fpath(fasta_fpath))
            if not qconfig.debug:
                message += 'Run with the --debug option to see the command line.'
            logger.error(message)
            return None, None, None, None

        out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
        unique, total = set(), 0
        genes = []
        cnt = [0] * len(gene_lengths)
        for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
            total += 1
            # NOTE(review): if parse_gff yields 1-based inclusive GFF
            # coordinates this slice is shifted by one base - confirm.
            gene_seq = contigs[contig][start:end + 1]
            if strand != '+':
                gene_seq = rev_comp(gene_seq)
            unique.add(gene_seq)
            # NOTE(review): assumes every record (not only the first
            # occurrence of a sequence) is stored and counted - confirm.
            genes.append((gene_id, gene_seq))
            for idx, gene_length in enumerate(gene_lengths):
                # A bool adds as 0/1, so this counts records above the threshold.
                cnt[idx] += end - start > gene_length

        if OUTPUT_FASTA:
            out_fasta_path = out_fpath + '_genes.fasta'
            write_fasta(out_fasta_path, genes)

        return out_gff_path, len(unique), total, cnt
    finally:
        # Remove the scratch directory on every exit path (including the
        # failure return and exceptions) unless debugging.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
# Command-line helper: for every entry of a FASTA file, print the subsequence
# between the 1-based, inclusive positions START and END, optionally
# reverse-complemented.
import sys
import os

# Make the parent of the script's directory importable so that the
# project-local `libs` package can be found.
sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '../'))
import libs
from libs import fastaparser

# Expect 3 positional arguments plus an optional 4th (reverse-complement flag).
if len(sys.argv) <= 3 or len(sys.argv) >= 6:
    print("Returns [reverse-complement] sequence from START to END position from each entry of input fasta")
    print("Usage: " + sys.argv[0] + " <input fasta> <START> <END, -1 for the end> [any string -- optional parameter for reverse-complement]")
    sys.exit()

inp = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])
# Any fifth argument, whatever its value, switches on reverse-complement output.
reverse = len(sys.argv) == 5

for tup in fastaparser.read_fasta(inp):
    name, seq = tup[0], tup[1]
    # Clamp START to the sequence length and to a minimum of 1: a START of 0
    # or less would make the slice index negative and silently wrap around to
    # the end of the sequence.
    cur_start = max(1, min(start, len(seq)))
    if end == -1:
        cur_end = len(seq)
    else:
        cur_end = min(end, len(seq))
    cropped = seq[cur_start - 1:cur_end]
    print(">" + name + "_cropped_" + str(cur_start) + "_" + str(cur_end))
    if reverse:
        print(fastaparser.rev_comp(cropped))
    else:
        print(cropped)