def _sam_to_bam(bam_fn): if not bam_fn.endswith("bam"): bam_out = "%s.bam" % os.path.splitext(bam_fn)[0] cmd = "samtools view -Sbh {bam_fn} -o {bam_out}" do.run(cmd.format(**locals())) return bam_out return bam_fn
def _cmd_miraligner(fn, out_file, species, hairpin):
    """
    Run miraligner for miRNA annotation, reusing ``out_file`` if present.

    The database directory passed to the tool is the directory that contains
    ``hairpin``. After a run, ``out_file + ".mirna"`` is moved onto
    ``out_file``. Returns ``out_file``.
    """
    # Resolved before the cache check, as in the original call order.
    tool = _get_miraligner()
    path_db = op.dirname(op.abspath(hairpin))

    if file_exists(out_file):
        return out_file

    template = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3"
    command = template.format(tool=tool, fn=fn, out_file=out_file,
                              species=species, path_db=path_db)
    do.run(command, "miraligner with %s" % fn)
    # The result is picked up from "<out_file>.mirna" and renamed.
    shutil.move(out_file + ".mirna", out_file)
    return out_file
def _download_mirbase(args, version="CURRENT"): """ Download files from mirbase """ if not args.hairpin or not args.mirna: logger.info("Working with version %s" % version) hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz") mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz") if not file_exists(hairpin_fn): cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s && gunzip -f !$" % ( version, hairpin_fn) do.run(cmd_h, "download hairpin") if not file_exists(mirna_fn): cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % ( version, mirna_fn) do.run(cmd_m, "download mirna") else: return args.hairpin, args.mirna
def _bam_sort(bam_fn):
    """Sort a BAM by read name (``samtools sort -n``), reusing prior output.

    The sorted file is written next to the input as ``<stem>_sort.bam``; the
    samtools call is skipped when that file already exists. Returns the path
    of the sorted file.
    """
    sorted_fn = op.splitext(bam_fn)[0] + "_sort.bam"
    if file_exists(sorted_fn):
        return sorted_fn

    template = "samtools sort -n -o {bam_sort_by_n} {bam_fn}"
    do.run(template.format(bam_sort_by_n=sorted_fn, bam_fn=bam_fn))
    return sorted_fn
def _bam_sort(bam_fn):
    """Sort a BAM by read name (``samtools sort -n``).

    Always runs samtools and writes ``<stem>_sort.bam`` next to the input.
    Returns the path of the sorted file.
    """
    stem, _ext = os.path.splitext(bam_fn)
    sorted_fn = stem + "_sort.bam"
    template = "samtools sort -n -o {bam_sort_by_n} {bam_fn}"
    command = template.format(bam_sort_by_n=sorted_fn, bam_fn=bam_fn)
    runner.run(command)
    return sorted_fn
def _sam_to_bam(bam_fn): bam_out = "%s.bam" % os.path.splitext(bam_fn)[0] cmd = "samtools view -Sbh {bam_fn} -o {bam_out}" runner.run(cmd.format(**locals())) return bam_fn
# read sequences and score hits (ignore same sequence) handle = pysam.Samfile(sam, "rb") for line in handle: reference = handle.getrname(line.reference_id) name = line.query_name # sequence = line.query_sequence if not line.is_reverse else reverse_complement(line.query_sequence) if reference == name: continue # print([reference, name, line.get_tag("NM")]) distance = line.get_tag("NM") uniques[name].append(distance) uniques[reference].append(distance) # read parsed data and keep the ones with score > 10 edit distance for name in uniques: if min(uniques[name]) < 5: if name in source: source[name] = None return source # Map all vs all with razers3 source = _read_fasta(args.fa) sam = os.path.join(os.path.dirname(args.out), "modified.bam") runner.run(( "razers3 -dr 5 -i 75 -rr 80 -f -so 1 -o {output} {target} {query}").format( output=sam, target=args.fa, query=args.fa)) uniques = _parse_hits(sam, source) # Write uniques to fasta _write_fasta(uniques, args.out, args.max_size)
# NOTE(review): the original layout of this script section was lost; the
# nesting below (in particular the second print inside the universe branch)
# is the most plausible reading and should be confirmed.

# Optional sequence file; when given, the candidates are additionally
# aligned against it further down.
parser.add_argument("--universe", help="Set up universe sequences to avoid duplication.", default=None)
args = parser.parse_args()
# Seed the random module from the command-line option.
random.seed(args.seed)
# Logging is initialised with the absolute directory of the output file.
mylog.initialize_logger(os.path.dirname(os.path.abspath(args.out)))
logger = mylog.getLogger(__name__)
# Read file to get all sequences longer than size - 2
size = args.size - 2
source = _read_fasta(args.fa, size)
logger.info("%s was read: %s sequences were loaded" % (args.fa, len(source)))
source = _update_ends(source)
# Logs the whole collection, not only its size.
logger.info("source updated with extended nts: %s" % source)
# Map all vs all with razers3
# Intermediate files go to dirname(args.out), without abspath.
modified = _write_fasta(source, os.path.join(os.path.dirname(args.out), "modified.fa"))
sam = os.path.join(os.path.dirname(args.out), "modified.bam")
# The modified sequences are used as both target and query.
runner.run(("razers3 -i 75 -rr 80 -f -so 1 -o {output} {target} {query}").format(output=sam, target=modified, query=modified))
uniques = _parse_hits(sam, source)
print(uniques)
if args.universe:
    # Second pass: the same queries against the universe file, feeding the
    # previous result back into _parse_hits.
    sam = os.path.join(os.path.dirname(args.out), "modified_vs_universe.sam")
    runner.run(("razers3 -i 75 -rr 80 -f -o {output} {target} {query}").format(output=sam, target=args.universe, query=modified))
    uniques = _parse_hits(sam, uniques)
    print(uniques)
# Write uniques to fasta
_write_fasta(uniques, args.out)
def get_fasta(bed_file, ref, out_fa):
    """Extract the sequences of the BED intervals into a FASTA file.

    Builds a ``bedtools getfasta -s`` command that reads intervals from
    ``bed_file`` and sequence from ``ref``, writes to ``out_fa``, and hands
    it to ``run``. Returns nothing.
    """
    template = "bedtools getfasta -s -fi {ref} -bed {bed_file} -fo {out_fa}"
    command = template.format(ref=ref, bed_file=bed_file, out_fa=out_fa)
    run(command)