def convert(args):
    """Convert every input file using the loaded miRBase annotation.

    Loads the hairpin FASTA and the GFF->precursor coordinate table, then
    processes each file in ``args.files``.

    NOTE(review): ``samples`` and ``database`` are assigned but never used in
    this body, and ``read_file`` is not defined in the visible scope (other
    variants of this function call ``_read_file``/``_convert_file``) —
    confirm the intended helper.
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    # reference sequences and mature miRNA coordinates used by the reader
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    for fn in args.files:
        read_file(fn, precursors, matures)
def _convert_file(gff, args):
    """Convert a single mirGFF3 file into a raw-data TSV table.

    Args:
        *gff(str)*: path of the mirGFF3 file to convert.
        *args*: namespace providing ``hairpin``, ``sps``, ``gtf`` and
            ``out`` (output directory).

    Writes ``<basename>_rawData.tsv`` into ``args.out`` and logs counters
    for records that could not be converted.
    """
    sep = "\t"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    variant_header = sep.join(['mism', 'add', 't5', 't3'])
    out_file = os.path.join(args.out,
                            "%s_rawData.tsv" % os.path.splitext(
                                os.path.basename(gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # FIX: both handles are now managed by ``with`` so they are closed even
    # if parsing raises (the input handle previously leaked on error).
    with open(gff, 'r') as gff_file, open(out_file, 'w') as outh:
        # First pass: locate the COLDATA header line to name sample columns.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['seq', 'mir', variant_header, samples])
                print(header, file=outh)
                break
        # The iterator resumes after the header line: feature records only.
        for mirna_line in gff_file:
            # FIX: the original rebound ``gff`` here, shadowing the file-path
            # parameter; use a distinct name for the parsed feature.
            record = feature(mirna_line)
            attr = record.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                # decode the sequence from the UID; skip invalid encodings
                Read = read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue
            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if parent not in precursors:
                missing_parent += 1
                continue
            if mirna not in matures[parent]:
                missing_mirna += 1
                continue
            extra = variant_with_nt(mirna_line, precursors, matures)
            if extra == "Invalid":
                continue
            logger.debug("COUNTS::EXTRA:%s" % extra)
            cols_variants = sep.join(_expand(extra, True))
            summary = sep.join([Read, mirna, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
def reader(args):
    """
    Realign BAM hits to miRBAse to get better accuracy and annotation.

    Dispatches each input file to the importer matching ``args.format``,
    annotates the resulting reads, writes one output per sample and a
    merged ``mirtop.<out_format>`` file.
    """
    # Delegate to the streaming implementation when requested.
    if args.low_memory:
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        elif args.format == "gff":
            # mirGFF3 input is read directly; no re-annotation step.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        # Formats whose importers already return mirGFF3 bodies skip the
        # annotate/create step.
        if args.format not in ["isomirsea", "srnabench", "manatee", 'optimir']:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, header.make_tools(args.format))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    # NOTE(review): this check is unreachable — the same condition returned
    # at the top of the function; also ``make_tools`` receives a bare string
    # above but a list here — confirm the expected argument type.
    if args.low_memory:
        return None
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
def test_read(self):
    """Load the example GFF and hairpin files and cross-check them."""
    from mirtop.libs import logger
    from mirtop.mirna import mapper, fasta
    logger.initialize_logger("test_read_files", True, True)
    # Coordinates of each mature miRNA inside its precursor.
    coords = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")
    print(coords)
    if coords["hsa-let-7a-1"]["hsa-let-7a-5p"][0] != 5:
        raise ValueError("GFF is not loaded correctly.")
    # Reading with and without a species filter must be equivalent here.
    with_sps = fasta.read_precursor(
        "data/examples/annotate/hairpin.fa", "hsa")
    print(with_sps)
    without_sps = fasta.read_precursor(
        "data/examples/annotate/hairpin.fa", None)
    print(without_sps)
    if with_sps != without_sps:
        raise ValueError("species value generates two different dicts.")
    # read data/aligments/let7-perfect.bam
    return True
def test_prost(self):
    """testing reading prost files function"""
    from mirtop.libs import logger
    from mirtop.mirna import fasta
    from mirtop.importer import prost
    logger.initialize_logger("test", True, True)
    logger = logger.getLogger(__name__)
    hairpins = fasta.read_precursor(
        "data/examples/annotate/hairpin.fa", "hsa")
    example_fn = "data/examples/prost/prost.example.txt"
    parsed = prost.read_file(
        example_fn, hairpins, "miRBasev21",
        "data/examples/annotate/hsa.gff3")
    annotate("data/example/prost/prost.example.txt", parsed, True)
def test_prost(self):
    """testing reading prost files function"""
    from mirtop.libs import logger
    logger.initialize_logger("test", True, True)
    logger = logger.getLogger(__name__)
    from mirtop.mirna import fasta
    from mirtop.importer import prost
    # Precursor sequences restricted to the human (hsa) entries.
    reference = fasta.read_precursor(
        "data/examples/annotate/hairpin.fa", "hsa")
    prost_fn = "data/examples/prost/prost.example.txt"
    reads = prost.read_file(prost_fn,
                            reference,
                            "miRBasev21",
                            "data/examples/annotate/hsa.gff3")
    annotate("data/example/prost/prost.example.txt", reads, True)
def convert(args):
    """
    Main function to convert from GFF3 to isomiRs Bioc Package.

    Args:
        *args*: supported options for this sub-command.
            See *mirtop.libs.parse.add_subparser_export()*.
    """
    hairpins = fasta.read_precursor(args.hairpin, args.sps)
    coordinates = mapper.read_gtf_to_precursor(args.gtf)
    for file_name in args.files:
        logger.info("Reading %s" % file_name)
        _read_file(file_name, hairpins, coordinates, args.out)
def test_variant(self):
    """testing get mature sequence"""
    from mirtop.mirna import fasta, mapper
    from mirtop.mirna.realign import get_mature_sequence, \
        align_from_variants
    precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                      "hsa")
    matures = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")
    res = get_mature_sequence("GAAAATTTTTTTTTTTAAAAG", [5, 15])
    if res != "AAAATTTTTTTTTTTAAAA":
        raise ValueError("Results for GAAAATTTTTTTTTTTAAAAG was %s" % res)
    mature = get_mature_sequence(precursors["hsa-let-7a-1"],
                                 matures["hsa-let-7a-1"]["hsa-let-7a-5p"])
    if mature != "GGGATGAGGTAGTAGGTTGTATAGTTTTAG":
        raise ValueError("Results for hsa-let-7a-5p is %s" % mature)
    # Tests 1-6: clean isomiR variants should align without mismatches.
    res = align_from_variants("AGGTAGTAGGTTGTATAGTT", mature, "iso_5p:-2")
    if res:
        raise ValueError("Wrong alignment for test 1 %s" % res)
    res = align_from_variants("GATGAGGTAGTAGGTTGTATAGTT", mature,
                              "iso_5p:+2")
    if res:
        raise ValueError("Wrong alignment for test 2 %s" % res)
    res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                              "iso_5p:-2,iso_add:2")
    if res:
        raise ValueError("Wrong alignment for test 3 %s" % res)
    res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                              "iso_5p:-2,iso_3p:2")
    if res:
        raise ValueError("Wrong alignment for test 4 %s" % res)
    res = align_from_variants("AGGTAGTAGGTTGTATAG", mature,
                              "iso_5p:-2,iso_3p:-2")
    if res:
        raise ValueError("Wrong alignment for test 5 %s" % res)
    res = align_from_variants("AGGTAGTAGGTTGTATAGAA", mature,
                              "iso_5p:-2,iso_3p:-2,iso_add:2")
    if res:
        raise ValueError("Wrong alignment for test 6 %s" % res)
    # Test 7: a central SNP should be reported at position 10.
    # FIX: the original guarded this with ``if not res:`` and then indexed
    # ``res[0][0]`` — an IndexError on an empty result, and no check at all
    # when a result existed. Require a result with the SNP at position 10.
    res = align_from_variants("AGGTAGTAGGATGTATAGTT", mature,
                              "iso_5p:-2,iso_snp_central")
    if not res or res[0][0] != 10:
        raise ValueError("Wrong alignment for test 7 %s" % res)
    res = align_from_variants("AGGTAGTAGGATGTATAGAA", mature,
                              "iso_5p:-2,iso_3p:-2,iso_add:2")
    if res:
        raise ValueError("Wrong alignment for test 8 %s" % res)
def reader(args):
    """
    Realign BAM hits to miRBAse to get better accuracy and annotation.

    Dispatches each input file to the importer matching ``args.format``,
    annotates the reads, writes one GFF per sample and a merged
    ``mirtop.<out_format>`` file.
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "gff":
            # mirGFF3 input is read directly; no re-annotation step.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        # NOTE(review): if ``args.format`` matches none of the branches
        # above, ``reads`` is unbound here — presumably the CLI validates
        # the format upstream; confirm.
        if args.format not in ["isomirsea", "srnabench"]:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, "")
        _write(out_dts[fn], h, fn_out)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, header.create(samples, database, ""), fn_merged_out)
def test_alignment(self):
    """testing alignments function"""
    from mirtop.libs import logger
    logger.initialize_logger("test", True, True)
    logger = logger.getLogger(__name__)
    from mirtop.mirna import fasta, mapper
    precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                      "hsa")
    matures = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")
    # matures = mirtop.mirna.read_mature("data/examples/annotate/mirnas.gff", "hsa")

    def annotate(fn, precursors, matures):
        # Read one SAM file, annotate it and render a GFF body.
        from mirtop.bam import bam
        from mirtop.gff import body
        reads = bam.read_bam(fn, precursors)
        ann = bam.annotate(reads, matures, precursors)
        gff = body.create(ann, "miRBase21", "example", fn + ".gff3", "#")
    # FIX: Python-2 ``print`` statements converted to parenthesized calls;
    # single-argument prints behave identically on Python 2 and 3.
    print("\nlast1D\n")
    annotate("data/aligments/let7-last1D.sam", precursors, matures)
    # mirna TGAGGTAGTAGGTTGTATAGTT
    # seq   AGAGGTAGTAGGTTGTA
    print("\n1D\n")
    annotate("data/aligments/let7-1D.sam", precursors, matures)
    # mirna TGAGGTAG-TAGGTTGTATAGTT
    # seq   TGAGGTAGGTAGGTTGTATAGTTA
    print("\nlast7M1I\n")
    annotate("data/aligments/let7-last7M1I.sam", precursors, matures)
    # mirna TGAGGTAGTAGGTTGTATAGTT
    # seq   TGAGGTAGTAGGTTGTA-AGT
    print("\nmiddle1D\n")
    annotate("data/aligments/let7-middle1D.sam", precursors, matures)
    # mirna TGAGGTAGTAGGTTGTATAGTT
    # seq   TGAGGTAGTAGGTTGTATAGTT
    print("\nperfect\n")
    annotate("data/aligments/let7-perfect.sam", precursors, matures)
    # mirna TGAGGTAGTAGGTTGTATAGTT
    # seq   TGAGGTAGTAGGTTGTATAG (3tt 3TT)
    print("\ntriming\n")
    annotate("data/aligments/let7-triming.sam", precursors, matures)
def test_srnabench(self):
    """testing reading seqbuster files function"""
    from mirtop.libs import logger
    logger.initialize_logger("test", True, True)
    logger = logger.getLogger(__name__)
    from mirtop.mirna import fasta, mapper
    precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                      "hsa")
    matures = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")

    def annotate(fn, precursors, matures):
        # Read one sRNAbench annotation file and annotate its reads.
        from mirtop.importer import srnabench
        from mirtop.bam import bam
        reads = srnabench.read_file(fn, precursors)
        ann = bam.annotate(reads, matures, precursors)
        return True
    # FIX: Python-2 ``print`` statement converted to a parenthesized call;
    # identical output on Python 2 and 3 for a single argument.
    print("\nsRNAbench\n")
    annotate("data/examples/srnabench/reads.annotation", precursors, matures)
def test_collapse(self):
    """testing GFF function"""
    from mirtop.libs import logger
    from mirtop.mirna import mapper, fasta
    from mirtop.gff import body, header
    logger.initialize_logger("test", True, True)
    logger = logger.getLogger(__name__)
    precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                      "hsa")
    # depend on https://github.com/miRTop/mirtop/issues/6
    matures = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")
    # matures = mirtop.mirna.read_mature("data/examples/annotate/mirnas.gff", "hsa")
    from mirtop.bam import bam
    bam_fn = "data/aligments/collapsing-isomirs.sam"
    reads = bam.read_bam(bam_fn, precursors)
    ann = bam.annotate(reads, matures, precursors)
    fn = bam_fn + ".gff"
    h = header.create(bam_fn, ["example"], "miRBase21")
    # NOTE(review): ``h`` is unused and the ``header`` *module* is passed to
    # body.create below — looks like it should be ``h``; confirm intent
    # before changing it.
    gff = body.create(ann, "miRBase21", "example", fn, header)
    # FIX: Python-2 ``print`` statement converted to a parenthesized call.
    print(gff)
    return True
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    (low-memory streaming mode: results are written per sample as they
    are produced instead of being accumulated in memory).
    """
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        h = header.create([sample], args.database, "")
        # FIX: manage the output handle with ``with`` — the original leaked
        # the open handle when the unsupported-format ValueError was raised.
        with open(fn_out, 'w') as out_handle:
            print(h, file=out_handle)
            if args.format == "BAM":
                if args.genomic:
                    low_memory_genomic_bam(fn, sample, out_handle, args)
                else:
                    low_memory_bam(fn, sample, out_handle, args)
            elif args.format == "seqbuster":
                seqbuster.read_file_low_memory(fn, sample, args, out_handle)
            else:
                raise ValueError("%s not supported for low memory" % args.format)
def reader(args):
    """
    Realign BAM hits to miRBAse to get better accuracy and annotation.

    Reads each input file with the importer matching ``args.format``,
    annotates the reads and creates one GFF body per sample.
    """
    database = mapper.guess_database(args.gtf)
    # hairpin, mirna = download_mirbase(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # check numnbers of miRNA and precursors read
    # print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            # NOTE(review): ``custom`` is never used afterwards — confirm.
            custom = seqbuster.header()
        elif args.format == "srnabench":
            # FIX: ``srnabench.read_gile`` was a typo; other versions of
            # this function call ``srnabench.read_file``.
            reads = srnabench.read_file(fn, precursors)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
def annotate(fn, read_file, load=False, create=True):
    """Annotate reads from *fn* against the bundled example miRBase data.

    ``read_file`` is either a reader callable (``load=False``) or the
    already-loaded reads (``load=True``). When ``create`` is true the
    annotated reads are rendered into a GFF body, which is returned.
    """
    import argparse
    from mirtop.mirna import fasta, mapper
    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    hairpins = fasta.read_precursor(args.hairpin, args.sps)
    coords = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = hairpins
    args.matures = coords
    args.database = mapper.guess_database(args.gtf)
    from mirtop.mirna import annotate
    from mirtop.gff import body
    reads = read_file if load else read_file(fn, args)
    if create:
        annotated = annotate.annotate(reads, coords, hairpins)
        body = body.create(annotated, "miRBase21", "Example", args)
    return body
def test_spikeins(self):
    """Test spikeins reading and annotation"""
    from mirtop.libs import spikeins
    from mirtop.mirna import mapper, fasta
    from mirtop.mirna.realign import get_mature_sequence
    spikes = spikeins.read_spikeins("data/examples/spikeins/spikeins.fa")
    print(spikes)
    # The mature sequence rebuilt from precursor+position must match the
    # mature stored in the spike-in record.
    first = spikes['spikein-1']
    rebuilt = get_mature_sequence(first['precursor'],
                                  first['position'],
                                  exact=True)
    if rebuilt != first['mature']:
        raise ValueError("Sequences doesn't match \n%s\n%s" % (
            rebuilt, first['mature']))
    file_fasta = "data/examples/spikeins/spikeins_pre.fasta"
    file_gff = "data/examples/spikeins/spikeins_pre.gff"
    spikeins.write_precursors(spikes, file_fasta)
    spikeins.write_gff(spikes, file_gff)
    # Round-trip: the written files must be readable by the mirtop loaders.
    print(mapper.read_gtf_to_mirna(file_gff))
    print(fasta.read_precursor(file_fasta, None))
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add3p', 'iso_snp'])
    if args.add_extra:
        # Extra nucleotide columns require the reference sequences.
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([
            variant_header, 'iso_5p_nt', 'iso_3p_nt', 'iso_add3p_nt',
            'iso_snp_nt'
        ])
    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)
    out_file = op.join(args.out,
                       "%s.tsv" % op.splitext(op.basename(args.gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # FIX: manage the input handle with ``with`` so it is closed even when
    # parsing raises (it previously leaked on that path).
    with open(args.gff, 'r') as gff_file, open(out_file, 'w') as outh:
        # First pass: locate the COLDATA line to name the sample columns.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")
                                   [1].strip().split(","))
                header = sep.join([
                    'UID', 'Read', 'miRNA', 'Variant', variant_header, samples
                ])
                print(header, file=outh)
                break
        # The iterator resumes after the header line: feature records only.
        for mirna_line in gff_file:
            record = feature(mirna_line)
            attr = record.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                # validate the UID encoding; the decoded value is unused
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue
            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join(
                [UID, Read, mirna, variant, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
help="give expression", default=False) parser.add_option("-p", "--prefix", help="output name") parser.add_option("--seed", help="set up seed for reproducibility.", default = None) (options, args) = parser.parse_args() if options.seed: random.seed(options.seed) full_fq = "%s_full.fq" % options.prefix clean_fq = "%s_clean.fq" % options.prefix out_gff = "%s.gff" % options.prefix if os.path.exists(full_fq): os.remove(full_fq) if os.path.exists(clean_fq): os.remove(clean_fq) pre = fasta.read_precursor(options.fa, "") mir = mapper.read_gtf_to_precursor(options.gtf) nt = ['A', 'T', 'G', 'C'] gffs = dict() h = header.create(["sampleX"], "miRBase1", "") for precursor in pre: seq = pre[precursor] gffs.update(create_iso(precursor, mir, seq, options.numsim, options.exp)) _write(gffs, h, out_gff)
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add', 'iso_snp'])
    if args.add_extra:
        # The extra nucleotide columns need the reference sequences.
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add_nt', 'iso_snp_nt'])
    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)
    gff_file = open(args.gff, 'r')
    out_file = op.join(args.out, "expression_counts.tsv")
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:
        # First pass: locate the COLDATA line to name the sample columns.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split(
                    "COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break
        # The iterator resumes after the header line: feature records only.
        for mirna_line in gff_file:
            mirna_values = read_gff_line(mirna_line)
            Read = mirna_values["attrb"]["Read"]
            UID = mirna_values["attrb"]["UID"]
            mirna = mirna_values["attrb"]["Name"]
            parent = mirna_values["attrb"]["Parent"]
            variant = mirna_values["attrb"]["Variant"]
            try:
                # validate the UID encoding; the decoded value is unused
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue
            expression = sep.join(
                mirna_values["attrb"]["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants]
                                         + _expand(extra, True))
            summary = sep.join([UID, Read, mirna,
                                variant, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)
    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
def create_vcf(mirgff3, precursor, gtf, vcffile):
    """
    Args:
        'mirgff3(str)': File with mirGFF3 format that will be converted
        'precursor(str)': Fasta format sequences of all miRNA hairpins
        'gtf(str)': Genome coordinates
        'vcffile': name of the file to be saved
    Returns:
        Nothing is returned, instead, a VCF file is generated
    """
    #Check if the input files exist:
    try:
        gff3_file = open(mirgff3, "r", encoding="utf-8") if six.PY3 else open(
            mirgff3, "r")
    except IOError:
        # NOTE(review): ``end=mirgff3`` appends the path with no newline —
        # presumably meant to print the failing path; confirm intent.
        print("Can't read the file", end=mirgff3)
        sys.exit()
    with gff3_file:
        data = gff3_file.read()
        if six.PY2:
            data = data.decode("utf-8-sig").encode("utf-8")
    gff3_data = data.split("\n")
    vcf_file = open(vcffile, "w")

    ver = "v4.3"  # Current VCF version formatting
    vcf_file.write("##fileformat=VCF%s\n" % ver)
    date = datetime.datetime.now().strftime("%Y%m%d")
    vcf_file.write("##fileDate=%s\n" % date)
    source = "\n".join(s for s in gff3_data if "## source-ontology: " in s)[20:]

    # Scan the leading "##" header lines for source and sample names.
    line = 0
    sample_names = []
    while gff3_data[line][:2] == "##":
        if gff3_data[line][:19] == "## source-ontology:":
            source = gff3_data[line][20:]
        elif gff3_data[line][:11] == "## COLDATA:":
            sample_names = gff3_data[line][12:].split(",")
        line += 1
    vcf_file.write("##source=%s\n" % source)
    vcf_file.write(
        '##INFO=<ID=NS,Type=Integer,Description="Number of samples"\n')
    vcf_file.write("##FILTER=<ID=REJECT,Description='"
                   'Filter not passed'
                   "'>\n")
    vcf_file.write(
        '##FORMAT=<ID=TRC,Number=1,Type=Integer,Description="Total read count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=TSC,Number=1,Type=Integer,Description="Total SNP count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=TMC,Number=1,Type=Integer,Description="Total miRNA count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n')
    header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    # Adds Header
    for s in range(len(sample_names)):
        header = header + "\t" + sample_names[s]
    vcf_file.write(header)

    all_dict = dict(
    )  # initializing an empty dictionary where all info will be added
    key_list = [
    ]  # Initializing a list which will contain all the keys of the dictionary
    mirna_dict = dict(
    )  # initializing an empty dictionary where mirna info will be put
    n_SNP = 0
    n_noSNP = 0
    no_var = 0
    hairpins = read_precursor(precursor)
    gff3 = read_gtf_to_precursor(gtf)
    gtf_dic = read_gtf_to_mirna(gtf)
    for line in range(0, len(gff3_data)):
        if not gff3_data[line]:
            continue
        # NOTE(review): this skips lines whose *second* character is "#"
        # (the "##" headers); a one-character line would raise IndexError —
        # confirm inputs cannot contain such lines.
        if gff3_data[line][1] == "#":
            continue
        else:
            # Parsing the gff3 mirna lecture:
            gff_fields = read_gff_line(gff3_data[line])
            gtf_name = gff_fields['attrb']['Name']
            gtf_parent = gff_fields['attrb']['Parent']
            if gtf_parent not in gff3:
                continue
            if gtf_name not in gff3[gtf_parent]:
                continue
            parent_ini_pos = gff3[gtf_parent][gtf_name][0]
            parent_end_pos = gff3[gtf_parent][gtf_name][1]
            ref_seq = (hairpins[gtf_parent][parent_ini_pos:parent_end_pos +
                                            1])
            vcf_chrom = gtf_dic[gtf_name][gtf_parent][0]
            vcf_pos = int(gff_fields['start']) + int(
                gtf_dic[gtf_name][gtf_parent][1])
            hairpin = hairpins[gtf_parent]
            variants = gff_fields['attrb']['Variant'].split(",")
            logger.debug("VCF::Variant::%s" % variants)
            # Obtaining the iso_3p, iso_add3p and iso_5p values:
            var3p = [s for s in variants if 'iso_3p' in s]
            if len(var3p):
                var3p = int(var3p[0][7:])  # Position of iso_3p value
            else:
                var3p = 0
            var_add3p = [s for s in variants if 'iso_add3p' in s]
            if len(var_add3p):
                var_add3p = int(
                    var_add3p[0][10:])  # Position of iso_add3p value
            else:
                var_add3p = 0
            var3p = var3p + var_add3p
            logger.debug("VCF::VAR_3p::%s" % var3p)
            var5p = [s for s in variants if 'iso_5p' in s]
            if len(var5p):
                var5p = int(var5p[0][7:])  # Position of iso_5p value
            else:
                var5p = 0
            # logger.debug("VCF::VAR_5p::%s" % var5p)
            cigar = gff_fields['attrb']["Cigar"]
            # Obtaining all the variants from the cigar:
            # NOTE(review): ``if 1:`` looks like a leftover debugging guard.
            if 1:
                (key_pos, key_var, vcf_ref, vcf_alt) = cigar_2_key(
                    cigar, gff_fields['attrb']['Read'], ref_seq, vcf_pos,
                    var5p, var3p, parent_ini_pos, parent_end_pos, hairpin)
            # Adding the variants to a dictionary and calculating all the fields of a vcf file format:
            if len(key_var) > 0:
                for s in range(len(key_var)):
                    key_dict = vcf_chrom + '-' + str(
                        key_pos[s]) + '-' + str(key_var[s])
                    raw_counts = gff_fields['attrb']['Expression']
                    raw_counts = [int(i) for i in raw_counts.split(',')]
                    nozero_counts = [
                        int(i > 0) for i in raw_counts
                    ]  # counts for every sample if expr != 0.
                    if gtf_name in mirna_dict:
                        # Adding expression values to same mirnas
                        mirna_dict[gtf_name]['Z'] = [
                            sum(x)
                            for x in zip(mirna_dict[gtf_name]['Z'],
                                         raw_counts)
                        ]
                    else:
                        mirna_dict[gtf_name] = {}
                        mirna_dict[gtf_name]["Z"] = raw_counts

                    if key_dict in all_dict:
                        if all_dict[key_dict]["Type"] in [
                                "A", "C", "T", "G"
                        ]:
                            all_dict[key_dict]['X'] = [
                                sum(x) for x in zip(
                                    all_dict[key_dict]['X'], nozero_counts)
                            ]
                            all_dict[key_dict]['Y'] = [
                                sum(x) for x in zip(
                                    all_dict[key_dict]['Y'], raw_counts)
                            ]
                    else:
                        # First time this variant key is seen: register it.
                        key_list.append(key_dict)
                        all_dict[key_dict] = {}
                        all_dict[key_dict]["Chrom"] = vcf_chrom
                        all_dict[key_dict]["Position"] = key_pos[s]
                        all_dict[key_dict]["mirna"] = gtf_name
                        all_dict[key_dict]["Type"] = key_var[s]
                        if key_var[s][0] in ["A", "C", "T", "G"]:
                            n_SNP += 1
                            all_dict[key_dict]["SNP"] = True
                            all_dict[key_dict]["ID"] = gff_fields['attrb'][
                                'Name'] + '-SNP' + str(n_SNP)
                            all_dict[key_dict]['X'] = nozero_counts
                            all_dict[key_dict]['Y'] = raw_counts
                        else:
                            n_noSNP += 1
                            all_dict[key_dict]["SNP"] = False
                            all_dict[key_dict]["ID"] = gff_fields['attrb'][
                                'Name'] + '-nonSNP' + str(n_noSNP)
                        all_dict[key_dict]["Ref"] = vcf_ref[s]
                        all_dict[key_dict]["Alt"] = vcf_alt[s]
                        all_dict[key_dict]["Qual"] = "."
                        all_dict[key_dict]["Filter"] = gff_fields['attrb'][
                            'Filter']
                        all_dict[key_dict]["Info"] = "NS=" + str(
                            len(sample_names))
            else:
                no_var += 1
    # Writing the VCF file:
    for s in key_list:
        variant_line = (
            "\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
            (all_dict[s]["Chrom"], all_dict[s]["Position"],
             all_dict[s]["ID"], all_dict[s]["Ref"], all_dict[s]["Alt"],
             all_dict[s]["Qual"], all_dict[s]["Filter"],
             all_dict[s]["Info"]))
        if all_dict[s]["Type"] in ["A", "T", "C", "G"]:
            format_col = "TRC:TSC:TMC:GT"
            variant_line = variant_line + "\t" + format_col
            samples = ""
            for n in range(len(sample_names)):
                X = all_dict[s]["X"][n]
                Y = all_dict[s]["Y"][n]
                Z = mirna_dict[all_dict[s]["mirna"]]["Z"][n]
                # Calculating the genotype:
                if Y == 0:
                    GT = "0|0"
                elif Z == Y:
                    GT = "1|1"
                else:
                    GT = "1|0"
                samples = samples + "\t" + str(X) + ":" + str(Y) + ":" + str(
                    Z) + ":" + GT
            variant_line = variant_line + samples
        else:
            format_col = ""
            variant_line = variant_line + format_col
        vcf_file.write(variant_line)
    vcf_file.close()
default=False) parser.add_option("-p", "--prefix", help="output name") parser.add_option("--seed", help="set up seed for reproducibility.", default=None) (options, args) = parser.parse_args() if options.seed: random.seed(options.seed) full_fq = "%s_full.fq" % options.prefix clean_fq = "%s_clean.fq" % options.prefix out_gff = "%s.gff" % options.prefix if os.path.exists(full_fq): os.remove(full_fq) if os.path.exists(clean_fq): os.remove(clean_fq) pre = fasta.read_precursor(options.fa, "") mir = mapper.read_gtf_to_precursor(options.gtf) nt = ['A', 'T', 'G', 'C'] gffs = dict() h = header.create(["sampleX"], "miRBase1", "") for precursor in pre: seq = pre[precursor] gffs.update(create_iso(precursor, mir, seq, options.numsim, options.exp)) _write(gffs, h, out_gff)