def _check_line(line, num, num_samples):
    """Validate one GFF line for the minimum mirGFF3 requirements.

    Args:
        *line (str)*: raw GFF line.
        *num*: line number, used only for error messages.
        *num_samples (int)*: expected number of Expression values.

    Logs an error for every problem found; returns nothing.
    """
    fields = read_gff_line(line)
    # Check seqID
    if not fields['chrom']:
        logger.error('MISSING seqID in line %s' % (num))
    # Check source: case-insensitive substring match against the
    # known databases.
    source = fields['source'].lower()
    valid_sources = ["mirBase", "mirgeneDB"]
    if not any(s.lower() in source for s in valid_sources):
        logger.error('INCORRECT SOURCE in line %s' % (num))
    # Check type: only the two mirGFF3 feature types are allowed.
    # (Renamed local away from the `type` builtin.)
    feature_type = fields['type']
    if feature_type not in ["ref_miRNA", "isomiR"]:
        logger.error('INCORRECT TYPE in line %s' % (num))
    # Check start/end
    if not fields['start']:
        logger.error('MISSING START value in line %s' % (num))
    if not fields['end']:
        logger.error('MISSING END value in line %s' % (num))
    # Check strand
    if str(fields['strand']) not in ["+", "-"]:
        logger.error('INCORRECT STRAND in line %s' % (num))
    # Check attribute-variant against the vocabulary of the current
    # GFF specification version.
    variant = fields['attrb']['Variant'].lower()
    valid_variants = version.GFFv[version.current]
    if not any(s.lower() in variant for s in valid_variants):
        logger.error('INCORRECT VARIANT type in line %s' % (num))
    # Check attribute-expression: one non-empty value per sample.
    # FIX: materialize the filtered values into a list -- under
    # Python 3 `filter()` returns a lazy iterator and len() on it
    # raises TypeError, so this check always crashed.
    expression = [e for e in
                  fields['attrb']['Expression'].strip().split(",") if e]
    if len(expression) != num_samples:
        # FIX: the original message contained a broken backslash
        # line-continuation that embedded stray whitespace.
        logger.error('INCORRECT number of EXPRESSION VALUES in line %s'
                     % (num))
def read_reference(fn):
    """Load a GFF file into a UID -> [simplified Variant, attributes] map.

    Args:
        *fn (str)*: GFF file.

    Returns:
        *srna (dict)*: dict with >>> {'UID': 'iso_snp:-2,...'}
    """
    srna = {}
    with open(fn) as in_handle:
        for raw in in_handle:
            # Header/comment lines carry no records.
            if raw.startswith("#"):
                continue
            attr = read_gff_line(raw)['attrb']
            srna[attr['UID']] = [_simplify(attr['Variant']), attr]
    return srna
def _compare_to_reference(fn, reference):
    """Compare the GFF file *fn* against a reference UID map.

    *reference* is the dict produced by read_reference():
    UID -> [simplified Variant, full attribute dict].

    Returns a list of [UID, code, mirna, accuracy] rows where code is
    "D" (detected), "E" (extra, not in reference) or "M" (missed,
    only in reference). Summary counts are logged.
    """
    same = 0
    diff = list()
    extra = list()
    miss = list()
    results = list()
    seen = 0
    seen_reference = set()
    with open(fn) as inh:
        for line in inh:
            # Skip header/comment lines.
            if line.startswith("#"):
                continue
            cols = read_gff_line(line)
            attr = cols['attrb']
            if attr['UID'] in reference:
                # "Y" when the annotated name agrees with the reference,
                # otherwise report the (divergent) name itself.
                mirna = "Y" if attr['Name'] == reference[
                    attr['UID']][1]['Name'] else attr['Name']
                accuracy = _accuracy(_simplify(attr['Variant']),
                                     reference[attr['UID']][0])
                results.append([attr['UID'], "D", mirna, accuracy])
                if _simplify(attr['Variant']) == reference[attr['UID']][0]:
                    same += 1
                else:
                    # Keep the raw line next to the reference attributes
                    # for later inspection.
                    diff.append("%s | reference: %s" % (
                        line.strip(), reference[attr['UID']][1]))
                seen += 1
                seen_reference.add(attr['UID'])
            else:
                # UID absent from the reference: an "extra" sequence.
                extra.append("%s | extra" % line.strip())
                results.append([
                    attr['UID'], "E", attr['Name'],
                    _accuracy(_simplify(attr['Variant']), "")
                ])
    # Reference UIDs never seen in the file are "missed".
    for uid in reference:
        if uid not in seen_reference:
            results.append([uid, "M", "N", _accuracy("", reference[uid][0])])
            miss.append("| miss %s" % reference[uid][1])
    logger.info("Number of sequences found in reference: %s" % seen)
    logger.info("Number of sequences matches reference: %s" % same)
    logger.info("Number of sequences different than reference: %s" % len(diff))
    logger.info("Number of sequences extra sequences: %s" % len(extra))
    logger.info("Number of sequences missed sequences: %s" % len(miss))
    return results
def _calc_stats(fn):
    """Read a GFF file and summarize passing records into categories.

    Duplicate (UID, Variant, Name) combinations are counted once;
    records whose Filter attribute is not "Pass" are ignored.
    """
    samples = _get_samples(fn)
    classified = []
    done = set()
    with open(fn) as handle:
        for record in handle:
            if record.startswith("#"):
                continue
            cols = read_gff_line(record)
            logger.debug("## STATS: attribute %s" % cols['attrb'])
            attr = cols['attrb']
            if attr['Filter'] != "Pass":
                continue
            # De-duplicate on the UID/Variant/Name triple.
            key = "-".join([attr['UID'], attr['Variant'], attr['Name']])
            if key in done:
                continue
            done.add(key)
            classified.extend(_classify(cols['type'], attr, samples))
    return _summary(classified)
def _compare_to_reference(fn, reference):
    """Compare GFF file *fn* against a UID -> [Variant, attrs] reference.

    Returns rows of [UID, code, mirna, accuracy] where code is "D"
    (detected), "E" (extra) or "M" (missed); logs summary counts.
    """
    n_same = 0
    diff_lines = []
    extra_lines = []
    missed_lines = []
    results = []
    n_seen = 0
    matched_uids = set()
    with open(fn) as handle:
        for gff_line in handle:
            if gff_line.startswith("#"):
                continue
            attr = read_gff_line(gff_line)['attrb']
            uid = attr['UID']
            if uid not in reference:
                # Sequence not present in the reference at all.
                extra_lines.append("%s | extra" % gff_line.strip())
                results.append([uid, "E", attr['Name'],
                                _accuracy(_simplify(attr['Variant']), "")])
                continue
            ref_variant, ref_attr = reference[uid]
            simplified = _simplify(attr['Variant'])
            mirna = "Y" if attr['Name'] == ref_attr['Name'] else attr['Name']
            results.append([uid, "D", mirna,
                            _accuracy(simplified, ref_variant)])
            if simplified == ref_variant:
                n_same += 1
            else:
                diff_lines.append("%s | reference: %s" % (gff_line.strip(),
                                                          ref_attr))
            n_seen += 1
            matched_uids.add(uid)
    # Anything in the reference that never showed up was missed.
    for uid in reference:
        if uid in matched_uids:
            continue
        results.append([uid, "M", "N", _accuracy("", reference[uid][0])])
        missed_lines.append("| miss %s" % reference[uid][1])
    logger.info("Number of sequences found in reference: %s" % n_seen)
    logger.info("Number of sequences matches reference: %s" % n_same)
    logger.info("Number of sequences different than reference: %s"
                % len(diff_lines))
    logger.info("Number of sequences extra sequences: %s" % len(extra_lines))
    logger.info("Number of sequences missed sequences: %s"
                % len(missed_lines))
    return results
def _fix(line, expression):
    """Re-emit a GFF line with its Expression attribute replaced.

    A single attribute value is not useful when one line aggregates
    several samples, so *expression* overwrites the parsed value
    before the line is pasted back together in its original format.
    """
    parsed = read_gff_line(line)
    parsed['attrb']['Expression'] = expression
    return paste_columns(parsed, guess_format(line))
def test_read_line(self):
    """Read GFF/GTF line"""
    from mirtop.gff.body import read_gff_line
    # Smoke-parse every line of the two-sample example file.
    with open("data/examples/gff/2samples.gff") as handle:
        for gff_line in handle:
            print(read_gff_line(gff_line))
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header, 'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    out_file = op.join(args.out, "expression_counts.tsv")
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # FIX: open the GFF inside a context manager so the handle is
    # always closed -- the original bare open()/close() pair leaked
    # the file descriptor whenever an exception interrupted the loop.
    with open(args.gff, 'r') as gff_file, open(out_file, 'w') as outh:
        # Scan the header until the "## COLDATA:" line to pick up the
        # sample names, then emit the TSV header once.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break
        # The remaining lines of the handle are the miRNA records.
        for mirna_line in gff_file:
            mirna_values = read_gff_line(mirna_line)
            Read = mirna_values["attrb"]["Read"]
            UID = mirna_values["attrb"]["UID"]
            mirna = mirna_values["attrb"]["Name"]
            parent = mirna_values["attrb"]["Parent"]
            variant = mirna_values["attrb"]["Variant"]
            # Skip records whose UID cannot be decoded.
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue
            expression = sep.join(
                mirna_values["attrb"]["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants]
                                         + _expand(extra, True))
            summary = sep.join([UID, Read, mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
def read_file(fn, args):
    """ Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.
        *database(str)*: database name.
        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: gff_list has the format as defined in
            *mirtop.gff.body.read()*.
    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            # isomiR-SEA reports RNA; convert to DNA alphabet.
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            # FIX: identity comparison instead of `== None` (PEP 8).
            # NOTE(review): query_sequence is always a str here, so
            # this guard never fires -- possibly `not query_sequence`
            # was intended; kept behavior as-is.
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         " precursor {chrom}\n"
                         " name: {query_name}\n"
                         " idu: {idu}\n"
                         " start: {start}\n"
                         " cigar: {cigar}\n"
                         " iso: {isoformat}\n"
                         " variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            # Translate genomic coordinates to precursor-transcript
            # coordinates when a mapping exists.
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                     " Parent {preName}; Variant {isoformat};"
                     " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                     " Filter {Filter}; Hits {hit};").format(**locals())
            line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                    "{score}\t{strand}\t.\t{attrb}").format(**locals())
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)
            line = paste_columns(read_gff_line(line), sep=sep)
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
            reads[chrom][start].append([idu, chrom, counts, sample, line])
    logger.info("Hits: %s" % reads_in)
    return reads
def read_file(folder, args):
    """ Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with sRNAbench output information.
        *database(str)*: database name.
        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: gff_list has the format as defined in
            *mirtop.gff.body.read()*.
    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures
    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            # Only "mature" annotations are converted.
            # NOTE(review): this increments n_in, the same counter used
            # for accepted reads below -- confirm that is intended.
            if cols[3].find("mature") == -1:
                n_in += 1
                continue
            counts = int(cols[1])
            hit = len(set([mirna.split("#")[1]
                           for mirna in cols[4].split("$")]))
            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                # FIX: skip hits whose precursor is unknown. The
                # original only counted them and fell through, raising
                # a KeyError on matures[chrom] / precursors[chrom].
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue
                seen.add((query_sequence, mirName))
                if (query_sequence, mirName) not in source_iso:
                    continue
                isoformat = source_iso[(query_sequence, mirName)]
                # "mv" (multiple variants) is not expressible in GFF.
                if isoformat == "mv":
                    n_notassign += 1
                    continue
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " external: {isoformat}\n"
                             " hit: {hit}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                # Discard hits running past the precursor sequence.
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue
                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(**locals())
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)
                line = paste_columns(read_gff_line(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])
    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)
    return reads
def create_vcf(mirgff3, precursor, gtf, vcffile):
    """Convert a mirGFF3 file into a VCF v4.3 file.

    Args:
        'mirgff3(str)': File with mirGFF3 format that will be converted
        'precursor(str)': Fasta format sequences of all miRNA hairpins
        'gtf(str)': Genome coordinates
        'vcffile': name of the file to be saved
    Returns:
        Nothing is returned, instead, a VCF file is generated
    """
    # Check if the input file can be opened (utf-8 on Python 3):
    try:
        gff3_file = open(mirgff3, "r", encoding="utf-8") if six.PY3 else open(
            mirgff3, "r")
    except IOError:
        print("Can't read the file", end=mirgff3)
        sys.exit()
    with gff3_file:
        data = gff3_file.read()
        if six.PY2:
            # Strip a possible BOM on Python 2.
            data = data.decode("utf-8-sig").encode("utf-8")
    gff3_data = data.split("\n")
    vcf_file = open(vcffile, "w")

    # --- VCF meta-information header ---
    ver = "v4.3"  # Current VCF version formatting
    vcf_file.write("##fileformat=VCF%s\n" % ver)
    date = datetime.datetime.now().strftime("%Y%m%d")
    vcf_file.write("##fileDate=%s\n" % date)
    # Fallback source taken from any source-ontology line; overwritten
    # below while walking the "##" header block.
    source = "\n".join(
        s for s in gff3_data if "## source-ontology: " in s)[20:]
    line = 0
    sample_names = []
    # Walk the leading "##" header lines to pick up source and samples.
    while gff3_data[line][:2] == "##":
        if gff3_data[line][:19] == "## source-ontology:":
            source = gff3_data[line][20:]
        elif gff3_data[line][:11] == "## COLDATA:":
            sample_names = gff3_data[line][12:].split(",")
        line += 1
    vcf_file.write("##source=%s\n" % source)
    # NOTE(review): this INFO meta line is missing its closing '>' --
    # likely malformed VCF; left untouched here.
    vcf_file.write(
        '##INFO=<ID=NS,Type=Integer,Description="Number of samples"\n')
    vcf_file.write("##FILTER=<ID=REJECT,Description='"
                   'Filter not passed'
                   "'>\n")
    vcf_file.write(
        '##FORMAT=<ID=TRC,Number=1,Type=Integer,Description="Total read count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=TSC,Number=1,Type=Integer,Description="Total SNP count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=TMC,Number=1,Type=Integer,Description="Total miRNA count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n')
    header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    # Adds Header
    for s in range(len(sample_names)):
        header = header + "\t" + sample_names[s]
    vcf_file.write(header)

    all_dict = dict(
    )  # initializing an empty dictionary where all info will be added
    key_list = [
    ]  # Initializing a list which will contain all the keys of the dictionary
    mirna_dict = dict(
    )  # initializing an empty dictionary where mirna info will be put
    n_SNP = 0
    n_noSNP = 0
    no_var = 0
    hairpins = read_precursor(precursor)
    gff3 = read_gtf_to_precursor(gtf)
    gtf_dic = read_gtf_to_mirna(gtf)
    for line in range(0, len(gff3_data)):
        if not gff3_data[line]:
            continue
        # NOTE(review): this only skips lines whose SECOND character is
        # '#' (i.e. "##..."); a single-'#' comment or a 1-char line would
        # fall through / raise IndexError. Confirm input always complies.
        if gff3_data[line][1] == "#":
            continue
        else:
            # Parsing the gff3 mirna lecture:
            gff_fields = read_gff_line(gff3_data[line])
            gtf_name = gff_fields['attrb']['Name']
            gtf_parent = gff_fields['attrb']['Parent']
            # Skip records whose precursor/mature pair is not in the GTF.
            if gtf_parent not in gff3:
                continue
            if gtf_name not in gff3[gtf_parent]:
                continue
            parent_ini_pos = gff3[gtf_parent][gtf_name][0]
            parent_end_pos = gff3[gtf_parent][gtf_name][1]
            # Mature reference sequence sliced out of the hairpin.
            ref_seq = (hairpins[gtf_parent][parent_ini_pos:parent_end_pos +
                                            1])
            vcf_chrom = gtf_dic[gtf_name][gtf_parent][0]
            # Genomic position = start within precursor + precursor offset.
            vcf_pos = int(gff_fields['start']) + int(
                gtf_dic[gtf_name][gtf_parent][1])
            hairpin = hairpins[gtf_parent]
            variants = gff_fields['attrb']['Variant'].split(",")
            logger.debug("VCF::Variant::%s" % variants)
            # Obtaining the iso_3p, iso_add3p and iso_5p values:
            var3p = [s for s in variants if 'iso_3p' in s]
            if len(var3p):
                var3p = int(var3p[0][7:])  # Position of iso_3p value
            else:
                var3p = 0
            var_add3p = [s for s in variants if 'iso_add3p' in s]
            if len(var_add3p):
                var_add3p = int(
                    var_add3p[0][10:])  # Position of iso_add3p value
            else:
                var_add3p = 0
            var3p = var3p + var_add3p
            logger.debug("VCF::VAR_3p::%s" % var3p)
            var5p = [s for s in variants if 'iso_5p' in s]
            if len(var5p):
                var5p = int(var5p[0][7:])  # Position of iso_5p value
            else:
                var5p = 0
            # logger.debug("VCF::VAR_5p::%s" % var5p)
            cigar = gff_fields['attrb']["Cigar"]
            # Obtaining all the variants from the cigar:
            # NOTE(review): `if 1:` looks like a leftover from a removed
            # condition; kept verbatim.
            if 1:
                (key_pos, key_var, vcf_ref, vcf_alt) = cigar_2_key(
                    cigar, gff_fields['attrb']['Read'], ref_seq, vcf_pos,
                    var5p, var3p, parent_ini_pos, parent_end_pos, hairpin)

                # Adding the variants to a dictionary and calculating all
                # the fields of a vcf file format:
                if len(key_var) > 0:
                    for s in range(len(key_var)):
                        # Variant key: chrom-position-variant string.
                        key_dict = vcf_chrom + '-' + str(
                            key_pos[s]) + '-' + str(key_var[s])
                        raw_counts = gff_fields['attrb']['Expression']
                        raw_counts = [int(i) for i in raw_counts.split(',')]
                        nozero_counts = [
                            int(i > 0) for i in raw_counts
                        ]  # counts for every sample if expr != 0.
                        if gtf_name in mirna_dict:
                            # Adding expression values to same mirnas
                            mirna_dict[gtf_name]['Z'] = [
                                sum(x)
                                for x in zip(mirna_dict[gtf_name]['Z'],
                                             raw_counts)
                            ]
                        else:
                            mirna_dict[gtf_name] = {}
                            mirna_dict[gtf_name]["Z"] = raw_counts
                        if key_dict in all_dict:
                            # Variant already registered: accumulate
                            # per-sample counts (SNP types only).
                            if all_dict[key_dict]["Type"] in [
                                    "A", "C", "T", "G"
                            ]:
                                all_dict[key_dict]['X'] = [
                                    sum(x) for x in zip(
                                        all_dict[key_dict]['X'],
                                        nozero_counts)
                                ]
                                all_dict[key_dict]['Y'] = [
                                    sum(x) for x in zip(
                                        all_dict[key_dict]['Y'], raw_counts)
                                ]
                        else:
                            # First time this variant is seen.
                            key_list.append(key_dict)
                            all_dict[key_dict] = {}
                            all_dict[key_dict]["Chrom"] = vcf_chrom
                            all_dict[key_dict]["Position"] = key_pos[s]
                            all_dict[key_dict]["mirna"] = gtf_name
                            all_dict[key_dict]["Type"] = key_var[s]
                            if key_var[s][0] in ["A", "C", "T", "G"]:
                                n_SNP += 1
                                all_dict[key_dict]["SNP"] = True
                                all_dict[key_dict]["ID"] = gff_fields[
                                    'attrb']['Name'] + '-SNP' + str(n_SNP)
                                all_dict[key_dict]['X'] = nozero_counts
                                all_dict[key_dict]['Y'] = raw_counts
                            else:
                                n_noSNP += 1
                                all_dict[key_dict]["SNP"] = False
                                all_dict[key_dict]["ID"] = gff_fields[
                                    'attrb']['Name'] + '-nonSNP' + str(
                                        n_noSNP)
                            all_dict[key_dict]["Ref"] = vcf_ref[s]
                            all_dict[key_dict]["Alt"] = vcf_alt[s]
                            all_dict[key_dict]["Qual"] = "."
                            all_dict[key_dict]["Filter"] = gff_fields[
                                'attrb']['Filter']
                            all_dict[key_dict]["Info"] = "NS=" + str(
                                len(sample_names))
                else:
                    no_var += 1
    # Writing the VCF file:
    for s in key_list:
        variant_line = (
            "\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
            (all_dict[s]["Chrom"], all_dict[s]["Position"],
             all_dict[s]["ID"], all_dict[s]["Ref"], all_dict[s]["Alt"],
             all_dict[s]["Qual"], all_dict[s]["Filter"],
             all_dict[s]["Info"]))
        if all_dict[s]["Type"] in ["A", "T", "C", "G"]:
            # SNP variants get per-sample TRC:TSC:TMC:GT columns.
            format_col = "TRC:TSC:TMC:GT"
            variant_line = variant_line + "\t" + format_col
            samples = ""
            for n in range(len(sample_names)):
                X = all_dict[s]["X"][n]
                Y = all_dict[s]["Y"][n]
                Z = mirna_dict[all_dict[s]["mirna"]]["Z"][n]
                # Calculating the genotype:
                if Y == 0:
                    GT = "0|0"
                elif Z == Y:
                    GT = "1|1"
                else:
                    GT = "1|0"
                samples = samples + "\t" + str(X) + ":" + str(Y) + ":" + str(
                    Z) + ":" + GT
            variant_line = variant_line + samples
        else:
            format_col = ""
            variant_line = variant_line + format_col
        vcf_file.write(variant_line)
    vcf_file.close()
def read_file(folder, args):
    """ Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with sRNAbench output information.
        *database(str)*: database name.
        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: gff_list has the format as defined in
            *mirtop.gff.body.read()*.
    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures
    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            # Only "mature" annotations are converted.
            # NOTE(review): this increments n_in, the same counter used
            # for accepted reads below -- confirm that is intended.
            if cols[3].find("mature") == -1:
                n_in += 1
                continue
            counts = int(cols[1])
            hit = len(
                set([mirna.split("#")[1] for mirna in cols[4].split("$")]))
            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                # FIX: skip hits whose precursor is unknown. The
                # original only counted them and fell through, raising
                # a KeyError on matures[chrom] / precursors[chrom].
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue
                seen.add((query_sequence, mirName))
                if (query_sequence, mirName) not in source_iso:
                    continue
                isoformat = source_iso[(query_sequence, mirName)]
                # "mv" (multiple variants) is not expressible in GFF.
                if isoformat == "mv":
                    n_notassign += 1
                    continue
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " external: {isoformat}\n"
                             " hit: {hit}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                # Discard hits running past the precursor sequence.
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue
                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(**locals())
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)
                line = paste_columns(read_gff_line(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                reads[chrom][start].append(
                    [idu, chrom, counts, sample, line])
    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)
    return reads