def find_closest(peak, starts_plus, stops_minus, cds_dict, ann_dict,
                 d_threshold, inside, if_bed):
    """Annotate a peak with the closest gene and return it as a GFF interval.

    Args:
        peak: interval-like object. ``peak.name`` is parsed as the integer
            reference position; ``chrom``, ``start`` and ``stop`` are copied
            to the result. When ``if_bed`` is false, ``peak.attrs`` must also
            provide ``Name``, ``maxcov``, ``zscores`` and ``peakpos``.
        starts_plus: iterable of ``(gene_name, position)`` pairs scored as
            ``position - center`` and reported on the '+' strand.
        stops_minus: iterable of ``(gene_name, position)`` pairs scored as
            ``center - position + 1`` and reported on the '-' strand.
        cds_dict: maps the full gene name to an object with ``start``/``end``.
        ann_dict: maps the short gene name (second ``-``-separated field of
            the full name) to ``(genesymbol, annotation, function)``.
        d_threshold: maximum absolute distance for a gene to be assigned.
        inside: candidates with a signed distance ``<= -inside`` are dropped.
        if_bed: selects which set of peak attributes is copied to the output.

    Returns:
        The value of ``construct_gff_interval(...)`` for the annotated peak,
        or ``None`` when no candidate survives the ``inside`` filter or the
        closest one is farther than ``d_threshold``.
    """
    center = int(peak.name)
    candidates = [('+', x[0], x[1] - center) for x in starts_plus]
    candidates.extend(('-', x[0], center - x[1] + 1) for x in stops_minus)
    candidates = [x for x in candidates if x[2] > -1 * inside]
    # min() raises ValueError on an empty sequence; "no candidate" is reported
    # the same way as "closest candidate is too far away".
    if not candidates:
        return None

    strand, gene_name, mindistance = min(candidates, key=lambda x: abs(x[2]))
    if abs(mindistance) > d_threshold:
        return None

    # strand is always '+' or '-' because of how the candidates are built.
    cds = cds_dict[gene_name]
    if strand == '+':
        atg = cds.start - center
    else:
        atg = center - cds.end + 1

    gene_name = gene_name.split("-")[1]
    ann = ann_dict.get(gene_name)
    if ann:
        genesymbol, annotation, function = ann[0], ann[1], ann[2]
    else:
        genesymbol, annotation, function = gene_name, 'Unknown', "Unknown"

    if if_bed:
        attrs = [("Name", peak.name)]
    else:
        attrs = [("Name", peak.attrs['Name']),
                 ("maxcov", peak.attrs['maxcov']),
                 ("zscores", peak.attrs['zscores']),
                 ("peakpos", peak.attrs['peakpos'])]
    attrs += [("annotation", annotation),
              ("function", function),
              ("gene", gene_name),
              ("genesymbol", genesymbol),
              ("tss", mindistance),
              ("atg", abs(atg))]

    return construct_gff_interval(peak.chrom, peak.start, peak.stop,
                                  'consensus_region', score='0',
                                  strand=strand, source='un', frame='.',
                                  attrs=attrs)
def print_compiled(compiled, size):
    """Merge peaks into one consensus record and write it to stdout.

    Args:
        compiled: sequence of ``(index, peak)`` pairs. It is traversed twice,
            so it must not be a one-shot iterator. Each peak needs ``name``
            (integer-like position), ``chrom``, ``start``, ``stop`` and
            ``attrs`` with ``topcoverage`` and ``area_coverage``.
        size: number of index slots; indices ``0 .. size-1`` are reported,
            with ``'0'`` for slots that have no peak.

    The consensus position is the ``topcoverage``-weighted mean of the peak
    positions, truncated to int. The consensus spans from the smallest start
    to the largest stop of the contributing peaks.
    """
    by_index = dict(compiled)
    slots = [by_index.get(i) for i in range(size)]
    area_coverage = ",".join(
        [x.attrs['area_coverage'] if x else '0' for x in slots])
    topcoverage = ",".join(
        [x.attrs['topcoverage'] if x else '0' for x in slots])

    present = [x for x in slots if x]
    ordered = [pair[1] for pair in sorted(compiled, key=lambda pair: pair[0])]

    weights = [float(x.attrs['topcoverage']) for x in present]
    weighted_positions = [int(x.name) * w for x, w in zip(present, weights)]
    pos = int(sum(weighted_positions) / sum(weights))

    start = min(x.start for x in present)
    # Fixed: this was min(), which cut the consensus at the end of the
    # shortest peak instead of covering every contributing peak.
    stop = max(x.stop for x in present)

    consensus = construct_gff_interval(ordered[0].chrom, start, stop,
                                       'consensus', score='0', strand='.',
                                       source='.', frame='.',
                                       attrs=[('Name', pos),
                                              ('topcoverage', topcoverage),
                                              ('area_coverage', area_coverage)])
    sys.stdout.write(str(consensus))
def annotate_position(peak, tr_dict, maxd, inside):
    """Annotate a peak with the closest transcript taken from ``tr_dict``.

    For a '+' peak the candidates are the second list of the chromosome's
    entry (scored ``center - tr.stop + 1``); for any other strand they are
    the first list (scored ``tr.start - center``).

    Args:
        peak: interval-like object; ``peak.name`` is parsed as the integer
            reference position. ``peak.attrs`` is updated in place when an
            annotation is produced.
        tr_dict: maps chromosome name to a ``(tr_plus, tr_minus)`` pair.
        maxd: maximum absolute distance for a transcript to be assigned.
        inside: candidates with a signed distance ``<= -inside`` are dropped.

    Returns:
        The value of ``construct_gff_interval(...)`` carrying the merged
        attributes, or ``None`` when the chromosome is unknown, no candidate
        is left, or the closest one is farther than ``maxd``.
    """
    if peak.chrom not in tr_dict:
        return None

    summit = int(peak.name)
    plus_transcripts, minus_transcripts = tr_dict[peak.chrom]
    if peak.strand == '+':
        scored = [(tr, summit - tr.stop + 1) for tr in minus_transcripts]
    else:
        scored = [(tr, tr.start - summit) for tr in plus_transcripts]

    scored = [pair for pair in scored if pair[1] > -1 * inside]
    if not scored:
        return None

    transcript, mindistance = min(scored, key=lambda pair: abs(pair[1]))
    if abs(mindistance) > maxd:
        return None

    atg = mindistance + float(transcript.attrs['distance'])
    merged = peak.attrs
    additions = (
        ("annotation", transcript.attrs['annotation']),
        ("function", transcript.attrs['function']),
        ("gene", transcript.name),
        ("genesymbol", transcript.attrs['genesymbol']),
        ("cg", transcript.attrs.get('cg', 'unknown')),
        ("tss", mindistance),
        ("atg", atg),
        ("gtype", 'upstream'),
        ("anti", "1"),
    )
    # Item assignment (not update()) so a custom attrs mapping sees each key.
    for key, value in additions:
        merged[key] = value

    return construct_gff_interval(peak.chrom, peak.start, peak.stop,
                                  'annotated', score=peak.score,
                                  strand=transcript.strand,
                                  source='annotate.py', frame='.',
                                  attrs=merged.items())
def annotate_position(peak, tr_dict, maxd, inside):
    """Classify a peak as upstream / gene / intergenic and annotate it.

    Relies on two module-level names that are not parameters: ``STUB_TR``
    (placeholder transcript used when nothing can be assigned) and ``if_bed``
    (selects the return form).

    Args:
        peak: interval-like object; ``peak.name`` is parsed as the integer
            reference position.
        tr_dict: maps chromosome name to a ``(tr_plus, tr_minus)`` pair of
            lists (they are concatenated with ``+``).
        maxd: maximum absolute distance for the 'upstream' classification.
        inside: candidates with a signed distance ``<= -inside`` are dropped.

    Returns:
        When ``if_bed`` is true, the value of ``construct_gff_interval(...)``;
        otherwise ``peak`` itself with the annotation written into
        ``peak.attrs`` as strings.
    """
    center = int(peak.name)
    if peak.chrom not in tr_dict:
        mindistance = float("NaN")
        gtype = "intergenic"
        transcript = STUB_TR
    else:
        tr_plus, tr_minus = tr_dict[peak.chrom]
        distances = [(tr, tr.start - center) for tr in tr_plus]
        distances.extend((tr, center - tr.stop + 1) for tr in tr_minus)
        distances = [x for x in distances if x[1] > -1 * inside]
        if distances:
            transcript, mindistance = min(distances, key=lambda x: abs(x[1]))
        else:
            # Every candidate was filtered out; min() would raise ValueError.
            # Fall back to the placeholder at infinite distance so the
            # gene / intergenic classification below still runs.
            transcript, mindistance = STUB_TR, float("inf")

        if abs(mindistance) <= maxd:
            gtype = 'upstream'
        else:
            # Transcripts that contain the position (both offsets >= 0).
            pairs = [(tr, center - tr.start, tr.stop - center - 1)
                     for tr in tr_plus + tr_minus]
            pairs = [x for x in pairs if x[1] >= 0 and x[2] >= 0]
            if pairs:
                gtype = 'gene'
                transcript = pairs[0][0]
                if transcript.strand == '+':
                    mindistance = -1 * pairs[0][1]
                else:
                    mindistance = -1 * pairs[0][2]
            else:
                gtype = 'intergenic'

    atg = mindistance + float(transcript.attrs['distance'])
    if str(atg) != "nan":
        atg = "%d" % atg

    attrs = [("Name", peak.name),
             ("annotation", transcript.attrs['annotation']),
             ("function", transcript.attrs['function']),
             ("gene", transcript.name),
             ("genesymbol", transcript.attrs['genesymbol']),
             # .get() so a transcript without a 'cg' attribute does not raise.
             ("cg", transcript.attrs.get('cg', 'unknown')),
             ("tss", mindistance),
             ("atg", atg),
             ("gtype", gtype)]

    if if_bed:
        return construct_gff_interval(peak.chrom, peak.start, peak.stop,
                                      'annotated', score=peak.score,
                                      strand=transcript.strand,
                                      source='annotate.py', frame='.',
                                      attrs=attrs)

    for attr_name, attr_value in attrs:
        peak.attrs[attr_name] = str(attr_value)
    return peak
def merge(intervals, name):
    """Collapse several intervals into one 'interaction' interval.

    Args:
        intervals: non-empty sequence of interval-like objects with ``chrom``,
            ``start``, ``stop``, ``score``, ``strand`` and ``attrs`` holding
            ``gap`` and ``chscore`` (``n_uniq`` is optional, default 1).
        name: value stored as the ``ID`` attribute of the result.

    Returns:
        The value of ``construct_gff_interval(...)`` spanning min start to
        max stop. Chromosome and strand come from the first interval, score
        is the numeric maximum, ``gap`` is the value closest to zero,
        ``chscore`` is the numerically largest value and ``n_uniq`` is the sum.
    """
    first = intervals[0]
    start = min(int(x.start) for x in intervals)
    stop = max(int(x.stop) for x in intervals)
    score = str(max(float(x.score) for x in intervals))
    gap = str(min((int(x.attrs['gap']) for x in intervals), key=abs))
    unique_reads = sum(int(x.attrs.get('n_uniq', 1)) for x in intervals)
    # Compare numerically: a plain max() over string attributes is
    # lexicographic ("9.5" > "10.2"). key=float keeps the winning value's
    # original text, so the output formatting is unchanged.
    chscore = str(max((x.attrs['chscore'] for x in intervals), key=float))

    return construct_gff_interval(first.chrom, start, stop, 'interaction',
                                  score=score, strand=first.strand,
                                  source='chiflex', frame='.',
                                  attrs=[("ID", name),
                                         ('gap', gap),
                                         ('chscore', chscore),
                                         ('n_uniq', unique_reads)])
def find_closest(peak, starts_plus, stops_minus, pam_plus, pam_minus,
                 d_threshold, inside):
    """Annotate a peak with its closest gene and the surrounding PAM offsets.

    Args:
        peak: interval-like object; ``peak.name`` is parsed as the integer
            reference position; ``chrom``, ``start``, ``stop`` and ``score``
            are copied to the result.
        starts_plus: iterable of ``(gene_name, position)`` pairs scored as
            ``position - center`` and reported on the '+' strand.
        stops_minus: iterable of ``(gene_name, position)`` pairs scored as
            ``center - position + 1`` and reported on the '-' strand.
        pam_plus, pam_minus: passed through to ``get_pams_around_peak``.
        d_threshold: maximum absolute distance for a gene to be assigned.
        inside: candidates with a signed distance ``<= -inside`` are dropped.

    Returns:
        The value of ``construct_gff_interval(...)`` for the annotated peak,
        or ``False`` when no candidate survives the ``inside`` filter or the
        closest one is farther than ``d_threshold``.
    """
    center = int(peak.name)
    candidates = [('+', x[0], x[1] - center) for x in starts_plus]
    candidates.extend(('-', x[0], center - x[1] + 1) for x in stops_minus)
    candidates = [x for x in candidates if x[2] > -1 * inside]
    # min() raises ValueError on an empty sequence; report "no hit" with the
    # same False value used for the "too far" case.
    if not candidates:
        return False

    strand, gene_name, mindistance = min(candidates, key=lambda x: abs(x[2]))
    if abs(mindistance) > d_threshold:
        return False

    pam_sense, pam_antisense = get_pams_around_peak(center, strand,
                                                    pam_plus, pam_minus)
    pam_min = min(abs(pam_sense), abs(pam_antisense))
    return construct_gff_interval(peak.chrom, peak.start, peak.stop,
                                  'consensus_region', score=str(peak.score),
                                  strand=strand, source='un', frame='.',
                                  attrs=[("Name", peak.name),
                                         ("tss", mindistance),
                                         ("gene", gene_name),
                                         ('pam_sense', pam_sense),
                                         ('pam_antisense', pam_antisense),
                                         ('pam_min', pam_min)])
def area2interval(area, pflank, clear):
    """Turn a run of per-position records into a single 'long_at' interval.

    Args:
        area: non-empty sequence of indexable records. Field 0 is used as the
            position, field 1 as the peak label, field 2 as a signed distance,
            field 3 as the score and field 4 as the chromosome.
        pflank: mean absolute distance below this is typed 'in'.
        clear: mean absolute distance below this (but not below ``pflank``)
            is typed 'unk'; anything else is 'out'.

    Returns:
        The value of ``construct_gff_interval(...)`` spanning the first
        position to one past the last position.
    """
    head, tail = area[0], area[-1]
    start = head[0]
    stop = tail[0] + 1

    best_score = str(max(record[3] for record in area))
    distance = int(abs(np.mean([record[2] for record in area])))

    _type = 'out'
    if distance < pflank:
        _type = 'in'
    elif distance < clear:
        _type = 'unk'

    midpoint = str((stop + start) // 2)
    return construct_gff_interval(head[4], start, stop, 'long_at',
                                  score=best_score, strand='.', source='un',
                                  frame='.',
                                  attrs=[("peak", head[1]),
                                         ("distance", distance),
                                         ("type", _type),
                                         ("ID", midpoint)])
def annotate_position(peak, tr_dict, maxd, inside):
    """Classify a peak as upstream / gene / intergenic and annotate it,
    including the closest transcript on the opposite strand.

    Relies on module-level names that are not parameters: ``STUB_TR``
    (placeholder transcript), ``get_anti`` (builds the opposite-strand
    attribute list) and ``if_bed`` (selects the return form).

    Args:
        peak: interval-like object; ``peak.name`` is parsed as the integer
            reference position.
        tr_dict: maps chromosome name to a ``(tr_plus, tr_minus)`` pair.
        maxd: maximum absolute distance for the 'upstream' classification.
        inside: candidates with a signed distance ``<= -inside`` are dropped.

    Returns:
        When ``if_bed`` is true, the value of ``construct_gff_interval(...)``;
        otherwise ``peak`` itself with the annotation written into
        ``peak.attrs`` as strings and ``peak.strand`` set to the assigned
        transcript's strand.
    """
    center = int(peak.name)
    tr_plus, tr_minus = tr_dict.get(peak.chrom, [None, None])
    tr_plus = list(tr_plus) if tr_plus else []
    tr_minus = list(tr_minus) if tr_minus else []

    # Only treat the chromosome as unannotated when BOTH strands are empty;
    # testing tr_plus alone ignored chromosomes with minus-strand genes only.
    if not tr_plus and not tr_minus:
        mindistance = float("NaN")
        gtype = "intergenic"
        transcript = STUB_TR
        anti = get_anti(STUB_TR, 'NaN')
    else:
        distances_plus = [(tr, tr.start - center) for tr in tr_plus]
        distances_minus = [(tr, center - tr.stop + 1) for tr in tr_minus]
        distances_plus = [x for x in distances_plus if x[1] > -1 * inside]
        distances_minus = [x for x in distances_minus if x[1] > -1 * inside]

        if distances_plus:
            transcript_plus, mindistance_plus = min(
                distances_plus, key=lambda x: abs(x[1]))
        else:
            transcript_plus, mindistance_plus = STUB_TR, float("inf")
        if distances_minus:
            transcript_minus, mindistance_minus = min(
                distances_minus, key=lambda x: abs(x[1]))
        else:
            transcript_minus, mindistance_minus = STUB_TR, float("inf")

        # Per-strand winners were picked by absolute distance, so the strands
        # must be compared the same way; the signed comparison let a slightly
        # negative (inside) distance beat a closer positive one.
        # Ties still go to the minus strand.
        if abs(mindistance_plus) < abs(mindistance_minus):
            transcript, mindistance = transcript_plus, mindistance_plus
            anti = get_anti(transcript_minus, mindistance_minus)
        else:
            transcript, mindistance = transcript_minus, mindistance_minus
            anti = get_anti(transcript_plus, mindistance_plus)

        if abs(mindistance) <= maxd:
            gtype = 'upstream'
        else:
            # Transcripts that contain the position (both offsets >= 0).
            pairs = [(tr, center - tr.start, tr.stop - center - 1)
                     for tr in tr_plus + tr_minus]
            pairs = [x for x in pairs if x[1] >= 0 and x[2] >= 0]
            if pairs:
                gtype = 'gene'
                transcript = pairs[0][0]
                if transcript.strand == '+':
                    mindistance = -1 * pairs[0][1]
                else:
                    mindistance = -1 * pairs[0][2]
            else:
                gtype = 'intergenic'

    atg = mindistance + float(transcript.attrs['distance'])
    if str(atg) != "nan":
        atg = "%d" % atg

    attrs = [("Name", peak.name),
             ("annotation", transcript.attrs['annotation']),
             ("function", transcript.attrs['function']),
             ("gene", transcript.name),
             ("genesymbol", transcript.attrs['genesymbol']),
             ("cg", transcript.attrs.get('cg', 'unknown')),
             ("tss", mindistance),
             ("atg", atg),
             ("gtype", gtype)] + anti

    if if_bed:
        return construct_gff_interval(peak.chrom, peak.start, peak.stop,
                                      'annotated', score=peak.score,
                                      strand=transcript.strand,
                                      source='annotate.py', frame='.',
                                      attrs=attrs)

    for attr_name, attr_value in attrs:
        peak.attrs[attr_name] = str(attr_value)
    peak.strand = transcript.strand
    return peak
nargs='?', required=False, type=str, help="Path to the output plot directory") args = parser.parse_args() if (os.stat(args.path).st_size == 0): sys.exit("###annotate\nInput file is empty, empty output is produced\n") STUB_TR = construct_gff_interval("unknown", 0, 2, 'fake', score='0', strand="+", source='ff', frame='.', attrs=[("Name", "fake"), ("annotation", "None"), ("function", "None"), ("genesymbol", "fake"), ("distance", "NaN")]) ### Annotate genomically ### def get_anti(tr, mind): return [('anti_gene', tr.name), ('anti_genesymbol', tr.attrs['genesymbol']), ('anti_tss', str(mind))]
def clarify(i1, i2, breakpoint, antisense):
    """Rebuild the two intervals of a pair with adjusted borders.

    The first interval is extended by ``gap + breakpoint`` (``gap`` is read
    from ``i1.attrs['gap']``): at its stop when ``i1`` is on '+', otherwise at
    its start. The second interval is trimmed by ``breakpoint``: at its start
    when ``i1`` is on '+', otherwise at its stop.

    Args:
        i1, i2: interval-like objects with ``chrom``, ``start``, ``stop``,
            ``score``, ``strand`` and ``attrs``.
        breakpoint: integer offset applied to both intervals.
        antisense: when true, the pair is returned in swapped order and both
            results get the strand opposite to ``i1``; otherwise each result
            keeps its source interval's strand.

    Returns:
        ``(cs1, cs2)``, two values produced by ``construct_gff_interval``.
    """
    gap = int(i1.attrs['gap'])
    first_on_plus = i1.strand == '+'

    def rebuild(source_interval, start, stop, strand):
        # Same record as the source, only borders and strand replaced.
        return construct_gff_interval(chrom=source_interval.chrom,
                                      start=start,
                                      stop=stop,
                                      feature='ch',
                                      score=source_interval.score,
                                      strand=strand,
                                      source='.',
                                      frame='.',
                                      attrs=source_interval.attrs.items())

    def extended_first(strand):
        if first_on_plus:
            return rebuild(i1, i1.start, i1.stop + gap + breakpoint, strand)
        return rebuild(i1, i1.start - gap - breakpoint, i1.stop, strand)

    def trimmed_second(strand):
        if first_on_plus:
            return rebuild(i2, i2.start + breakpoint, i2.stop, strand)
        return rebuild(i2, i2.start, i2.stop - breakpoint, strand)

    if antisense:
        flipped = '-' if first_on_plus else '+'
        cs1 = trimmed_second(flipped)
        cs2 = extended_first(flipped)
    else:
        cs1 = extended_first(i1.strand)
        cs2 = trimmed_second(i2.strand)
    return cs1, cs2
# Emit one GFF record per (chromosome, gene id) group: gene coordinates plus
# annotation taken from the group's non-gene feature.
for (chrom, geneid), features in gene_id_dict.items():
    # Key 1 holds the feature of type 'gene', key 0 the other feature; when
    # several features share a key, the last one wins.
    fd = {int(x.type == 'gene'): x for x in features}
    transcript = fd[0]
    gene = fd[1]

    genesymbol = gene.qualifiers.get('gene', [geneid])[0]
    # ';' separates GFF attributes, so it is stripped from free text.
    annotation = transcript.qualifiers['product'][0].replace(";", ' ')
    function = transcript.qualifiers.get('note', [''])[0].replace(";", ' ')

    # Fixed: the strand was read from `feature`, which this loop never binds
    # (it held whatever an earlier loop left behind), so every record got the
    # same strand. Use the strand of the gene whose coordinates are emitted.
    strand = strand_dict[gene.location.strand]

    cds = "%d:%d" % (transcript.location.start, transcript.location.end)
    if strand == '+':
        distance = transcript.location.start - gene.location.start
    else:
        distance = gene.location.end - transcript.location.end

    newint = construct_gff_interval(chrom, gene.location.start,
                                    gene.location.end, transcript.type,
                                    score='0', strand=strand,
                                    attrs=[('ID', geneid),
                                           ('genesymbol', genesymbol),
                                           ('annotation', annotation),
                                           ('function', function),
                                           ('cds', cds),
                                           ('distance', distance),
                                           ('tss_variants', '1')])
    sys.stdout.write(str(newint))
# For every annotation record that names a known parent gene, write one
# 'gene' record with the parent's coordinates and the record's annotation.
for ga in gene_annotation_list:
    parent_name = ga.attrs.get('Parent')
    if not parent_name:
        continue
    parent = name2gene.get(parent_name)
    if not parent:
        continue

    distance = (ga.start - parent.start if parent.strand == '+'
                else parent.stop - ga.stop)
    # NOTE(review): the strand-aware offset above is discarded and 0 is always
    # written. Kept as in the original; confirm whether this is intentional.
    distance = 0

    gene_attrs = [('ID', parent.attrs['ID']),
                  ('genesymbol', parent.attrs.get('gene', 'None')),
                  ('annotation', ga.attrs.get('Note', 'None')),
                  ('product', ga.attrs.get('product', 'None')),
                  ('cds', "%d:%d" % (ga.start + 1, ga.stop)),
                  ('tss_variants', '1'),
                  ('distance', str(distance))]
    aint = construct_gff_interval(parent.chrom, parent.start, parent.stop,
                                  'gene', score='0', strand=parent.strand,
                                  source='ncbi_af', frame='.',
                                  attrs=gene_attrs)
    sys.stdout.write(str(aint))
required=True, type=str, help="Path to the genome, fasta format") parser.add_argument('--gff', nargs='?', default=False, const=True, type=bool, help="If set, output is in gff format") args = parser.parse_args() genome = next(SeqIO.parse(args.genome, 'fasta')).seq for interval in BedTool(args.path): seq = str(genome[interval.start:interval.end].upper()) if (args.gff): sys.stdout.write( str( construct_gff_interval(interval.chrom, interval.start, interval.end, 'un', score=interval.score, strand=interval.strand, source='un', frame='.', attrs=[('Name', interval.name), ('seq', seq)]))) else: print("\t".join([str(x) for x in list(interval) + [seq]]))
bacteria2prophages[prophage.bacteria].append(prophage) for bname, bannotation, blength in bacterial_annotation: for prophage in sorted(bacteria2prophages[bname], key=lambda x: x.start): scale = 1000 / blength host_start = round(prophage.start * scale) host_end = round(prophage.end * scale) + 1 interval = construct_gff_interval( bname, prophage.start, prophage.end, 'prophage', score='0', strand=prophage.strand, source='ovidiu', frame='.', attrs=[('host_length', str(blength)), ('host_family', bannotation), ('Name', "_".join([ str(x) for x in (bname, prophage.start, prophage.end) ])), ('rlength', "%1.4f" % ((prophage.end - prophage.start) / blength)), ('integrase', str(prophage.integrase)), ('host_start', "%d" % host_start), ('host_end', "%d" % host_end)]) if (host_start > 0 and host_end < 1000): sys.stdout.write(str(interval)) else: sys.stderr.write( "\nProphage region is outside bacterial borders\n%s" % interval)
# Convert a tab-separated transcripts table into 'gene' GFF records.
# Fields used per row: 0 = gene id, 1 = product, 2 = start, 3 = stop,
# 4 = strand. Example row after cleaning:
#   ['CL_53', 'Hyp. Protein (Corynebacterium)', '44425', '44796', '-']
parser = argparse.ArgumentParser(
    description='Converts transcripts raw file into ncbi format')
parser.add_argument('--transcripts', nargs='?', required=True, type=str,
                    help="Path to the transcripts, raw format")
parser.add_argument('--genome', nargs='?', required=True, type=str,
                    help="Path to the genome, fasta format")
args = parser.parse_args()

# The chromosome name is the first line of the fasta file without its
# leading character (the '>' of a fasta header).
with open(args.genome) as f:
    chrom = next(f).strip()[1:]

with open(args.transcripts) as f:
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(f, None)
    for l in f:
        # Blank lines (e.g. a trailing newline) have no fields to index.
        if not l.strip():
            continue
        # Quotes are dropped and ';' removed because ';' separates GFF
        # attributes.
        a = [x.strip().replace('"', '').replace(';', '')
             for x in l.strip().split("\t")]
        aint = construct_gff_interval(
            chrom, int(a[2]), int(a[3]), 'gene', score='0', strand=a[4],
            source='ncbi_af', frame='.',
            attrs=[('ID', 'gene-%s' % a[0]),
                   ('gene_biotype', 'protein_coding'),
                   ('Name', 'gene-%s' % a[0]),
                   ('product', a[1])])
        sys.stdout.write(str(aint))
help="adfsa") args = parser.parse_args() if (os.stat(args.path).st_size != 0): coverage_dict = coverage2dict(args.coverage) convolution = list(coverage2dict(args.convolution).values())[0] for peak in BedTool(args.path): top = int(peak.name) topcoverage = coverage_dict[peak.chrom][top] total_coverage = coverage_dict[peak.chrom][peak.start:peak.stop] newint = construct_gff_interval(peak.chrom, peak.start, peak.stop, 'binding_peak', score=peak.score, strand=peak.strand, source='un', frame='.', attrs=[("Name", peak.name), ("topcoverage", topcoverage), ("total_coverage", total_coverage)]) if (max(total_coverage) >= 1.1 * topcoverage): fontsize = 24 fig, ax1 = plt.subplots(figsize=(16, 9)) plt.tight_layout(rect=[0.1, 0.1, 0.95, 0.95]) ax1.plot(total_coverage, 'b-') ax1.plot(top - peak.start, topcoverage, 'r*', linewidth=5) ax1.set_xlabel("position (nt)", fontsize=fontsize) ax1.set_ylabel('coverage', color='b', fontsize=fontsize) ax1.tick_params('y', colors='b')
else: d = d_plus s_strand = '+' cs = cs_plus peak2genestarts[interval.name] = start2genes[cs], d, s_strand for interval in peaks: topcoverage, ldrop, rdrop, start, end = peak2cov[interval.name] lg = peak2genes[interval.name] sg, dg, s_strand = peak2genestarts[interval.name] anint = construct_gff_interval(interval.chrom, start, end, 'peak', score=interval.score, strand=interval.strand, source='af_peak_detection', frame='.', attrs=[('Name', interval.name), ('topcoverage', "%1.3f" % topcoverage), ('ldrop', "%1.3f" % ldrop), ('rdrop', "%1.3f" % rdrop), ('genes', ",".join(lg)), ('start_genes', ",".join(sg)), ('start_gene_distance', dg), ('start_gene_strand', s_strand)]) sys.stdout.write(str(anint)) #construct_gff_interval(chrom, start, stop, feature, score='0', strand='.', source='un', frame='.', attrs=[]);
parser.add_argument('path',
                    metavar='N',
                    nargs='?',
                    type=str,
                    help="Path to the genbank file")
args = parser.parse_args()

# Numeric strand of a feature location -> GFF strand symbol.
strand_dict = {1: '+', -1: '-'}

# Target record layout, for reference:
# NC_003450.3 RefSeq gene 4766 5302 . + . ID=gene-NCgl0004;Dbxref=GeneID:1021292;Name=NCgl0004;gbkey=Gene;gene_biotype=protein_coding;gene_synonym=Cgl0005;locus_tag=NCgl0004
for seqrecord in SeqIO.parse(args.path, 'genbank'):
    chrom = seqrecord.name
    # NOTE(review): the record name is immediately replaced by this fixed
    # contig name. Kept as in the original; confirm whether it is intended.
    chrom = '453-Cg-phage-CL31_S7_L001_R1_001_contig_1'
    for feature in seqrecord.features:
        if feature.type != 'CDS':
            continue
        # First word of the label is the gene id, the rest is the product;
        # ';' is replaced because it separates GFF attributes.
        label = feature.qualifiers['label'][0].replace(";", ":")
        gene_token, *product_words = label.split(" ")
        geneid = "gene-%s" % gene_token
        product = " ".join(product_words)
        gene_attrs = [('ID', geneid),
                      ('gene_biotype', 'protein_coding'),
                      ('Name', geneid),
                      ('product', product)]
        newint = construct_gff_interval(
            chrom,
            feature.location.start,
            feature.location.end,
            'gene',
            score='0',
            strand=strand_dict[feature.location.strand],
            attrs=gene_attrs)
        sys.stdout.write(str(newint))
#NC_003888.3 RefSeq gene 14044 14454 . - . ID=gene-SCO0012;Dbxref=GeneID:1095452;Name=SCO0012;gbkey=Gene;gene_biotype=protein_coding;gene_synonym=SCJ30.07c;locus_tag=SCO0012 #NC_003888.3 RefSeq CDS 14044 14454 . - 0 ID=cds-NP_624373.1;Parent=gene-SCO0012;Dbxref=Genbank:NP_624373.1,GeneID:1095452;Name=NP_624373.1;Note=SCJ30.07c%2C unknown%2C len: 136 aa%3B predicted by GC Frameplot%2C Hidden Markov model and amino acid usage.;gbkey=CDS;locus_tag=SCO0012;product=hypothetical protein;protein_id=NP_624373.1;transl_table=11 # phiSco2_Bielefeld Prodigal:2.6 CDS 1400 2647 . - 0 ID=phiSco2_00003;Parent=phiSco2_00003_gene;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:Viruses2.aa:YP_009208303.1;locus_tag=phiSco2_00003;product=hypothetical protein SEA_AMELA_25 for interval in BedTool(args.path): if (interval[2] == 'CDS'): name = 'gene-' + interval.attrs['ID'] gene = construct_gff_interval(interval.chrom, interval.start, interval.stop, 'gene', score='0', strand=interval.strand, source='custom', frame='.', attrs=[('ID', name), ('gene_biotype', 'protein_coding') ]) sys.stdout.write(str(gene)) cds = construct_gff_interval( interval.chrom, interval.start, interval.stop, 'CDS', score='0', strand=interval.strand, source='custom', frame='.',
nargs='?', type=str, help="Path to the ncbi annotation file, gff format") args = parser.parse_args() curgene = None cds_list = [] for interval in BedTool(args.path): if (interval[2] == 'gene'): curgene = interval if (interval[2] == 'CDS'): aint = construct_gff_interval( interval.chrom, interval.start, interval.stop, 'cds', score='0', strand=interval.strand, source='.', frame='.', attrs=[('ID', interval.attrs['Parent']), ('genesymbol', curgene.attrs['Name']), ('annotation', interval.attrs.get('Note', 'None')), ('product', interval.attrs.get('product', 'None'))]) cds_list.append(aint) cds_list.sort(key=lambda x: x.start) for aint in cds_list: sys.stdout.write(str(aint))