def t_possible_limits(self):
    """Calculate possible queries to limit a GFF file.

    Pretty-prints the summary returned by ``GFFExaminer.available_limits``
    for ``self._test_gff_file``.
    """
    gff_examiner = GFFExaminer()
    possible_limits = gff_examiner.available_limits(self._test_gff_file)
    # pprint.pprint() writes to stdout itself and returns None, so wrapping
    # it in a print statement only added a stray "None" line (and was
    # Python-2-only syntax).
    pprint.pprint(possible_limits)
def editGBrowseEntry(gffFile, dbName, organismDir, organismName):
    """Point an organism's GBrowse configuration at the landmark of a GFF file.

    The landmark is the first element of the first ``gff_id`` key reported by
    ``GFFExaminer.available_limits`` for *gffFile*.

    If ``<GBROWSE_DIR>/<organismDir lower-cased>.conf`` exists, the value of
    its ``initial landmark`` line becomes ``<landmark>:1..50,000`` and the
    value of its ``examples`` line becomes ``<landmark>``.  Otherwise a new
    entry is created through ``createNewGBrowseEntry`` using
    ``<directory of gffFile>/<dbName>`` as the data source.

    :param gffFile: path of the GFF file to examine
    :param dbName: database file name, joined to the GFF file's directory
    :param organismDir: organism directory name (selects the .conf file)
    :param organismName: organism name passed on when creating a new entry
    """
    examiner = GFFExaminer()
    with open(gffFile) as gffHandle:
        gffIds = examiner.available_limits(gffHandle)['gff_id']
    # list(...) keeps this working on Python 3, where dict views cannot be
    # indexed; an empty mapping still raises IndexError as before.
    landmark = list(gffIds.keys())[0][0]

    gbrowseConf = os.path.join(GBROWSE_DIR, organismDir.lower() + '.conf')
    if not os.path.isfile(gbrowseConf):
        dataSource = os.path.join(os.path.dirname(gffFile), dbName)
        createNewGBrowseEntry(landmark, dataSource, organismDir, organismName)
        return

    with open(gbrowseConf, 'r') as conf:
        confLines = conf.readlines()

    changedInitial = False
    changedExample = False
    for counter, line in enumerate(confLines):
        # 'initial landmark' is 16 characters long; the previous
        # ``line[:15] ==`` comparison could therefore never match.
        if line.startswith('initial landmark'):
            initialLandmarkArr = line.split("=")
            initialLandmarkArr[1] = ' ' + landmark + ':1..50,000\n'
            confLines[counter] = '='.join(initialLandmarkArr)
            changedInitial = True
        elif line.startswith('examples'):
            exampleArr = line.split("=")
            exampleArr[1] = ' ' + landmark + '\n'
            confLines[counter] = '='.join(exampleArr)
            changedExample = True
        if changedInitial and changedExample:
            break

    # The file was read in text mode, so it is written back in text mode too
    # (binary mode rejects str on Python 3).
    with open(gbrowseConf, 'w') as conf:
        conf.writelines(confLines)
def t_parent_child(self):
    """Summarize parent-child relationships in a GFF file.

    Pretty-prints the mapping returned by ``GFFExaminer.parent_child_map``
    for ``self._test_gff_file``.
    """
    gff_examiner = GFFExaminer()
    pc_map = gff_examiner.parent_child_map(self._test_gff_file)
    # pprint.pprint() prints and returns None; the former
    # ``print pprint.pprint(...)`` emitted an extra "None" and only parsed on
    # Python 2.
    pprint.pprint(pc_map)
def count_promoters(in_file, out_file):
    """Write a text summary of the features in a GFF file.

    The summary is the pretty-formatted result of
    ``GFFExaminer.available_limits`` (feature counts such as the number of
    promoters in the file).

    :param in_file: path of the input GFF file
    :param out_file: path of the text file to create or overwrite
    """
    examiner = GFFExaminer()
    with open(in_file, "r") as in_handle:
        summary = pformat(examiner.available_limits(in_handle))
    # Open the output only after the analysis succeeded, so a failing input
    # no longer leaves a truncated, empty output file behind.
    with open(out_file, "w") as fout:
        fout.write(summary)
def explore_gff(self, gff_path):
    """Pretty-print two GFFExaminer reports for the file at *gff_path*.

    The parent/child map is printed first, then the available limits.  The
    file is reopened for each report.
    """
    from BCBio.GFF import GFFExaminer

    gff_examiner = GFFExaminer()

    with open(gff_path) as handle:
        relationships = gff_examiner.parent_child_map(handle)
    pprint.pprint(relationships)

    with open(gff_path) as handle:
        limits = gff_examiner.available_limits(handle)
        pprint.pprint(limits)
def examine_gff_file(gff_file):
    """Pretty-print the GFFExaminer limit summary of a GFF file.

    :param gff_file: path of the GFF file to examine
    :return: None
    """
    examiner = GFFExaminer()
    # The context manager closes the file even if the examiner raises.
    with open(gff_file) as in_file:
        pprint.pprint(examiner.available_limits(in_file))
def check(self):
    """Report whether ``self.fileName`` can be examined as a GFF file.

    :return: ``"gff"`` when ``GFFExaminer.available_limits`` completes,
        ``None`` when an ``AssertionError`` is raised along the way.
    """
    try:
        examiner = GFFExaminer()
        # Previously the handle stayed open whenever the examiner raised,
        # i.e. on every file that was rejected.
        with open(self.fileName) as in_handle:
            examiner.available_limits(in_handle)
    except AssertionError:
        return None
    return "gff"
def t_examiner_with_fasta(self):
    """Perform high level examination of files with FASTA directives.

    Checks the parent/child map and the available limits reported for
    ``self._gff_file``.
    """
    examiner = GFFExaminer()
    pc_map = examiner.parent_child_map(self._gff_file)
    assert pc_map[('UCSC', 'mRNA')] == [('UCSC', 'CDS')]
    limits = examiner.available_limits(self._gff_file)
    # dict views are not indexable on Python 3; materialise the keys first.
    assert list(limits['gff_id'].keys())[0][0] == 'chr17'
    assert sorted(limits['gff_source_type'].keys()) == \
        [('UCSC', 'CDS'), ('UCSC', 'mRNA')]
def stats(in_file):
    """Run an analysis of a GFF file, print a summary of feature types, exit.

    The process is terminated with exit status 0 once the summary has been
    printed, so this function does not return to its caller.

    :param in_file: path of the GFF file to analyse
    """
    examiner = GFFExaminer()
    print("\nrunning analysis of GFF file\n")
    # The context manager closes the file even if the examiner raises.
    with open(in_file) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))
    print("\n\n")
    sys.exit(0)
def set_preview(self):
    """Collect the feature types of the GFF file at ``self.path``.

    The first element of every ``gff_type`` key reported by
    ``GFFExaminer.available_limits`` is appended to ``self.entities``.

    Any exception is caught: ``self.error`` is set to ``True``,
    ``self.error_message`` describes the failure, and the traceback is
    printed to stdout.
    """
    try:
        exam = GFFExaminer()
        # Using a context manager so the handle is also closed when the file
        # turns out to be malformed and the examiner raises.
        with open(self.path, encoding="utf-8", errors="ignore") as handle:
            gff_type = exam.available_limits(handle)['gff_type']
        for entity in gff_type:
            self.entities.append(entity[0])
    except Exception as e:
        self.error = True
        self.error_message = "Malformated GFF ({})".format(str(e))
        traceback.print_exc(file=sys.stdout)
def get_entities(self):
    """
    get all the entities present in a gff file

    The entities are the first elements of the ``gff_type`` keys reported by
    ``GFFExaminer.available_limits`` for ``self.path``.

    :return: The list of all the entities
    :rtype: List
    """
    exam = GFFExaminer()
    # The context manager closes the file even if the examiner raises.
    with open(self.path, encoding="utf-8", errors="ignore") as handle:
        gff_type = exam.available_limits(handle)['gff_type']
    return [ent[0] for ent in gff_type]
def parse_gff(in_file):
    """Return the exon and gene counts of a GFF file.

    Counts come from the ``gff_type`` section of
    ``GFFExaminer.available_limits``.  A key counts as an exon (gene) entry
    when ``'exon' in key`` (``'gene' in key``) holds; if several keys match,
    the last one iterated wins.

    :param in_file: path of the GFF file
    :return: tuple ``(exonNo, geneNo)``; a count is 0 when the file has no
        matching feature type (this used to raise ``UnboundLocalError``)
    """
    examiner = GFFExaminer()
    with open(in_file) as in_handle:
        gff_features = examiner.available_limits(in_handle)['gff_type']

    exonNo = 0
    geneNo = 0
    for feature in gff_features:
        if 'exon' in feature:
            exonNo = gff_features[feature]
        if 'gene' in feature:
            geneNo = gff_features[feature]
    return exonNo, geneNo
def addUnknownCvTerms(self, dbInfo={'user' : 'oberliat', 'password' : 'password', 'db' : 'chado'}):
    """Open ``self.filename`` and connect to the database named in *dbInfo*.

    :param dbInfo: mapping with the keys ``'user'``, ``'password'`` and
        ``'db'``; the connection is always made to ``localhost``.

    On a connection failure ``self.error`` is set, ``self.error_msg`` names
    the database, and the process exits with status 1.

    NOTE(review): the visible body ends after the connection attempt;
    ``examiner``, ``file``, ``conn`` and ``cur`` are not used further here and
    ``file`` is never closed -- presumably the rest of the method is missing
    from this copy; confirm before relying on it.
    NOTE(review): the default *dbInfo* embeds credentials in source and is a
    shared mutable default; it is only read here, so it is left unchanged to
    keep the signature stable.
    """
    examiner = GFFExaminer()
    file = open(self.filename)
    try:
        conn = psycopg2.connect(database=dbInfo['db'], user=dbInfo['user'],
                                password=dbInfo['password'], host='localhost')
        cur = conn.cursor()
    # ``except Exception, e`` is Python-2-only syntax; the bound exception
    # was never used, so the portable form without a name is sufficient.
    except Exception:
        self.error = True
        self.error_msg = "Unable to connect to the database " + dbInfo['db']
        sys.exit(1)
def check_gff(infile):
    """Print an overview of a GFF file and an example feature, then exit.

    First the ``GFFExaminer.available_limits`` summary is pretty-printed.
    Then the file is parsed with ``GFF.parse`` and the first feature of the
    first record is printed so the user can pick a qualifier for gene
    identification; the interpreter exits right after that.  If parsing
    yields no record the function simply returns.

    :param infile: path of the GFF file
    """
    # GFF overview
    print("GFF overview:\n")
    examiner = GFFExaminer()
    # The context manager closes the file even if the examiner raises.
    with open(infile) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))
    print("")

    # Load GFF and its sequences
    gff = GFF.parse(infile)

    # Check qualifiers
    for rec in gff:
        print(
            "Example of the GFF's first line available qualifiers from the 9th column:\n"
        )
        print(rec.features[0])
        print(
            "\nPlease select only one of the available qualifiers to be used as gene identification!"
        )
        # Only the first record is needed as an example.
        exit()
def t_parent_child_file_modes(self):
    """Summarize parent-child relationships in a GFF file.

    The map obtained from the file name is the reference.  A text-mode
    handle must give the same map.  A binary-mode handle must give the same
    map on Python 2 and must raise ``TypeError`` on Python 3.
    """
    examiner = GFFExaminer()
    # Reference result, loaded directly from the file name.
    expected = examiner.parent_child_map(self._test_gff_file)

    with open(self._test_gff_file, "rt") as text_handle:
        assert expected == examiner.parent_child_map(text_handle)

    with open(self._test_gff_file, "rb") as binary_handle:
        if six.PY2:
            assert expected == examiner.parent_child_map(binary_handle)
            return
        caught = None
        try:
            examiner.parent_child_map(binary_handle)
        except TypeError as err:
            caught = err
        if caught is None:
            assert False, "expected TypeError to be raised"
        assert str(caught) == "input handle must be opened in text mode", caught
# Pretty-print the parent/child feature relationships of the GFF file whose
# path is given as the first command-line argument.
import pprint
import sys

from BCBio.GFF import GFFExaminer

with open(sys.argv[1], 'r') as handle:
    examiner = GFFExaminer()
    relationships = examiner.parent_child_map(handle)
    pprint.pprint(relationships)
# Pretty-print the parent/child feature relationships of a GFF3 file, then
# parse the same file and print every record.
import pprint

from BCBio import GFF
from BCBio.GFF import GFFExaminer

in_file = "Nagalakshmi_2008_UTRs.gff3"

examiner = GFFExaminer()
with open(in_file) as in_handle:
    pprint.pprint(examiner.parent_child_map(in_handle))

# The file is reopened because the examiner consumed the first handle.
with open(in_file) as in_handle:
    for rec in GFF.parse(in_handle):
        # print() call instead of the Python-2-only print statement.
        print(rec)
def examine_gff(gff_file):
    """Pretty-print the GFFExaminer limit summary of *gff_file*.

    An empty line is printed after the summary.

    :param gff_file: path of the GFF file to examine
    """
    examiner = GFFExaminer()
    # The context manager closes the file even if the examiner raises.
    with open(gff_file) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))
        print("")
# get_seq(d)
#
# Purpose (as far as the visible statements show):
#   * opens an output file named after `d` under a hard-coded "infos/"
#     directory and parses the GFF file named `d` with GFF.parse;
#   * for each feature, looks at the qualifiers 'clst_id', 'SubjectScore' and
#     'SubjectOrganism'; every 'SubjectOrganism' value is looked up line by
#     line in speciess.gff and its 1-based line number is collected in
#     `myLine`; `id_spe` is the line number matching the feature's 'source'
#     qualifier;
#   * `purse` keeps, per organism line number, the 'SubjectScore' values
#     greater than 0.5; the sorted keys of `purse` are grouped into `blocks`
#     using a "difference <= 4" test between neighbouring keys;
#   * a string flag `bboolean` ("true"/"false") is derived from those blocks;
#     records flagged "false" are stored in `erra` (record id -> source);
#   * for every id in `erra`, positions are read from profiles.gff and
#     org.gff, sequences are fetched with Entrez.efetch / SeqIO.read and
#     written to "<id>.fas".
#
# All input and output locations are hard-coded absolute paths.  `examiner`
# is created but not used in the visible code.
#
# NOTE(review): this definition has lost its line breaks and indentation in
# this copy.  The nesting of several statements (e.g. which loop the
# `else: w=0` belongs to, and where the block-building code sits relative to
# the qualifier loop) cannot be reconstructed with confidence, so the code is
# left exactly as found.  Restore the original layout before changing logic.
# NOTE(review): `min` and `max` are used as local names and shadow the
# builtins; `purse[ff]>0.45` compares a stored str with a float -- confirm
# intent once the layout is restored.
def get_seq(d): fo = open("G:/master_2/2eme semestre_project/halima__/infos/"+d, "w") c=0 erra=dict() id_spe=0 alll=dict() all_blocks=dict() #in_file = "/homes/biertank/halima/Downloads/halima_saker_project_/database/"+d in_file = "G:/master_2/2eme semestre_project/halima__/"+d examiner = GFFExaminer() in_handle = open(in_file) for rec in GFF.parse(in_handle): t=0 #print c for record in rec.features: elem_metadatas = list() keyss=('clst_id', 'SubjectScore','SubjectOrganism') #print record.type for key in sorted(record.qualifiers.keys()): if key in keyss: if key=="SubjectOrganism": load_profile = open('G:/master_2/2eme semestre_project/halima__/speciess.gff') len_org=len(record.qualifiers["SubjectOrganism"]) read_it = load_profile.read() myLine =list() myscore=list() purse=dict() for val in record.qualifiers[key]: i=1 for line in read_it.splitlines(): if line == record.__dict__['qualifiers']['source'][0]: id_spe=i if line == val: myLine.append(i) #print (val, " ------------------------- ",i) break i=i+1 elem_metadatas.append(str(key) + '=' + str(myLine)) else: elem_metadatas.append(str(key) + '=' + str(record.qualifiers[key])) load_profile.close() myscore=record.qualifiers["SubjectScore"] for i in range(0,len(myLine)): if float(myscore[i])>0.5: purse[myLine[i]]=str(myscore[i]) alll[rec.id,record.qualifiers["clst_id"][0]]=purse for i in range(0,len(myLine)): t=t+float(myscore[i]) print (purse) min=0 max=1 u=0 z=1 lst=sorted(purse.keys()) blocks=dict() list_elmt=list() for u in range(0,len(lst)-1): #print lst[u] #print min, "--", max if lst[max]-lst[max-1]<=4: #print u if max-min==1: #print (u,"-----------------------") list_elmt.append(lst[min]) list_elmt.append(lst[max]) else: list_elmt.append(lst[max]) max=max+1 else: if max-min>1: blocks[z]=list_elmt list_elmt=list() z=z+1 min=max max=max+1 all_blocks[rec.id]=blocks #pprint.pprint(purse) c=c+1 #print (c,"\n",rec.id,"\n",d,"\n",id_spe) #print blocks bboolean="true" for w in range (1,len(blocks)+1): if id_spe in 
blocks[w]: ind_block_spe=w len_block_spe=len(blocks[w]) break else: w=0 pp=0 #print rec.id #print w #print len_block_spe,"_____",len_org tot=0 inter_block=list() inter_org=list() if w==0: bboolean="false" #print id_spe,"_____________________________________________________________" else: if float(len_block_spe)/float(len_org) >= 0.5: bboolean="true" # print id_spe,"_____________________________________________________________" else: for f in range(1,len(blocks)+1): if f != w: for j in blocks[f]: #print float(purse[j]) tot=tot+float(purse[j]) if tot/len(blocks[f])>0.45: #print tot/len(blocks[f]) #print id_spe,"_____________________________________________________________" #print tot/len(blocks[f]) inter_block.append(str(f)) #print blocks[f] #if sum(x for x in (myscore[]))/len(blocks[pp]) < 0.45 tot=0 for ff in myLine: lstt=sorted(purse.keys()) #print lstt #print len(lstt) #print len(myLine) if ff not in lstt: if purse[ff]>0.45 : inter_org.append(str(ff)) #print inter_block,"___",inter_org if len(inter_block)==0 and len(inter_org)==0: bboolean="true" else: #print id_spe,"_____________________________________________________________" bboolean="false" #print ind_block_spe #print rec.id,"____",len_org, "__",len_block_spe, "__",id_spe, "__",blocks,"___",sorted(purse.iterkeys()),"____",len(blocks) #print alll #fo.write(str(all_blocks)) #fo.write(rec.id) #fo.write("") if bboolean=="false": erra[rec.id]=record.__dict__['qualifiers']['source'][0] # print bboolean, "____",w #print dir(rec) #print record.__dict__['qualifiers']['source'] print (erra) if erra: for idd in erra: org_pos=dict() fol = open("G:/master_2/2eme semestre_project/halima__/fasta_file/"+idd+".fas", "w") i=0 print (erra[idd]) for line in fileinput.input(['C:/Users/User/Desktop/Desktop/profiles.gff']): values = line.split("\t") if values[0]==idd:# in values: org_pos[values[3]]=values[1] #fo.write(idd+">\n"+values[1]) print (line) #print ("___________"+str(i)+"____________") #break print (org_pos) for za in 
org_pos:#range(0, len(org_pos)): print ("sequence__________::::") for line in fileinput.input(['C:/Users/User/Desktop/Desktop/org.gff']): valuees = line.split("\t") if valuees[0]==za: chrom=valuees[3][:-4] start_end=org_pos[za].split("..") #if start_end[0]>start_end[1]: startt=start_end[0] endd=start_end[1] # else: # startt=start_end[0] # endd=start_end[1] handle = Entrez.efetch(db="nuccore", id=chrom, rettype="gb", retmode="text", seq_start=startt, seq_stop=endd ) whole_sequence = SeqIO.read(handle, "genbank") print (whole_sequence.seq) fol.write(">"+org_pos[za]+"\n"+whole_sequence.seq+"\n") fol.close() in_handle.close() fo.close()
# Pretty-print the available limits of a GFF3 file, then parse the same file
# restricted to features whose source is "ensembl" and print each record's
# features.
import pprint

from BCBio import GFF
from BCBio.GFF import GFFExaminer

in_file = "Homo_sapiens.GRCh38.91.chromosome.22.gff3"

examiner = GFFExaminer()
with open(in_file) as in_handle:
    pprint.pprint(examiner.available_limits(in_handle))

limit_info = {"gff_source": ["ensembl"]}

# The file is reopened because the examiner consumed the first handle; the
# context managers close each handle even if parsing raises.
with open(in_file) as in_handle:
    for rec in GFF.parse(in_handle, limit_info=limit_info):
        print(rec.features)
# load_data(n_row=None, cleaned=True)
#
# Builds a pandas DataFrame of per-transcript features from four hard-coded
# inputs under data/:
#   * lncipedia_5_2.fasta      -> 'id', 'name', 'length' and the percentage
#                                 of G/T/C/A characters ('ratio_*');
#   * lncipedia.bed            -> 'number_exons', 'chromosom', 'start_pos',
#                                 'end_pos' (keyed by record name);
#   * lncipedia_5_2_hg38.gff   -> exon (start, end) tuples per 'lnc_RNA'
#                                 feature id, passed to calc_number_introns
#                                 and calc_mean_exon_length;
#   * list_of_mfes2.pickle     -> the 'mfe' column.
# FASTA records whose name is not in the BED data get -1 in every BED/GFF
# derived column.
#
# Parameters:
#   n_row   -- when truthy, the FASTA loop breaks at the record whose
#              enumerate index equals n_row.
#   cleaned -- when true, rows whose 'chromosom' is 'X' or 'Y' are dropped.
# Returns: the frame restricted to rows with 'chromosom' != -1 and
#   'mfe' != -1, columns from position 2 onward, each column rescaled as
#   (x - mean) / std.
#
# Translation of the inline German comment on 'start_pos': "in the BED file
# this is off by -1 compared to the GFF and the online value".
#
# NOTE(review): this definition has lost its line breaks and indentation in
# this copy, and the code is left exactly as found.  In particular it is not
# recoverable from here whether `df['chromosom'] = pd.to_numeric(...)` sits
# inside `if cleaned:` -- confirm against the original layout.
# NOTE(review): `examiner` is created but unused; the handle passed to
# pickle.load is never closed; `annotation_data[record.id]` is indexed
# without a membership check.
def load_data(n_row=None, cleaned=True): # https://lncipedia.org/download data_dict = { 'id': [], 'name': [], 'length': [], 'ratio_g': [], 'ratio_t': [], 'ratio_c': [], 'ratio_a': [], 'number_exons': [], 'chromosom': [], 'start_pos': [], 'end_pos': [], 'length_from_pos': [], 'number_introns': [], 'mean_exon_length': [], 'mfe': [] } fasta_data = SeqIO.parse("data/lncipedia_5_2.fasta", "fasta") bed_raw_data = BedTool('data/lncipedia.bed') examiner = GFFExaminer() in_handle = open('data/lncipedia_5_2_hg38.gff') annotation_data = {} for i, rec in enumerate(GFF.parse(in_handle)): # chromosom e.g. chr1 for feature in rec.features: # lncRNA eg. LNC1725 if not feature.type == 'lnc_RNA': break exon_locations = [] lnc_id = feature.id for sub_feature in feature.sub_features: if sub_feature.type == 'exon': exon = (sub_feature.location.start, sub_feature.location.end) exon_locations.append(exon) annotation_data[lnc_id] = exon_locations in_handle.close() bed_data = {} for record in bed_raw_data: bed_data[record.name] = { 'number_exons': int(record.fields[9]), 'chromosom': record.fields[0], 'start_pos': int(record.fields[1]), # im bed -1 im vgl zu gff und online 'end_pos': int(record.fields[2]) } for i, record in enumerate(fasta_data): length = len(record.seq) data_dict['length'].append(length) data_dict['id'].append(record.id) data_dict['name'].append(record.name) if record.name in bed_data: for bed_feature in [ 'number_exons', 'chromosom', 'start_pos', 'end_pos' ]: data_dict[bed_feature].append( bed_data[record.name][bed_feature]) end_pos = bed_data[record.name]['end_pos'] start_pos = bed_data[record.name]['start_pos'] exon_locations = annotation_data[record.id] data_dict['length_from_pos'].append(end_pos - start_pos) data_dict['number_introns'].append( calc_number_introns(start_pos, end_pos, exon_locations)) data_dict['mean_exon_length'].append( calc_mean_exon_length(exon_locations)) else: for feature in [ 'number_exons', 'chromosom', 'start_pos', 'end_pos', 'length_from_pos', 
'number_introns', 'mean_exon_length' ]: data_dict[feature].append(-1) count_g = 0 count_a = 0 count_t = 0 count_c = 0 for c in record.seq: if c == 'G': count_g += 1 elif c == 'T': count_t += 1 elif c == 'C': count_c += 1 elif c == 'A': count_a += 1 data_dict['ratio_g'].append(count_g / length * 100) data_dict['ratio_t'].append(count_t / length * 100) data_dict['ratio_c'].append(count_c / length * 100) data_dict['ratio_a'].append(count_a / length * 100) if n_row: if i == n_row: break list_of_lmfes = pickle.load(open("data/list_of_mfes2.pickle", "rb")) data_dict['mfe'].extend(list_of_lmfes) df = pd.DataFrame.from_dict(data_dict) # run only for rows where we have valid chromosomes df['chromosom'].loc[df['chromosom'] != -1] = df['chromosom'].loc[ df['chromosom'] != -1].apply(lambda x: x.split('chr')[1]) if cleaned: df = df[(df['chromosom'] != 'X') & (df['chromosom'] != 'Y')] df['chromosom'] = pd.to_numeric(df['chromosom']) # Also remove rows with invalid mfe and chromosomes df = df.loc[df['chromosom'] != -1].loc[ df['mfe'] != -1].iloc[:, 2:].apply(lambda x: (x - x.mean()) / x.std(), axis=0) return df
# __processGffFilesNew(self, newOrganismDirs)
#
# For every directory name in `newOrganismDirs` (joined onto
# NEW_GENOMIC_DATA_DIR):
#   1. scans the directory's files for one .ffn, .faa, .gff and .gbk file;
#   2. runs GenomeDBUtil.runFormatDB on the .faa (protein=True) and the .ffn
#      (protein=False) when present, logging each run to self.report;
#   3. when both a .gff and a .gbk were found:
#        - parses the GenBank record to obtain the organism name and the
#          first accession;
#        - takes the landmark from the first 'gff_id' key reported by
#          GFFExaminer.available_limits;
#        - builds a GFFRewriter writing to "<gff>.sorted.prepared", calls
#          addColor with the default database user/password and db 'MyGO',
#          and prints getError();
#        - loads the prepared GFF into an SQLite database with
#          bp_seqfeature_load.pl;
#        - adds the organism to Chado and creates the GBrowse entry.
#
# NOTE(review): this definition has lost its line breaks and indentation in
# this copy; because the statements are order-dependent side effects on
# external systems, the code is left exactly as found.
# NOTE(review): the body is Python 2 only (`print newOrganism`,
# `print error`, `os.walk(...).next()`, `.keys()[0][0]`).
# NOTE(review): `gffHandle` and the handle passed to `parser.parse(open(gbk))`
# are never closed in the visible code.
def __processGffFilesNew(self, newOrganismDirs): for newOrganism in newOrganismDirs: # start by creating the BLAST database newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism) print newOrganism organismFiles = os.walk(newOrganism).next()[2] faa = None ffn = None gff = None gbk = None for organismFile in organismFiles: extension = os.path.splitext(organismFile)[1] if (extension == '.ffn'): ffn = organismFile elif (extension == '.faa'): faa = organismFile elif (extension == '.gff'): gff = organismFile elif (extension == '.gbk'): gbk = organismFile if (faa and ffn and gff and gbk): break if (faa): GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True) self.report.addLogEntry('Ran formatdb successully on ' + faa) if (ffn): GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False) self.report.addLogEntry('Ran formatdb successully on ' + ffn) # process the gff and genbank files for creating the databases if (gff and gbk): # create the sqlite database for GBrowse and create the configuration file # for GBrowse hook up dbName = os.path.splitext(os.path.basename(gff))[0] + '.db' dbName = os.path.join(newOrganism, dbName) gff = os.path.join(newOrganism, gff) parser = GenBank.RecordParser() gbk = os.path.join(newOrganism, gbk) record = parser.parse(open(gbk)) organismName = record.organism accession = record.accession[0] self.report.addLogEntry('Found organism name ' + organismName) # create a brand new GBrowse configuration file examiner = GFFExaminer() gffHandle = open(gff) landmark = examiner.available_limits(gffHandle)['gff_id'].keys()[0][0] gffRewriter = GFFRewriter(filename=gff, outfile=gff+".sorted.prepared" , accession=accession) '''gffRewriter.addUnknownCvTerms({ 'user' : settings.DATABASES['default']['USER'], 'password' : settings.DATABASES['default']['PASSWORD'], 'db' : settings.DATABASES['default']['NAME'] })''' gffRewriter.addColor({ 'user' : settings.DATABASES['default']['USER'], 'password' : 
settings.DATABASES['default']['PASSWORD'], 'db' : 'MyGO' }) error = gffRewriter.getError() print error gff = gff + ".sorted.prepared" args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff] runProgram('bp_seqfeature_load.pl', args) self.report.addLogEntry('Successfully created sqlite database for ' + str(gff)) organismDir = os.path.basename(newOrganism) self.report.addLogEntry('Added new GBrowse entry for ' + organismName) # now edit the record in Chado by first adding the organism and then adding # bulk loading the information from gff3 id = GenomeDBUtil.addOrganismToChado(gff, organismName) GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, id)
# Remaining command-line options, then: print the parent/child map of the
# GFF file and collect, per gene name, the length of each sub-feature type.
parser.add_argument(
    "-p", "--prefix",
    action="store",
    dest="prefix",
    help="Prefix of output files",
    default="prefix",
)
parser.add_argument(
    "-l", "--length_distribution_file_prefix",
    action="store",
    dest="len_distr_file",
    help="Output file with lengths distibutions",
    default="length_distribution",
)
args = parser.parse_args()

examiner = GFFExaminer()
with open(args.gff, "r") as in_fd:
    pprint.pprint(examiner.parent_child_map(in_fd))

# Second pass over the file: index every parsed record by its id.
with open(args.gff, "r") as in_fd:
    record_dict = {record.id: record for record in GFF.parse(in_fd)}

# gene name -> OrderedDict(sub-feature type -> sub-feature length)
gene_dict = OrderedDict()
for record_id in record_dict:
    for feature in record_dict[record_id].features:
        if feature.type != "gene":
            continue
        gene_name = feature.qualifiers["Name"][0]
        sub_lengths = OrderedDict()
        gene_dict[gene_name] = sub_lengths
        for sub_feature in feature.sub_features:
            sub_lengths[sub_feature.type] = len(sub_feature)
def gff3_to_feature(gff3, mapid_data, ftype):
    """Convert the records of a GFF3 file into a list of feature dicts.

    :param gff3: path of the GFF3 file (only read)
    :param mapid_data: mapping that may contain ``'chromosome'`` and
        ``'gene'`` sub-mappings translating GFF ids to object ids
    :param ftype: ``'chromosome'``, ``'gene'`` or ``'mRNA'``; any other value
        yields an empty list
    :return: list of dicts

        * ``'chromosome'``: one ``{'name', 'type'}`` dict per record;
        * ``'gene'``: one ``{'name', 'type', 'loc'}`` dict per top-level
          feature of type ``'gene'``;
        * ``'mRNA'``: one dict per second-level feature with ``'name'``,
          ``'type'``, ``'loc'``, ``'sub_features'`` (its children, CDS
          locations carrying the ``phase`` qualifier) and ``'parent'``.
    """
    feature = []
    # The file is only read, so plain 'r' is enough ('r+' needlessly required
    # write permission); the context manager closes it on errors as well.
    with open(gff3, 'r') as fh:
        for refseq_feature in GFF.parse(fh):
            refseq_id = refseq_feature.id
            refseq_obj_id = refseq_id
            if ('chromosome' in mapid_data
                    and refseq_id in mapid_data['chromosome']):
                refseq_obj_id = mapid_data['chromosome'][refseq_id]

            if ftype == 'chromosome':
                feature.append({'name': refseq_feature.id, 'type': ftype})
                continue
            if ftype not in ('gene', 'mRNA'):
                continue

            for gene_feature in refseq_feature.features:
                # skip the chromosome feature for tripal
                if ftype == 'gene' and gene_feature.type == 'gene':
                    feature_loc = parse_feature_location(
                        refseq_id, refseq_obj_id, gene_feature.location)
                    feature.append({
                        'name': gene_feature.id,
                        'type': gene_feature.type,
                        'loc': feature_loc
                    })
                    continue
                if ftype != 'mRNA':
                    continue

                # The parent id is the same for every child of this feature.
                gene_obj_id = gene_feature.id
                if ('gene' in mapid_data
                        and gene_feature.id in mapid_data['gene']):
                    gene_obj_id = mapid_data['gene'][gene_feature.id]

                for mrna_feature in gene_feature.sub_features:
                    feature_loc = parse_feature_location(
                        refseq_id, refseq_obj_id, mrna_feature.location)
                    sub_feature_list = []
                    for sub_feature in mrna_feature.sub_features:
                        # Only CDS locations carry a phase.
                        loc_kwargs = {}
                        if sub_feature.type == 'CDS':
                            loc_kwargs['phase'] = int(
                                sub_feature.qualifiers['phase'][0])
                        sub_feature_loc = parse_feature_location(
                            refseq_id, refseq_obj_id, sub_feature.location,
                            **loc_kwargs)
                        sub_feature_list.append({
                            'name': sub_feature.id,
                            'type': sub_feature.type,
                            'loc': sub_feature_loc
                        })
                    feature.append({
                        'name': mrna_feature.id,
                        'type': mrna_feature.type,
                        'loc': feature_loc,
                        'sub_features': sub_feature_list,
                        'parent': {
                            '_id': gene_obj_id,
                            'name': gene_feature.id
                        }
                    })
    return feature
# main()
#
# Command-line entry point; options come from opt_check(get_optparser()).
# Steps visible in the code:
#   1. writes "summit<bedfile>", holding for every non-comment BED line the
#      chromosome and the integer midpoint of start/end (as both start and
#      end);
#   2. walks the GFF through pybedtools and counts each feature type
#      (`gffd`), remembering types seen without a "Parent" attribute and
#      types seen with a "Derives_from" attribute;
#   3. sorts the feature types into the lists `updown`, `overlap`, `skip`
#      and `other`, either from the "name:value,value" lines of opt.profile
#      or by prompting the user once per feature type;
#   4. writes "<gff>updown" and "<gff>overlap" subsets of the GFF; when
#      `mkintron` is set, intron intervals are derived by subtracting
#      exon/CDS/UTR lines from gene lines and appended to the overlap file;
#   5. intersects the summits with the overlap file and counts labels via
#      labelfilter(); summits that intersect nothing are matched to the
#      closest `updown` feature and labelled <type>_TSS_1000/_TSS_3000/
#      _TTS_1000/_TTS_3000 or 'intergentic' from strand and distance;
#   6. prints the four lists and all label counts to opt.outfile.
#
# NOTE(review): this definition has lost its line breaks and indentation in
# this copy, and the code is left exactly as found.  Among other things, the
# nesting level of `if not mkintron:` (inside the interactive branch or at
# function level) cannot be determined from here.
# NOTE(review): `mkintron` appears to be read by `if not mkintron:` before
# any assignment unless the profile contains a 'makeintron' line -- confirm.
# NOTE(review): `eval(input(...))` evaluates whatever the user types as a
# Python expression; an empty reply makes eval raise, so the `or <default>`
# fallbacks look unreachable.  Consider plain input() with int() parsing.
# NOTE(review): `examiner` is created but unused; most files opened here
# (outio aside) are not closed on error paths.
def main(): opt = opt_check(get_optparser()) gfffile = opt.gfffile bedfile = opt.bedfile gffprefn = pybedtools.BedTool(gfffile).remove_invalid() gffd = dict() featurecluster = dict() featurenoparent = dict() examiner = GFFExaminer() outio = open(opt.outfile, 'w') openBED = open(bedfile, 'r') summitbedoutfile = "summit" + bedfile bedoutio = open(summitbedoutfile, 'w') for line in openBED: if line.startswith("#"): continue strings = line.strip().split("\t") peakchromosome = strings[0] peakstart = int(strings[1]) peakend = int(strings[2]) mid1 = int((peakstart + peakend) / 2) mid2 = int((peakstart + peakend) / 2) print(peakchromosome, mid1, mid2, sep="\t", file=bedoutio) bedoutio.close() derivesfeature = dict() for gffinf in gffprefn: # a = gffinf.fields attrs = gffinf.attrs # continue featuretype = gffinf[2] if "Parent" not in attrs: featurenoparent[featuretype] = 1 if featuretype in gffd: gffd[featuretype] += 1 else: gffd[featuretype] = 1 if "Derives_from" in attrs: derivesfeature[featuretype] = 1 # print (featuretype) # # pprint.pprint(featurenoparent) updown = list() overlap = list() skip = list() other = list() if opt.profile: profile = open(opt.profile, 'r') for lin in profile.readlines(): lin = lin.rstrip('\n') (typenow, inf) = lin.split(':') if typenow == 'updown': updown = inf.split(',') if typenow == 'overlap': overlap = inf.split(',') if typenow == 'skip': skip = inf.split(',') if typenow == 'other': other = inf.split(',') if typenow == 'makeintron': mkintron = inf else: for featuretype in gffd: while True: print("#" * 36) print("Find ", featuretype, gffd[featuretype], "in genome") print("please choose model: \n" "1) calculate up and downstream, \n" "2) overlap, \n" "3) skip, \n" "4) count this type as other") if featuretype == 'chromosome': choose = eval(input("suggest 3: ")) or 3 elif featuretype in derivesfeature: choose = eval(input("suggest 3: ")) or 3 elif featuretype in featurenoparent: choose = eval(input("suggest 1: ")) or 1 elif featuretype in 
['exon', 'CDS', 'intron', 'five_prime_UTR', 'three_prime_UTR']: choose = eval(input("suggest 2: ")) or 2 else: choose = eval(input("suggest 2: ")) or 2 # choose = input() choose = int(choose) if choose == 1: updown.append(featuretype) print(featuretype, "calculate up and downstream") print() break elif choose == 2: overlap.append(featuretype) print(featuretype, "overlap") print() break elif choose == 3: skip.append(featuretype) print(featuretype, "skip this featuretype") print() break elif choose == 4: other.append(featuretype) print(featuretype, " count this featuretype as other") print() break else: print("Please input 1,2,3,4 model") print() if not mkintron: if 'intron' not in gffd: while True: print("Do not find intron annotation, suggest make intron annotation. y(es) or n(o)") intronyes = eval(input("suggest yes: ")) or 'yes' if intronyes == 'yes' or intronyes == 'y': mkintron = True break elif intronyes == 'no' or intronyes == 'n': mkintron = False break else: continue updonwfile = gfffile + "updown" updownio = open(updonwfile, 'w') gffinio = open(gfffile, 'r') for line in gffinio: if line.startswith("#"): continue line = line.rstrip('\n') linecontain = line.split("\t") if linecontain[2] in updown: print(line, file=updownio) updownio.close() if mkintron: gffinio1 = open(gfffile, 'r') print("make intron file") nointronfile = gfffile + "notinron" genefile = gfffile + "gene" nointronio = open(nointronfile, 'w') geneio = open(genefile, 'w') for line in gffinio1: if line.startswith("#"): continue line = line.rstrip('\n') linecontain = line.split("\t") if linecontain[2] in ['exon', 'CDS', 'five_prime_UTR', 'three_prime_UTR']: print(line, file=nointronio) if linecontain[2] in ['gene']: print(line, file=geneio) genefn = pybedtools.BedTool(genefile) nointronfn = pybedtools.BedTool(nointronfile) intronfn = genefn.subtract(nointronfn) intronfn.saveas('tmp_intron.gff') # intronfile = gfffile+"intron" intronin = open('tmp_intron.gff', 'r') # intronout = open (intronfile, 
'w') gffinio2 = open(gfffile, 'r') overlapfile = gfffile + "overlap" overlapio = open(overlapfile, 'w') for line in gffinio2: if line.startswith("#"): continue line = line.rstrip('\n') linecontain = line.split("\t") if linecontain[2] in overlap: print(line, file=overlapio) if linecontain[2] in updown: print(line, file=overlapio) if linecontain[2] in other: print(line, file=overlapio) for line in intronin: line = line.rstrip('\n') b = line.replace('\tgene\t', '\tintron\t') print(b, file=overlapio) overlapio.close() overlap.append('intron') # os.remove('tmp_intron.gff') os.remove(genefile) # os.remove(nointronfile) else: gffinio2 = open(gfffile, 'r') overlapfile = gfffile + "overlap" overlapio = open(overlapfile, 'w') for line in gffinio2: if line.startswith("#"): continue line = line.rstrip('\n') linecontain = line.split("\t") if linecontain[2] in overlap: print(line, file=overlapio) if linecontain[2] in updown: print(line, file=overlapio) if linecontain[2] in other: print(line, file=overlapio) overlapio.close() print("updown", updown, file=outio) print("overlap", overlap, file=outio) print("skip", skip, file=outio) print("other", other, file=outio) overlapfn = pybedtools.BedTool(overlapfile).sort() updownfn = pybedtools.BedTool(updonwfile).sort() summitfn = pybedtools.BedTool(summitbedoutfile).sort() intergenic_summitfn = summitfn.subtract(overlapfn) intergenic_summitfn.saveas("intergenic_summitfn.txt") nearbyfn = intergenic_summitfn.closest(updownfn, d=True, stream=True) # nearbyfn.saveas("nearby.txt") d = defaultdict(set) bedfields = summitfn.field_count() type_idx = bedfields + 2 bedintersectgff = summitfn.intersect(overlapfn, wao=True) for feature in bedintersectgff: featuretype = feature[type_idx] key = '\t'.join(feature[:bedfields]) if featuretype in overlap: d[key].update([featuretype]) # print ("overlap") elif featuretype in updown: d[key].update([featuretype]) # print ("updown") elif featuretype in other: d[key].update(['other']) # print ("other") elif 
featuretype in skip: print(featuretype, "skip") # d[key].update(['.']) continue else: continue # d[key].update(['.']) npeaks = float(len(d)) count_d = defaultdict(int) for peak, featuretypes in list(d.items()): if featuretypes == set('.'): featuretype = 'unannotated' continue else: featuretype = labelfilter(featuretypes) count_d[featuretype] += 1 results = list(count_d.items()) # results.sort(key=lambda x: x[1]) results = sorted(results) labels, counts = list(zip(*results)) labels = [] counts_to_use = [] nearpeakd = defaultdict(set) for nearpeak in nearbyfn: # Chr,peakstart, peakend, genechr,genestart,geneend, genestrand # print (nearpeak[0],nearpeak[1], nearpeak[2],nearpeak[bedfields], nearpeak[bedfields+3],nearpeak[bedfields+4],nearpeak[bedfields+6]) peakkey = '\t'.join(nearpeak[:bedfields]) if peakkey in d: continue genestrand = nearpeak[bedfields + 6] distance = int(nearpeak[-1]) typenow = 'error' if distance == 0: continue if int(nearpeak[bedfields + 3]) <= int(nearpeak[1]) <= int(nearpeak[2]) <= int(nearpeak[bedfields + 4]): print("error") print(nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3], nearpeak[bedfields + 4], nearpeak[bedfields + 6]) if genestrand == '+': if int(nearpeak[1]) >= int(nearpeak[bedfields + 4]): # typenow = 'downstrand' if distance <= 1000: typenow = nearpeak[bedfields + 2] + "_" + 'TTS_1000' elif distance <= 3000: typenow = nearpeak[bedfields + 2] + "_" + 'TTS_3000' else: typenow = 'intergentic' elif int(nearpeak[2]) <= int(nearpeak[bedfields + 3]): if distance <= 1000: typenow = nearpeak[bedfields + 2] + "_" + 'TSS_1000' elif distance <= 3000: typenow = nearpeak[bedfields + 2] + "_" + 'TSS_3000' else: typenow = 'intergentic' else: print("error", nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3], nearpeak[bedfields + 4], nearpeak[bedfields + 6]) elif genestrand == '-': if int(nearpeak[1]) >= int(nearpeak[bedfields + 4]): # typenow = 'downstrand' if distance <= 1000: 
typenow = nearpeak[bedfields + 2] + "_" + 'TSS_1000' elif distance <= 3000: typenow = nearpeak[bedfields + 2] + "_" + 'TSS_3000' else: typenow = 'intergentic' elif int(nearpeak[2]) <= int(nearpeak[bedfields + 3]): if distance <= 1000: typenow = nearpeak[bedfields + 2] + "_" + 'TTS_1000' elif distance <= 3000: typenow = nearpeak[bedfields + 2] + "_" + 'TTS_3000' else: typenow = 'intergentic' else: print("error", nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3], nearpeak[bedfields + 4], nearpeak[bedfields + 6]) else: print("error", nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3], nearpeak[bedfields + 4], nearpeak[bedfields + 6]) nearpeakd[peakkey].update([typenow]) for peakid in nearpeakd: if peakid in d: print("error peakid in nearpeakd", peakid, nearpeakd[peakid], d[peakid]) for peakid in d: if peakid in nearpeakd: print("error peakid in d", peakid, nearpeakd[peakid], d[peakid]) discount = defaultdict(int) for peak, distypes in list(nearpeakd.items()): distype = labelfilter(distypes) discount[distype] += 1 disres = list(discount.items()) for label, count in results: print(label, count, file=outio) for label, count in disres: print(label, count, file=outio) outio.close()