def check(self):
    """Detect whether ``self.fileName`` is a GFF file.

    Returns:
        "gff" if GFFExaminer can summarize the file, otherwise None.
    """
    try:
        examiner = GFFExaminer()
        # `with` guarantees the handle is closed even when available_limits
        # raises — the original leaked the handle on the error path.
        with open(self.fileName) as in_handle:
            examiner.available_limits(in_handle)
        return "gff"
    except AssertionError:
        # GFFExaminer signals malformed / non-GFF input via AssertionError.
        return None
def t_possible_limits(self):
    """Calculate possible queries to limit a GFF file."""
    gff_examiner = GFFExaminer()
    possible_limits = gff_examiner.available_limits(self._test_gff_file)
    # pprint.pprint writes to stdout and returns None; the original
    # `print pprint.pprint(...)` therefore printed a stray trailing "None".
    pprint.pprint(possible_limits)
def editGBrowseEntry(gffFile, dbName, organismDir, organismName):
    """Point the organism's GBrowse .conf at the landmark found in *gffFile*.

    If a .conf already exists, rewrite its ``initial landmark`` and
    ``examples`` lines; otherwise create a brand-new GBrowse entry.

    :param gffFile: path to the GFF file to inspect
    :param dbName: name of the seqfeature database backing the entry
    :param organismDir: directory name of the organism (lower-cased for .conf)
    :param organismName: display name of the organism
    """
    examiner = GFFExaminer()
    # Close the handle deterministically (the original leaked it).
    # next(iter(...)) replaces .keys()[0], which breaks on Python 3.
    with open(gffFile) as gffHandle:
        landmark = next(iter(examiner.available_limits(gffHandle)['gff_id']))[0]
    gbrowseConf = os.path.join(GBROWSE_DIR, organismDir.lower() + '.conf')
    if os.path.isfile(gbrowseConf):
        with open(gbrowseConf, 'r') as conf:
            confLines = conf.readlines()
        changedInitial = False
        changedExample = False
        for counter, line in enumerate(confLines):
            # BUG FIX: the original compared line[:15] against the 16-char
            # literal 'initial landmark', which could never match, so the
            # initial-landmark line was never updated.
            if line.startswith('initial landmark'):
                initialLandmarkArr = line.split("=")
                initialLandmarkArr[1] = ' ' + landmark + ':1..50,000\n'
                confLines[counter] = '='.join(initialLandmarkArr)
                changedInitial = True
            elif line.startswith('examples'):
                exampleArr = line.split("=")
                exampleArr[1] = ' ' + landmark + '\n'
                confLines[counter] = '='.join(exampleArr)
                changedExample = True
            if changedInitial and changedExample:
                break
        # Text lines must be written in text mode ('w+b' breaks on Python 3).
        with open(gbrowseConf, 'w') as conf:
            conf.writelines(confLines)
    else:
        dataSource = os.path.join(os.path.dirname(gffFile), dbName)
        createNewGBrowseEntry(landmark, dataSource, organismDir, organismName)
def explore_gff(self, gff_path):
    """Print the parent/child feature map and available limits of a GFF file."""
    from BCBio.GFF import GFFExaminer
    examiner = GFFExaminer()
    # First pass: hierarchy of feature types.
    with open(gff_path) as handle:
        pprint.pprint(examiner.parent_child_map(handle))
    # Second pass: what the file can be limited/queried on.
    with open(gff_path) as handle:
        pprint.pprint(examiner.available_limits(handle))
def count_promoters(in_file, out_file):
    """Write a text summary of an input GFF file (feature counts, including
    promoters) to *out_file* using GFFExaminer.available_limits.

    :param in_file: path of the GFF file to summarize
    :param out_file: path of the text file to write the summary to
    """
    examiner = GFFExaminer()
    # One `with` for both handles: the original leaked in_handle whenever
    # examination or the write raised before the explicit close().
    with open(in_file, "r") as fin, open(out_file, "w") as fout:
        fout.write(pformat(examiner.available_limits(fin)))
def t_examiner_with_fasta(self):
    """Perform high level examination of files with FASTA directives."""
    examiner = GFFExaminer()
    pc_map = examiner.parent_child_map(self._gff_file)
    assert pc_map[('UCSC', 'mRNA')] == [('UCSC', 'CDS')]
    limits = examiner.available_limits(self._gff_file)
    # dict.keys() is a non-indexable view on Python 3; materialize it first
    # (the original `.keys()[0][0]` raises TypeError there).
    assert list(limits['gff_id'].keys())[0][0] == 'chr17'
    assert sorted(limits['gff_source_type'].keys()) == \
        [('UCSC', 'CDS'), ('UCSC', 'mRNA')]
def examine_gff_file(gff_file):
    """Pretty-print a high-level summary of a GFF file.

    :param gff_file: path of the GFF file to examine
    :return: None (output goes to stdout)
    """
    examiner = GFFExaminer()
    # `with` closes the handle even if available_limits raises; the
    # original leaked it on that path.
    with open(gff_file) as in_file:
        pprint.pprint(examiner.available_limits(in_file))
def stats(in_file):
    """Run analysis of GFF file and produce a summary of feature types.

    Prints the summary to stdout and terminates the process (exit code 0).
    """
    examiner = GFFExaminer()
    print("\nrunning analysis of GFF file\n")  # plain string: no f-string needed
    # Context manager replaces the manual close, which was skipped on error.
    with open(in_file) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))
    print("\n\n")
    sys.exit(0)
def set_preview(self):
    """Populate ``self.entities`` with the feature types found in the GFF
    file at ``self.path``; on failure set ``self.error``/``self.error_message``.
    """
    try:
        exam = GFFExaminer()
        # `with` closes the handle even when available_limits raises — the
        # original skipped handle.close() on the exception path.
        with open(self.path, encoding="utf-8", errors="ignore") as handle:
            gff_type = exam.available_limits(handle)['gff_type']
        for entity in gff_type:
            # keys are tuples like ('gene',); keep only the type name
            self.entities.append(entity[0])
    except Exception as e:
        self.error = True
        self.error_message = "Malformated GFF ({})".format(str(e))
        traceback.print_exc(file=sys.stdout)
def get_entities(self):
    """get all the entities present in a gff file

    :return: The list of all the entities
    :rtype: List
    """
    exam = GFFExaminer()
    # `with` guarantees closure; the original leaked the handle whenever
    # available_limits raised before the explicit close().
    with open(self.path, encoding="utf-8", errors="ignore") as handle:
        gff_type = exam.available_limits(handle)['gff_type']
    # keys are tuples like ('gene',); keep only the type name
    return [ent[0] for ent in gff_type]
def parse_gff(in_file):
    """Return (exon_count, gene_count) from a GFF file's feature-type summary.

    :param in_file: path of the GFF file to inspect
    :return: tuple (exonNo, geneNo); 0 for a type absent from the file
    """
    examiner = GFFExaminer()
    # Defaults prevent the UnboundLocalError the original raised when the
    # file contained no exon or no gene features.
    exonNo = 0
    geneNo = 0
    with open(in_file) as in_handle:
        gff_features = examiner.available_limits(in_handle)['gff_type']
    for feature in gff_features:
        # feature keys are tuples like ('exon',)
        if 'exon' in feature:
            exonNo = gff_features[feature]
        if 'gene' in feature:
            geneNo = gff_features[feature]
    return exonNo, geneNo
def check_gff(infile):
    """Print an overview of *infile* plus the qualifiers of its first feature,
    then terminate so the user can pick a gene-identification qualifier.

    :param infile: path of the GFF file to inspect
    """
    # GFF overview
    print("GFF overview:\n")
    examiner = GFFExaminer()
    in_handle = open(infile)
    pprint.pprint(examiner.available_limits(in_handle))
    in_handle.close()
    print("")
    # Load GFF and its sequences (GFF.parse accepts a filename directly)
    gff = GFF.parse(infile)
    # Check qualifiers: show the first record's first feature, then stop.
    # NOTE(review): exit() fires on the first record, so only one record is
    # ever shown — presumably intentional; confirm before relying on it.
    for rec in gff:
        print(
            "Example of the GFF's first line available qualifiers from the 9th column:\n"
        )
        print(rec.features[0])
        print(
            "\nPlease select only one of the available qualifiers to be used as gene identification!"
        )
        exit()
def __processGffFilesNew(self, newOrganismDirs):
    """For each new organism directory: build BLAST databases from its
    .faa/.ffn files, then (when both a .gff and a .gbk are present) build a
    GBrowse sqlite database, load it, and register the organism in Chado.

    Python 2 code (print statements, iterator .next()); runs external tools
    and project helpers (GenomeDBUtil, GFFRewriter, runProgram).

    :param newOrganismDirs: directory names under NEW_GENOMIC_DATA_DIR
    """
    for newOrganism in newOrganismDirs:
        # start by creating the BLAST database
        newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
        print newOrganism
        # os.walk(...).next()[2]: the plain files of the top-level directory
        organismFiles = os.walk(newOrganism).next()[2]
        faa = None   # protein FASTA
        ffn = None   # nucleotide FASTA
        gff = None   # annotation
        gbk = None   # GenBank record
        for organismFile in organismFiles:
            extension = os.path.splitext(organismFile)[1]
            if (extension == '.ffn'):
                ffn = organismFile
            elif (extension == '.faa'):
                faa = organismFile
            elif (extension == '.gff'):
                gff = organismFile
            elif (extension == '.gbk'):
                gbk = organismFile
            # stop scanning once one of each file type is found
            if (faa and ffn and gff and gbk):
                break
        if (faa):
            GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
            self.report.addLogEntry('Ran formatdb successully on ' + faa)
        if (ffn):
            GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
            self.report.addLogEntry('Ran formatdb successully on ' + ffn)
        # process the gff and genbank files for creating the databases
        if (gff and gbk):
            # create the sqlite database for GBrowse and create the configuration file
            # for GBrowse hook up
            dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
            dbName = os.path.join(newOrganism, dbName)
            gff = os.path.join(newOrganism, gff)
            parser = GenBank.RecordParser()
            gbk = os.path.join(newOrganism, gbk)
            # organism name and accession come from the GenBank record
            record = parser.parse(open(gbk))
            organismName = record.organism
            accession = record.accession[0]
            self.report.addLogEntry('Found organism name ' + organismName)
            # create a brand new GBrowse configuration file
            examiner = GFFExaminer()
            gffHandle = open(gff)
            # first sequence id in the GFF acts as the GBrowse landmark
            landmark = examiner.available_limits(gffHandle)['gff_id'].keys()[0][0]
            gffRewriter = GFFRewriter(filename=gff, outfile=gff+".sorted.prepared", accession=accession)
            # commented-out historical step kept for reference
            '''gffRewriter.addUnknownCvTerms({ 'user' : settings.DATABASES['default']['USER'], 'password' : settings.DATABASES['default']['PASSWORD'], 'db' : settings.DATABASES['default']['NAME'] })'''
            gffRewriter.addColor({ 'user' : settings.DATABASES['default']['USER'], 'password' : settings.DATABASES['default']['PASSWORD'], 'db' : 'MyGO' })
            error = gffRewriter.getError()
            print error
            # load the rewritten GFF into the sqlite seqfeature store
            gff = gff + ".sorted.prepared"
            args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
            runProgram('bp_seqfeature_load.pl', args)
            self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))
            organismDir = os.path.basename(newOrganism)
            self.report.addLogEntry('Added new GBrowse entry for ' + organismName)
            # now edit the record in Chado by first adding the organism and then adding
            # bulk loading the information from gff3
            id = GenomeDBUtil.addOrganismToChado(gff, organismName)
            GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, id)
def examine_gff(gff_file):
    """Pretty-print GFFExaminer's summary of *gff_file* followed by a blank line.

    :param gff_file: path of the GFF file to examine
    """
    examiner = GFFExaminer()
    # `with` closes the handle even if examination raises; the original
    # leaked it on that path.
    with open(gff_file) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))
    print("")
import pprint
from BCBio.GFF import GFFExaminer
from BCBio import GFF

in_file = "Homo_sapiens.GRCh38.91.chromosome.22.gff3"

# Summarize what the file contains (ids, sources, feature types).
examiner = GFFExaminer()
# `with` replaces the manual open/close pairs, which leaked the handle
# whenever parsing raised before close().
with open(in_file) as in_handle:
    pprint.pprint(examiner.available_limits(in_handle))

# Parse only records whose source column is "ensembl".
limit_info = dict(gff_source=["ensembl"])
with open(in_file) as in_handle:
    for rec in GFF.parse(in_handle, limit_info=limit_info):
        print(rec.features)