Exemple #1
0
 def check(self):
     """Report whether ``self.fileName`` can be examined as a GFF file.

     The file is handed to ``GFFExaminer.available_limits``; an
     ``AssertionError`` raised by the examiner is treated as "not GFF".

     :return: ``"gff"`` when the examination succeeds, ``None`` when the
         examiner raises ``AssertionError``.
     :raises IOError/OSError: if the file cannot be opened (not caught here).
     """
     try:
         # The context manager closes the handle even when the examiner
         # raises, which the previous explicit close() did not.
         with open(self.fileName) as in_handle:
             GFFExaminer().available_limits(in_handle)
     except AssertionError:
         return None
     return "gff"
Exemple #2
0
 def check(self):
     """Report whether ``self.fileName`` can be examined as a GFF file.

     The file is handed to ``GFFExaminer.available_limits``; an
     ``AssertionError`` raised by the examiner is treated as "not GFF".

     :return: ``"gff"`` when the examination succeeds, ``None`` when the
         examiner raises ``AssertionError``.
     :raises IOError/OSError: if the file cannot be opened (not caught here).
     """
     try:
         # The context manager closes the handle even when the examiner
         # raises, which the previous explicit close() did not.
         with open(self.fileName) as in_handle:
             GFFExaminer().available_limits(in_handle)
     except AssertionError:
         return None
     return "gff"
Exemple #3
0
 def t_possible_limits(self):
     """Pretty-print the limit queries available for the test GFF file.

     The examined input is ``self._test_gff_file``.
     """
     limits = GFFExaminer().available_limits(self._test_gff_file)
     # Bare ``print``: a blank-line statement on Python 2; on Python 3 it is
     # only a name lookup and prints nothing.  Deliberately left unchanged.
     print
     pprint.pprint(limits)
Exemple #4
0
def editGBrowseEntry(gffFile, dbName, organismDir, organismName):
    """Point an organism's GBrowse configuration at the first landmark of a GFF file.

    The landmark is the first element of the first ``gff_id`` key reported by
    ``GFFExaminer.available_limits`` for ``gffFile``.

    * If ``<GBROWSE_DIR>/<organismDir lower-cased>.conf`` exists, its
      ``initial landmark`` and ``examples`` settings are rewritten in place.
    * Otherwise ``createNewGBrowseEntry`` is called with a data source path
      built from the GFF file's directory and ``dbName``.

    :param gffFile: path of the GFF file to examine.
    :param dbName: database file name, joined onto the GFF file's directory.
    :param organismDir: organism directory name; selects the ``.conf`` file.
    :param organismName: forwarded unchanged to ``createNewGBrowseEntry``.
    :raises IndexError: if the examiner reports no ``gff_id`` entries.
    """
    # Close the GFF handle deterministically; it used to be left open.
    with open(gffFile) as gffHandle:
        gffIds = GFFExaminer().available_limits(gffHandle)['gff_id']
    # list(...) works on Python 2 and 3 (dict views are not indexable on 3)
    # and still raises IndexError when there are no ids.
    landmark = list(gffIds)[0][0]

    gbrowseConf = os.path.join(GBROWSE_DIR, organismDir.lower() + '.conf')
    if not os.path.isfile(gbrowseConf):
        dataSource = os.path.join(os.path.dirname(gffFile), dbName)
        createNewGBrowseEntry(landmark, dataSource, organismDir, organismName)
        return

    with open(gbrowseConf, 'r') as conf:
        confLines = conf.readlines()

    changedInitial = False
    changedExample = False
    for counter, line in enumerate(confLines):
        # Everything after the first '=' is the value being replaced.
        key, separator, _ = line.partition('=')
        if not separator:
            # Not a "key = value" line; nothing to rewrite.
            continue
        # startswith() replaces the old line[:15] comparison, which could never
        # match the 16-character string 'initial landmark'.
        if line.startswith('initial landmark'):
            confLines[counter] = key + '= ' + landmark + ':1..50,000\n'
            changedInitial = True
        elif line.startswith('examples'):
            confLines[counter] = key + '= ' + landmark + '\n'
            changedExample = True
        if changedInitial and changedExample:
            break

    # Text mode: the lines are str, which a binary-mode file rejects on Python 3.
    with open(gbrowseConf, 'w') as conf:
        conf.writelines(confLines)
def editGBrowseEntry(gffFile, dbName, organismDir, organismName):
    """Point an organism's GBrowse configuration at the first landmark of a GFF file.

    The landmark is the first element of the first ``gff_id`` key reported by
    ``GFFExaminer.available_limits`` for ``gffFile``.

    * If ``<GBROWSE_DIR>/<organismDir lower-cased>.conf`` exists, its
      ``initial landmark`` and ``examples`` settings are rewritten in place.
    * Otherwise ``createNewGBrowseEntry`` is called with a data source path
      built from the GFF file's directory and ``dbName``.

    :param gffFile: path of the GFF file to examine.
    :param dbName: database file name, joined onto the GFF file's directory.
    :param organismDir: organism directory name; selects the ``.conf`` file.
    :param organismName: forwarded unchanged to ``createNewGBrowseEntry``.
    :raises IndexError: if the examiner reports no ``gff_id`` entries.
    """
    # Close the GFF handle deterministically; it used to be left open.
    with open(gffFile) as gffHandle:
        gffIds = GFFExaminer().available_limits(gffHandle)['gff_id']
    # list(...) works on Python 2 and 3 (dict views are not indexable on 3)
    # and still raises IndexError when there are no ids.
    landmark = list(gffIds)[0][0]

    gbrowseConf = os.path.join(GBROWSE_DIR, organismDir.lower() + '.conf')
    if not os.path.isfile(gbrowseConf):
        dataSource = os.path.join(os.path.dirname(gffFile), dbName)
        createNewGBrowseEntry(landmark, dataSource, organismDir, organismName)
        return

    with open(gbrowseConf, 'r') as conf:
        confLines = conf.readlines()

    changedInitial = False
    changedExample = False
    for counter, line in enumerate(confLines):
        # Everything after the first '=' is the value being replaced.
        key, separator, _ = line.partition('=')
        if not separator:
            # Not a "key = value" line; nothing to rewrite.
            continue
        # startswith() replaces the old line[:15] comparison, which could never
        # match the 16-character string 'initial landmark'.
        if line.startswith('initial landmark'):
            confLines[counter] = key + '= ' + landmark + ':1..50,000\n'
            changedInitial = True
        elif line.startswith('examples'):
            confLines[counter] = key + '= ' + landmark + '\n'
            changedExample = True
        if changedInitial and changedExample:
            break

    # Text mode: the lines are str, which a binary-mode file rejects on Python 3.
    with open(gbrowseConf, 'w') as conf:
        conf.writelines(confLines)
    def explore_gff(self, gff_path):
        """Pretty-print the parent/child map, then the available limits, of a GFF file.

        The file at ``gff_path`` is opened once for each of the two examiner
        passes.
        """
        from BCBio.GFF import GFFExaminer

        gff_examiner = GFFExaminer()

        # First pass: parent/child feature relationships.
        with open(gff_path) as gff_handle:
            pprint.pprint(gff_examiner.parent_child_map(gff_handle))

        # Second pass on a fresh handle: the available limits.
        with open(gff_path) as gff_handle:
            limits = gff_examiner.available_limits(gff_handle)
            pprint.pprint(limits)
def count_promoters(in_file, out_file):
    """Write a pretty-printed summary of a GFF file's available limits to a text file.

    The summary is whatever ``GFFExaminer.available_limits`` returns for
    ``in_file``, formatted with ``pformat``.

    :param in_file: path of the input GFF file.
    :param out_file: path of the text file to create or overwrite.
    """
    examiner = GFFExaminer()
    # Both handles are managed together so neither leaks if examining or
    # writing fails; the input is still opened before the output.
    with open(in_file, "r") as in_handle, open(out_file, "w") as fout:
        fout.write(pformat(examiner.available_limits(in_handle)))
Exemple #8
0
 def t_examiner_with_fasta(self):
     """Perform high level examination of files with FASTA directives.

     Checks the parent/child map and the available limits that
     ``GFFExaminer`` reports for ``self._gff_file``.
     """
     examiner = GFFExaminer()
     pc_map = examiner.parent_child_map(self._gff_file)
     assert pc_map[('UCSC', 'mRNA')] == [('UCSC', 'CDS')]
     limits = examiner.available_limits(self._gff_file)
     # list(...) instead of .keys()[0]: dict views cannot be indexed on
     # Python 3, while this form works on both Python 2 and 3.
     assert list(limits['gff_id'])[0][0] == 'chr17'
     assert sorted(limits['gff_source_type'].keys()) == \
             [('UCSC', 'CDS'), ('UCSC', 'mRNA')]
Exemple #9
0
def examine_gff_file(gff_file):
    """
    Pretty-print the available limits that GFFExaminer reports for a GFF file.

    :param gff_file: path of the GFF file to examine
    :return: None; the summary is printed to standard output
    """
    examiner = GFFExaminer()
    # Context manager so the handle is closed even if the examiner raises.
    with open(gff_file) as in_file:
        pprint.pprint(examiner.available_limits(in_file))
def stats(in_file):
    """Print the available limits of a GFF file, then exit the process.

    :param in_file: path of the GFF file to analyse.
    :raises SystemExit: always, with status 0, once the summary is printed.
    """
    examiner = GFFExaminer()

    # Context manager so the handle is closed even if the examiner raises.
    with open(in_file) as in_handle:
        # Plain string: the former f-string had no placeholders.
        print("\nrunning analysis of GFF file\n")
        pprint.pprint(examiner.available_limits(in_handle))

    print("\n\n")
    sys.exit(0)
Exemple #11
0
    def set_preview(self):
        """Collect the GFF entity names of ``self.path`` into ``self.entities``.

        The first element of every ``gff_type`` key reported by
        ``GFFExaminer.available_limits`` is appended to ``self.entities``.
        Any exception is recorded instead of propagated: ``self.error`` is set
        to ``True``, ``self.error_message`` describes the failure, and the
        traceback is printed to standard output.
        """
        try:
            # Context manager so the handle is closed even when the examiner
            # raises; previously close() was skipped on failure.
            with open(self.path, encoding="utf-8", errors="ignore") as handle:
                gff_type = GFFExaminer().available_limits(handle)['gff_type']
            self.entities.extend(entity[0] for entity in gff_type)
        except Exception as e:
            # Deliberately broad: a preview failure is reported, not raised.
            self.error = True
            self.error_message = "Malformated GFF ({})".format(str(e))
            traceback.print_exc(file=sys.stdout)
Exemple #12
0
    def get_entities(self):
        """
        Get all the entities present in the GFF file at ``self.path``.

        An entity is the first element of each ``gff_type`` key reported by
        ``GFFExaminer.available_limits``.

        :return: The list of all the entities
        :rtype: List
        """
        # Context manager so the handle is closed even if the examiner raises.
        with open(self.path, encoding="utf-8", errors="ignore") as handle:
            gff_type = GFFExaminer().available_limits(handle)['gff_type']

        return [ent[0] for ent in gff_type]
Exemple #13
0
    def get_entities(self):
        """
        Get all the entities present in the GFF file at ``self.path``.

        An entity is the first element of each ``gff_type`` key reported by
        ``GFFExaminer.available_limits``.

        :return: The list of all the entities
        :rtype: List
        """
        # Context manager so the handle is closed even if the examiner raises.
        with open(self.path, encoding="utf-8", errors="ignore") as handle:
            gff_type = GFFExaminer().available_limits(handle)['gff_type']

        return [ent[0] for ent in gff_type]
Exemple #14
0
def parse_gff(in_file):
    """Return the values reported for the ``exon`` and ``gene`` feature types.

    The ``gff_type`` mapping from ``GFFExaminer.available_limits`` is scanned;
    for every key containing ``'exon'`` or ``'gene'`` as an element, the
    associated value is taken (the last match wins).  Per the bcbio-gff
    documentation these values are per-type feature counts.

    :param in_file: path of the GFF file to examine.
    :return: ``(exonNo, geneNo)``; each is ``0`` when the file reports no such
        feature type (previously this raised ``UnboundLocalError``).
    """
    # Context manager so the handle is closed even if the examiner raises.
    with open(in_file) as in_handle:
        gff_features = GFFExaminer().available_limits(in_handle)['gff_type']

    # Defaults keep the return well-defined when a type is absent.
    exonNo = 0
    geneNo = 0
    for feature, count in gff_features.items():
        if 'exon' in feature:
            exonNo = count
        if 'gene' in feature:
            geneNo = count

    return exonNo, geneNo
Exemple #15
0
def check_gff(infile):
    """Print a GFF overview and the first feature of the first record, then exit.

    First the available limits reported by ``GFFExaminer`` are pretty-printed.
    Then the file is parsed with ``GFF.parse`` and, for the first record only,
    its first feature is printed as an example of the available qualifiers.

    :param infile: path of the GFF file to inspect.
    :return: ``None`` when the parser yields no records.
    :raises SystemExit: after the first record has been reported.
    """
    # GFF overview
    print("GFF overview:\n")
    examiner = GFFExaminer()
    # Context manager so the handle is closed even if the examiner raises.
    with open(infile) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))
    print("")

    # Load GFF and its sequences; only the first record is ever reported.
    first_record = next(iter(GFF.parse(infile)), None)
    if first_record is None:
        return

    # Check qualifiers
    print(
        "Example of the GFF's first line available qualifiers from the 9th column:\n"
    )
    print(first_record.features[0])
    print(
        "\nPlease select only one of the available qualifiers to be used as gene identification!"
    )
    # Equivalent to the site-provided exit(), which is meant for interactive
    # sessions and is absent when Python runs without the site module.
    raise SystemExit
Exemple #16
0
	def __processGffFilesNew(self, newOrganismDirs):
		"""Build BLAST, GBrowse and Chado data for each new organism directory.

		Each name in ``newOrganismDirs`` is resolved against
		``NEW_GENOMIC_DATA_DIR``.  For the files directly inside that directory:

		* ``.faa`` / ``.ffn`` files are passed to ``GenomeDBUtil.runFormatDB``
		  (protein and nucleotide respectively);
		* when both a ``.gff`` and a ``.gbk`` file exist, the organism name and
		  accession are read from the GenBank record, the GFF is rewritten by
		  ``GFFRewriter`` to ``<gff>.sorted.prepared``, loaded into an SQLite
		  database with ``bp_seqfeature_load.pl``, the organism is added to
		  Chado and a new GBrowse entry is created.

		Progress is recorded through ``self.report.addLogEntry``.

		:param newOrganismDirs: iterable of directory names under
			``NEW_GENOMIC_DATA_DIR``.
		"""
		for newOrganism in newOrganismDirs:
			# start by creating the BLAST database
			newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
			print(newOrganism)
			# First os.walk() entry = the directory itself; [2] is its file list.
			# next(...) replaces the Python-2-only generator .next() method.
			organismFiles = next(os.walk(newOrganism))[2]
			faa = None
			ffn = None
			gff = None
			gbk = None
			for organismFile in organismFiles:
				extension = os.path.splitext(organismFile)[1]
				if extension == '.ffn':
					ffn = organismFile
				elif extension == '.faa':
					faa = organismFile
				elif extension == '.gff':
					gff = organismFile
				elif extension == '.gbk':
					gbk = organismFile
				# Stop scanning once one file of every kind has been found.
				if faa and ffn and gff and gbk:
					break

			if faa:
				GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
				self.report.addLogEntry('Ran formatdb successully on ' + faa)
			if ffn:
				GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
				self.report.addLogEntry('Ran formatdb successully on ' + ffn)

			# process the gff and genbank files for creating the databases
			if gff and gbk:
				# create the sqlite database for GBrowse and create the configuration file
				# for GBrowse hook up
				dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
				dbName = os.path.join(newOrganism, dbName)
				gff = os.path.join(newOrganism, gff)

				parser = GenBank.RecordParser()
				gbk = os.path.join(newOrganism, gbk)
				# Close the GenBank handle once parsed; it used to be left open.
				with open(gbk) as gbkHandle:
					record = parser.parse(gbkHandle)
				organismName = record.organism
				accession = record.accession[0]
				self.report.addLogEntry('Found organism name ' + organismName)

				# create a brand new GBrowse configuration file
				# The landmark comes from the original (not yet rewritten) GFF.
				with open(gff) as gffHandle:
					gffIds = GFFExaminer().available_limits(gffHandle)['gff_id']
				# list(...) works on Python 2 and 3; dict views are not indexable on 3.
				landmark = list(gffIds)[0][0]

				gffRewriter = GFFRewriter(filename=gff, outfile=gff + ".sorted.prepared", accession=accession)

				# Disabled step, kept for reference:
				# gffRewriter.addUnknownCvTerms({
				# 	'user' : settings.DATABASES['default']['USER'],
				# 	'password' : settings.DATABASES['default']['PASSWORD'],
				# 	'db' : settings.DATABASES['default']['NAME']
				# })

				gffRewriter.addColor({
					'user' : settings.DATABASES['default']['USER'],
					'password' : settings.DATABASES['default']['PASSWORD'],
					'db' : 'MyGO'
				})

				error = gffRewriter.getError()
				print(error)

				# From here on work with the rewritten GFF file.
				gff = gff + ".sorted.prepared"

				args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
				runProgram('bp_seqfeature_load.pl', args)
				self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))

				organismDir = os.path.basename(newOrganism)
				self.report.addLogEntry('Added new GBrowse entry for ' + organismName)

				# now edit the record in Chado by first adding the organism and then adding
				# bulk loading the information from gff3
				# Named organismId rather than id to avoid shadowing the builtin.
				organismId = GenomeDBUtil.addOrganismToChado(gff, organismName)
				GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, organismId)
Exemple #17
0
def examine_gff(gff_file):
    """Pretty-print the available limits of a GFF file, followed by a blank line.

    :param gff_file: path of the GFF file to examine.
    """
    examiner = GFFExaminer()
    # Context manager so the handle is closed even if the examiner raises.
    with open(gff_file) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))
        print("")
Exemple #18
0
import pprint
from BCBio.GFF import GFFExaminer
from BCBio import GFF

in_file = "Homo_sapiens.GRCh38.91.chromosome.22.gff3"
examiner = GFFExaminer()

# Summarise what the file contains.  The context manager closes the handle
# even if the examiner raises.
with open(in_file) as in_handle:
    pprint.pprint(examiner.available_limits(in_handle))

# Restrict parsing to features whose source is "ensembl".
limit_info = dict(gff_source=["ensembl"])

# Parse on a fresh handle and print the features of every matching record.
with open(in_file) as in_handle:
    for rec in GFF.parse(in_handle, limit_info=limit_info):
        print(rec.features)