Example no. 1
 def t_possible_limits(self):
     """Calculate possible queries to limit a GFF file.
     """
     gff_examiner = GFFExaminer()
     possible_limits = gff_examiner.available_limits(self._test_gff_file)
     print()
     pprint.pprint(possible_limits)
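For reference, available_limits returns a nested dictionary: each limit type ('gff_id', 'gff_source', 'gff_type', 'gff_source_type') maps one- or two-element tuples to the number of GFF lines carrying those values. A minimal sketch of the shape, with identifiers borrowed from the UCSC example below and illustrative counts:

# Illustrative shape only; real keys and counts depend on the input file
# {'gff_id': {('chr17',): 2000},
#  'gff_source': {('UCSC',): 2000},
#  'gff_source_type': {('UCSC', 'mRNA'): 120, ('UCSC', 'CDS'): 1880},
#  'gff_type': {('mRNA',): 120, ('CDS',): 1880}}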
Example no. 3
 def t_parent_child(self):
     """Summarize parent-child relationships in a GFF file.
     """
     gff_examiner = GFFExaminer()
     pc_map = gff_examiner.parent_child_map(self._test_gff_file)
     print()
     pprint.pprint(pc_map)
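For reference, parent_child_map returns a dictionary keyed by (source, parent type) tuples, each mapping to a list of (source, child type) tuples; Example no. 9 below asserts exactly this shape:

# Illustrative; keys depend on the input file
# {('UCSC', 'mRNA'): [('UCSC', 'CDS')]}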
Example no. 4
def editGBrowseEntry(gffFile, dbName, organismDir, organismName):
    examiner = GFFExaminer()
    gffHandle = open(gffFile)
    # available_limits maps ('landmark',) tuples to counts under 'gff_id';
    # take the first landmark name (list() is needed under Python 3)
    landmark = list(examiner.available_limits(gffHandle)['gff_id'].keys())[0][0]
    gbrowseConf = os.path.join(GBROWSE_DIR, organismDir.lower() + '.conf')
    if os.path.isfile(gbrowseConf):
        conf = open(gbrowseConf, 'r')
        confLines = conf.readlines()
        conf.close()
        changedInitial = False
        changedExample = False
        for counter, line in enumerate(confLines):
            # 'initial landmark' is 16 characters, so the original line[:15]
            # comparison could never match; use startswith() instead
            if line.startswith('initial landmark'):
                initialLandmarkArr = line.split("=")
                initialLandmarkArr[1] = ' ' + landmark + ':1..50,000\n'
                confLines[counter] = '='.join(initialLandmarkArr)
                changedInitial = True
            elif line.startswith('examples'):
                exampleArr = line.split("=")
                exampleArr[1] = ' ' + landmark + '\n'
                confLines[counter] = '='.join(exampleArr)
                changedExample = True
            if changedInitial and changedExample:
                break
        conf = open(gbrowseConf, 'w')  # text mode; 'w+b' would reject str lines
        conf.writelines(confLines)
        conf.close()
    else:
        dataSource = os.path.join(os.path.dirname(gffFile), dbName)
        createNewGBrowseEntry(landmark, dataSource, organismDir, organismName)
Example no. 5
def count_promoters(in_file, out_file):
    """Create a text file detailing the number of promoters in an input GFF file."""
    from pprint import pformat  # renders the limits dict as readable text
    examiner = GFFExaminer()
    # open the input GFF file
    in_handle = open(in_file, "r")
    # write a text summary, e.g. the number of promoter features in the file
    with open(out_file, "w") as fout:
        fout.write(pformat(examiner.available_limits(in_handle)))
    in_handle.close()
Example no. 6
    def explore_gff(self, gff_path):
        import pprint  # needed for the pprint calls below
        from BCBio.GFF import GFFExaminer
        examiner = GFFExaminer()
        with open(gff_path) as h:
            parentchild = examiner.parent_child_map(h)
            pprint.pprint(parentchild)
        with open(gff_path) as h:
            pprint.pprint(examiner.available_limits(h))
Example no. 7
def examine_gff_file(gff_file):
    """
    Examine a GFF file and pretty-print its available limits.
    :param gff_file: path to the GFF file to examine
    :return: None
    """
    examiner = GFFExaminer()
    in_file = open(gff_file)
    pprint.pprint(examiner.available_limits(in_file))
    in_file.close()
Example no. 8
 def check(self):
     try:
         examiner = GFFExaminer()
         in_handle = open(self.fileName)
         examiner.available_limits(in_handle)
         #print("\nType of file detected:", "gff", "\n")
         in_handle.close()
         return "gff"
     except AssertionError: 
         return None
Example no. 9
 def t_examiner_with_fasta(self):
     """Perform high level examination of files with FASTA directives.
     """
     examiner = GFFExaminer()
     pc_map = examiner.parent_child_map(self._gff_file)
     assert pc_map[('UCSC', 'mRNA')] == [('UCSC', 'CDS')]
     limits = examiner.available_limits(self._gff_file)
     assert list(limits['gff_id'].keys())[0][0] == 'chr17'
     assert sorted(limits['gff_source_type'].keys()) == \
             [('UCSC', 'CDS'), ('UCSC', 'mRNA')]
Example no. 11
def stats(in_file):
    """run analysis of GFF file and produce a summary of feature types"""

    examiner = GFFExaminer()
    in_handle = open(in_file)

    print(f"\nrunning analysis of GFF file\n")

    pprint.pprint(examiner.available_limits(in_handle))
    print("\n\n")
    in_handle.close()
    sys.exit(0)
Example no. 12
    def set_preview(self):
        """Summary"""
        try:
            exam = GFFExaminer()
            handle = open(self.path, encoding="utf-8", errors="ignore")
            gff_type = exam.available_limits(handle)['gff_type']
            for entity in gff_type:
                self.entities.append(entity[0])

            handle.close()
        except Exception as e:
            self.error = True
            self.error_message = "Malformated GFF ({})".format(str(e))
            traceback.print_exc(file=sys.stdout)
Example no. 13
    def get_entities(self):
        """
        Get all the entities present in a GFF file.

        :return: The list of all the entities
        :rtype: List
        """
        exam = GFFExaminer()
        handle = open(self.path, encoding="utf-8", errors="ignore")
        entities = []
        gff_type = exam.available_limits(handle)['gff_type']
        for ent in gff_type:
            entities.append(ent[0])

        handle.close()

        return entities
Example no. 14
def parse_gff(in_file):
    examiner = GFFExaminer()
    in_handle = open(in_file)

    gff = examiner.available_limits(in_handle)
    gff_features = gff['gff_type']
    # print(gff_features)
    # keys of gff_features are one-element tuples such as ('exon',); values are counts
    exonNo = 0  # defaults avoid a NameError when a feature type is absent
    geneNo = 0
    for feature in gff_features:
        if 'exon' in feature:
            exonNo = gff_features[feature]

        if 'gene' in feature:
            geneNo = gff_features[feature]

    in_handle.close()
    return exonNo, geneNo
Example no. 16
 def addUnknownCvTerms(self, dbInfo={'user' : 'oberliat', 'password' : 'password', 'db' : 'chado'}):
     examiner = GFFExaminer()
     file = open(self.filename)
     
     try:
         conn = psycopg2.connect(database=dbInfo['db'], user=dbInfo['user'], password=dbInfo['password'], host='localhost')
         cur = conn.cursor()        
     except Exception as e:
         self.error = True
         self.error_msg = "Unable to connect to the database " + dbInfo['db']
         sys.exit(1)
Example no. 17
def check_gff(infile):

    # GFF overview
    print("GFF overview:\n")
    examiner = GFFExaminer()
    in_handle = open(infile)
    pprint.pprint(examiner.available_limits(in_handle))
    in_handle.close()
    print("")

    # Load GFF and its sequences
    gff = GFF.parse(infile)

    # Check qualifiers
    for rec in gff:
        print(
            "Example of the GFF's first line available qualifiers from the 9th column:\n"
        )
        print(rec.features[0])
        print(
            "\nPlease select only one of the available qualifiers to be used as gene identification!"
        )
        exit()
Example no. 18
    def t_parent_child_file_modes(self):
        """Summarize parent-child relationships in a GFF file.
        """
        gff_examiner = GFFExaminer()
        # Use the loaded-from-filename as reference
        pc_map = gff_examiner.parent_child_map(self._test_gff_file)

        with open(self._test_gff_file, "rt") as handle:
            assert pc_map == gff_examiner.parent_child_map(handle)

        with open(self._test_gff_file, "rb") as handle:
            if six.PY2:
                assert pc_map == gff_examiner.parent_child_map(handle)
            else:
                try:
                    gff_examiner.parent_child_map(handle)
                except TypeError as e:
                    assert str(
                        e) == "input handle must be opened in text mode", e
                else:
                    assert False, "expected TypeError to be raised"
Example no. 19
import pprint
import sys
from BCBio.GFF import GFFExaminer

with open(sys.argv[1], 'r') as handle:
    examiner = GFFExaminer()
    pprint.pprint(examiner.parent_child_map(handle))
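A minimal usage sketch for this script, assuming it is saved as examine_gff.py (the script and file names are illustrative):

python examine_gff.py annotations.gff3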
Example no. 20
import pprint
from BCBio.GFF import GFFExaminer
 
in_file = "Nagalakshmi_2008_UTRs.gff3"
examiner = GFFExaminer()
in_handle = open(in_file)
pprint.pprint(examiner.parent_child_map(in_handle))
in_handle.close()

from BCBio import GFF
 
in_file = "Nagalakshmi_2008_UTRs.gff3"
 
in_handle = open(in_file)
for rec in GFF.parse(in_handle):
    print(rec)
in_handle.close()
Example no. 21
def examine_gff(gff_file):
    examiner = GFFExaminer()
    in_handle = open(gff_file)
    pprint.pprint(examiner.available_limits(in_handle))
    print("")
    in_handle.close()
Example no. 22
def get_seq(d):
    fo = open("G:/master_2/2eme semestre_project/halima__/infos/"+d, "w")
    c=0
    erra=dict()
    id_spe=0
    alll=dict()
    all_blocks=dict()
    #in_file = "/homes/biertank/halima/Downloads/halima_saker_project_/database/"+d
    in_file = "G:/master_2/2eme semestre_project/halima__/"+d
    examiner = GFFExaminer()
    in_handle = open(in_file)       
    for rec in GFF.parse(in_handle):
        t=0
        #print c
        for record in rec.features:
            elem_metadatas = list()
            keyss=('clst_id', 'SubjectScore','SubjectOrganism')
        #print record.type
            for key in sorted(record.qualifiers.keys()):
                if key in keyss:
                    if key=="SubjectOrganism":  
                        load_profile = open('G:/master_2/2eme semestre_project/halima__/speciess.gff')
                        len_org=len(record.qualifiers["SubjectOrganism"])               
                        read_it = load_profile.read()
                        myLine =list()
                        myscore=list()
                        purse=dict()
                        for val in record.qualifiers[key]:
                            i=1
                            for line in read_it.splitlines():
                                if line == record.qualifiers['source'][0]:
                                    id_spe=i
                                if line == val:
                                    myLine.append(i)
                                #print (val, "  -------------------------  ",i)
                                    break
                                i=i+1   
                        elem_metadatas.append(str(key) + '=' + str(myLine)) 
                    else:               
                        elem_metadatas.append(str(key) + '=' + str(record.qualifiers[key]))
            load_profile.close()
            
            myscore=record.qualifiers["SubjectScore"]
            for i in range(0,len(myLine)):
                if float(myscore[i])>0.5:
                    purse[myLine[i]]=str(myscore[i])    
            alll[rec.id,record.qualifiers["clst_id"][0]]=purse              
            for i in range(0,len(myLine)):                      
                t=t+float(myscore[i])
            print (purse)    
            min=0
            max=1
            u=0
            z=1
            lst=sorted(purse.keys())
            blocks=dict()
            list_elmt=list()
            for u in range(0,len(lst)-1):
                #print lst[u]
                #print min, "--",  max
                if lst[max]-lst[max-1]<=4:      
                    #print u        
                
                    if max-min==1:
                        #print (u,"-----------------------")
                        list_elmt.append(lst[min])
                        list_elmt.append(lst[max])
                    else:
                        list_elmt.append(lst[max])
                    max=max+1                       
                else:
                
                    if max-min>1:
                        blocks[z]=list_elmt
                        list_elmt=list()
                        z=z+1
                    min=max
                    max=max+1           
            
            all_blocks[rec.id]=blocks
            #pprint.pprint(purse)
        c = c + 1

        #print (c,"\n",rec.id,"\n",d,"\n",id_spe)
        #print blocks
        bboolean="true"
        for w in range (1,len(blocks)+1):       
            if id_spe in blocks[w]:
                ind_block_spe=w
                len_block_spe=len(blocks[w])
                break
            else:
                w=0
        pp=0
        #print rec.id
        #print w
        #print len_block_spe,"_____",len_org
        tot=0
        inter_block=list()
        inter_org=list()        
        if w==0:
            bboolean="false"
            #print id_spe,"_____________________________________________________________"
        else:
            if float(len_block_spe)/float(len_org) >= 0.5:
                bboolean="true"
            #   print id_spe,"_____________________________________________________________"
            else:
                for f in range(1,len(blocks)+1):
                    if f != w:
                        for j in blocks[f]:
                            #print float(purse[j])
                            tot=tot+float(purse[j])
                        
                        if tot/len(blocks[f])>0.45:
                            #print tot/len(blocks[f])
                            #print id_spe,"_____________________________________________________________"
                            #print tot/len(blocks[f])
                            inter_block.append(str(f))
                            #print blocks[f]
                        #if sum(x for x in (myscore[]))/len(blocks[pp]) < 0.45
                        tot=0
                for i, ff in enumerate(myLine):
                    lstt = sorted(purse.keys())
                    #print lstt
                    #print len(lstt)
                    #print len(myLine)
                    if ff not in lstt:
                        # ff has no entry in purse (its score was <= 0.5), so use
                        # the raw score; the original purse[ff] raised a KeyError
                        if float(myscore[i]) > 0.45:
                            inter_org.append(str(ff))
                #print inter_block,"___",inter_org      
                if len(inter_block)==0 and len(inter_org)==0:
                    bboolean="true"
                else:
                    #print id_spe,"_____________________________________________________________"
                    bboolean="false"            
                        
                    
                            
                
                
                    
        #print ind_block_spe    
        #print rec.id,"____",len_org, "__",len_block_spe, "__",id_spe, "__",blocks,"___",sorted(purse.iterkeys()),"____",len(blocks)
        #print alll
        #fo.write(str(all_blocks))
        #fo.write(rec.id)
        #fo.write("")
        if bboolean=="false":
            erra[rec.id] = record.qualifiers['source'][0]
    #   print bboolean, "____",w
    #print dir(rec)
    #print record.__dict__['qualifiers']['source']
        
    print (erra)   
    if erra:
            for idd in erra:
                org_pos=dict()
                fol = open("G:/master_2/2eme semestre_project/halima__/fasta_file/"+idd+".fas", "w")
                i=0
                print (erra[idd])
                for line in fileinput.input(['C:/Users/User/Desktop/Desktop/profiles.gff']):
                    values = line.split("\t")
                    if values[0]==idd:# in values:
                        org_pos[values[3]]=values[1]
                        #fo.write(idd+">\n"+values[1])
                        
                        print (line)
                        #print ("___________"+str(i)+"____________")
                        #break
        
                print (org_pos)   
                for za in org_pos:#range(0, len(org_pos)):
                    print ("sequence__________::::")
                    for line in fileinput.input(['C:/Users/User/Desktop/Desktop/org.gff']):
                         valuees = line.split("\t")
                         if valuees[0]==za:                             
                             chrom=valuees[3][:-4]
                             start_end=org_pos[za].split("..")
                             #if start_end[0]>start_end[1]:
                             startt=start_end[0]
                             endd=start_end[1]
                            # else:
                                # startt=start_end[0]
                                # endd=start_end[1]
                             handle = Entrez.efetch(db="nuccore",
							id=chrom,
							rettype="gb",
							retmode="text",
							seq_start=startt, 
							seq_stop=endd
							)
                             whole_sequence = SeqIO.read(handle, "genbank")
                             print (whole_sequence.seq)
                             fol.write(">"+org_pos[za]+"\n"+whole_sequence.seq+"\n")
                fol.close()
                

    in_handle.close()
    fo.close()
Example no. 23
import pprint
from BCBio.GFF import GFFExaminer
from BCBio import GFF

in_file = "Homo_sapiens.GRCh38.91.chromosome.22.gff3"
examiner = GFFExaminer()
in_handle = open(in_file)
pprint.pprint(examiner.available_limits(in_handle))
in_handle.close()

limit_info = dict(gff_source=["ensembl"])

in_handle = open(in_file)
for rec in GFF.parse(in_handle, limit_info=limit_info):
    print(rec.features)

in_handle.close()
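The keys accepted by limit_info mirror those reported by available_limits (gff_id, gff_source, gff_type, gff_source_type), so parsing can equally be restricted by feature type. A minimal sketch, assuming the file contains gene features:

limit_info = dict(gff_type=["gene"])

in_handle = open(in_file)
for rec in GFF.parse(in_handle, limit_info=limit_info):
    print(rec.features)

in_handle.close()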
Example no. 24
def load_data(n_row=None, cleaned=True):
    # https://lncipedia.org/download
    data_dict = {
        'id': [],
        'name': [],
        'length': [],
        'ratio_g': [],
        'ratio_t': [],
        'ratio_c': [],
        'ratio_a': [],
        'number_exons': [],
        'chromosom': [],
        'start_pos': [],
        'end_pos': [],
        'length_from_pos': [],
        'number_introns': [],
        'mean_exon_length': [],
        'mfe': []
    }
    fasta_data = SeqIO.parse("data/lncipedia_5_2.fasta", "fasta")
    bed_raw_data = BedTool('data/lncipedia.bed')
    examiner = GFFExaminer()
    in_handle = open('data/lncipedia_5_2_hg38.gff')
    annotation_data = {}
    for i, rec in enumerate(GFF.parse(in_handle)):
        # chromosom e.g. chr1
        for feature in rec.features:
            # lncRNA eg. LNC1725
            if not feature.type == 'lnc_RNA':
                break

            exon_locations = []
            lnc_id = feature.id
            for sub_feature in feature.sub_features:
                if sub_feature.type == 'exon':
                    exon = (sub_feature.location.start,
                            sub_feature.location.end)
                    exon_locations.append(exon)

            annotation_data[lnc_id] = exon_locations

    in_handle.close()
    bed_data = {}

    for record in bed_raw_data:
        bed_data[record.name] = {
            'number_exons': int(record.fields[9]),
            'chromosom': record.fields[0],
            'start_pos':
            int(record.fields[1]),  # BED start is 0-based, i.e. -1 compared to GFF and the online records
            'end_pos': int(record.fields[2])
        }

    for i, record in enumerate(fasta_data):
        length = len(record.seq)
        data_dict['length'].append(length)
        data_dict['id'].append(record.id)
        data_dict['name'].append(record.name)
        if record.name in bed_data:
            for bed_feature in [
                    'number_exons', 'chromosom', 'start_pos', 'end_pos'
            ]:
                data_dict[bed_feature].append(
                    bed_data[record.name][bed_feature])

            end_pos = bed_data[record.name]['end_pos']
            start_pos = bed_data[record.name]['start_pos']
            exon_locations = annotation_data[record.id]
            data_dict['length_from_pos'].append(end_pos - start_pos)
            data_dict['number_introns'].append(
                calc_number_introns(start_pos, end_pos, exon_locations))
            data_dict['mean_exon_length'].append(
                calc_mean_exon_length(exon_locations))
        else:
            for feature in [
                    'number_exons', 'chromosom', 'start_pos', 'end_pos',
                    'length_from_pos', 'number_introns', 'mean_exon_length'
            ]:
                data_dict[feature].append(-1)

        count_g = 0
        count_a = 0
        count_t = 0
        count_c = 0

        for c in record.seq:
            if c == 'G':
                count_g += 1
            elif c == 'T':
                count_t += 1
            elif c == 'C':
                count_c += 1
            elif c == 'A':
                count_a += 1

        data_dict['ratio_g'].append(count_g / length * 100)
        data_dict['ratio_t'].append(count_t / length * 100)
        data_dict['ratio_c'].append(count_c / length * 100)
        data_dict['ratio_a'].append(count_a / length * 100)

        if n_row:
            if i == n_row:
                break

    list_of_lmfes = pickle.load(open("data/list_of_mfes2.pickle", "rb"))
    data_dict['mfe'].extend(list_of_lmfes)

    df = pd.DataFrame.from_dict(data_dict)
    # run only for rows where we have valid chromosomes
    df['chromosom'].loc[df['chromosom'] != -1] = df['chromosom'].loc[
        df['chromosom'] != -1].apply(lambda x: x.split('chr')[1])
    if cleaned:
        df = df[(df['chromosom'] != 'X') & (df['chromosom'] != 'Y')]
        df['chromosom'] = pd.to_numeric(df['chromosom'])
        # Also remove rows with invalid mfe and chromosomes
        df = df.loc[df['chromosom'] != -1].loc[
            df['mfe'] != -1].iloc[:, 2:].apply(lambda x:
                                               (x - x.mean()) / x.std(),
                                               axis=0)

    return df
Example no. 25
	def __processGffFilesNew(self, newOrganismDirs):
		for newOrganism in newOrganismDirs:
			# start by creating the BLAST database
			newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
			print(newOrganism)
			organismFiles = next(os.walk(newOrganism))[2]
			faa = None
			ffn = None
			gff = None
			gbk = None
			for organismFile in organismFiles:
				extension = os.path.splitext(organismFile)[1]
				if (extension == '.ffn'):
					ffn = organismFile
				elif (extension == '.faa'):
					faa = organismFile
				elif (extension == '.gff'):
					gff = organismFile
				elif (extension == '.gbk'):
					gbk = organismFile
				if (faa and ffn and gff and gbk):
					break
			
			if (faa):
				GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
				self.report.addLogEntry('Ran formatdb successfully on ' + faa)
			if (ffn):
				GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
				self.report.addLogEntry('Ran formatdb successfully on ' + ffn)
				
			# process the gff and genbank files for creating the databases
			if (gff and gbk):
				# create the sqlite database for GBrowse and create the configuration file
				# for GBrowse hook up
				dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
				dbName = os.path.join(newOrganism, dbName)
				gff = os.path.join(newOrganism, gff)
				
				parser = GenBank.RecordParser()
				gbk = os.path.join(newOrganism, gbk)
				record = parser.parse(open(gbk))
				organismName = record.organism
				accession = record.accession[0]
				self.report.addLogEntry('Found organism name ' + organismName)
				
				# create a brand new GBrowse configuration file
				examiner = GFFExaminer()
				gffHandle = open(gff)
				landmark = list(examiner.available_limits(gffHandle)['gff_id'].keys())[0][0]
				
				gffRewriter = GFFRewriter(filename=gff, outfile=gff+".sorted.prepared" , accession=accession)
	
				'''gffRewriter.addUnknownCvTerms({
					'user' : settings.DATABASES['default']['USER'], 
					'password' : settings.DATABASES['default']['PASSWORD'], 
					'db' : settings.DATABASES['default']['NAME']
				})'''
			
				gffRewriter.addColor({
					'user' : settings.DATABASES['default']['USER'],
					'password' : settings.DATABASES['default']['PASSWORD'],
					'db' : 'MyGO'
				})
			
				error = gffRewriter.getError()
				print(error)
				
				gff = gff + ".sorted.prepared"
				
				args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
				runProgram('bp_seqfeature_load.pl', args)
				self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))
				
				organismDir = os.path.basename(newOrganism)
				self.report.addLogEntry('Added new GBrowse entry for ' + organismName)
				
				# now edit the record in Chado by first adding the organism and then adding
				# bulk loading the information from gff3
				id = GenomeDBUtil.addOrganismToChado(gff, organismName)
				GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, id)
Example no. 26
parser.add_argument("-p",
                    "--prefix",
                    action="store",
                    dest="prefix",
                    help="Prefix of output files",
                    default="prefix")
parser.add_argument("-l",
                    "--length_distribution_file_prefix",
                    action="store",
                    dest="len_distr_file",
                    help="Output file with lengths distibutions",
                    default="length_distribution")

args = parser.parse_args()

examiner = GFFExaminer()

with open(args.gff, "r") as in_fd:
    pprint.pprint(examiner.parent_child_map(in_fd))

with open(args.gff, "r") as in_fd:
    record_dict = dict([(record.id, record) for record in GFF.parse(in_fd)])

gene_dict = OrderedDict({})
for record_id in record_dict:
    for feature in record_dict[record_id].features:
        if feature.type == "gene":
            gene_dict[feature.qualifiers["Name"][0]] = OrderedDict({})
            for sub_feature in feature.sub_features:
                gene_dict[feature.qualifiers["Name"][0]][
                    sub_feature.type] = len(sub_feature)
Example no. 27
def gff3_to_feature(gff3, mapid_data, ftype):

    feature = []

    examiner = GFFExaminer()

    fh = open(gff3, 'r+')
    for refseq_feature in GFF.parse(fh):
        # print(refseq_feature)
        refseq_id = refseq_feature.id
        refseq_obj_id = refseq_id
        if ('chromosome' in mapid_data
                and refseq_id in mapid_data['chromosome']):
            refseq_obj_id = mapid_data['chromosome'][refseq_id]

        if (ftype == 'chromosome'):
            feature.append({'name': refseq_feature.id, 'type': ftype})
            continue

        if (ftype == 'gene' or ftype == 'mRNA'):
            for gene_feature in refseq_feature.features:
                # skip the chromosome feature for tripal
                if ((ftype == 'gene') and (gene_feature.type == 'gene')):
                    feature_loc = parse_feature_location(
                        refseq_id, refseq_obj_id, gene_feature.location)
                    feature.append({
                        'name': gene_feature.id,
                        'type': gene_feature.type,
                        'loc': feature_loc
                    })
                    continue

                if (ftype == 'mRNA'):
                    for mrna_feature in gene_feature.sub_features:
                        feature_loc = parse_feature_location(
                            refseq_id, refseq_obj_id, mrna_feature.location)
                        sub_feature_list = []

                        gene_obj_id = gene_feature.id
                        if ('gene' in mapid_data
                                and gene_feature.id in mapid_data['gene']):
                            gene_obj_id = mapid_data['gene'][gene_feature.id]

                        for sub_feature in mrna_feature.sub_features:
                            if (sub_feature.type == 'CDS'):
                                sub_feature_loc = parse_feature_location(
                                    refseq_id,
                                    refseq_obj_id,
                                    sub_feature.location,
                                    phase=int(
                                        sub_feature.qualifiers['phase'][0]))
                                sub_feature_list.append({
                                    'name': sub_feature.id,
                                    'type': sub_feature.type,
                                    'loc': sub_feature_loc
                                })
                            else:
                                sub_feature_loc = parse_feature_location(
                                    refseq_id, refseq_obj_id,
                                    sub_feature.location)
                                sub_feature_list.append({
                                    'name': sub_feature.id,
                                    'type': sub_feature.type,
                                    'loc': sub_feature_loc
                                })

                        feature.append({
                            'name': mrna_feature.id,
                            'type': mrna_feature.type,
                            'loc': feature_loc,
                            'sub_features': sub_feature_list,
                            'parent': {
                                '_id': gene_obj_id,
                                'name': gene_feature.id
                            }
                        })
                        #pprint.pprint(feature)
                        #sys.exit()

    fh.close()
    return (feature)
Example no. 28
def main():
    opt = opt_check(get_optparser())

    gfffile = opt.gfffile

    bedfile = opt.bedfile

    gffprefn = pybedtools.BedTool(gfffile).remove_invalid()

    gffd = dict()

    featurecluster = dict()

    featurenoparent = dict()

    examiner = GFFExaminer()

    outio = open(opt.outfile, 'w')

    openBED = open(bedfile, 'r')

    summitbedoutfile = "summit" + bedfile

    bedoutio = open(summitbedoutfile, 'w')

    for line in openBED:

        if line.startswith("#"):
            continue

        strings = line.strip().split("\t")

        peakchromosome = strings[0]

        peakstart = int(strings[1])

        peakend = int(strings[2])

        mid1 = int((peakstart + peakend) / 2)

        mid2 = int((peakstart + peakend) / 2)

        print(peakchromosome, mid1, mid2, sep="\t", file=bedoutio)

    bedoutio.close()

    derivesfeature = dict()

    for gffinf in gffprefn:

        # a = gffinf.fields

        attrs = gffinf.attrs

        #    continue
        featuretype = gffinf[2]

        if "Parent" not in attrs:
            featurenoparent[featuretype] = 1

        if featuretype in gffd:

            gffd[featuretype] += 1

        else:

            gffd[featuretype] = 1

        if "Derives_from" in attrs:
            derivesfeature[featuretype] = 1
            # print (featuretype)
    #
    # pprint.pprint(featurenoparent)

    updown = list()

    overlap = list()

    skip = list()

    other = list()

    mkintron = False  # default, so the "if not mkintron" check below cannot hit an unbound name

    if opt.profile:

        profile = open(opt.profile, 'r')

        for lin in profile.readlines():

            lin = lin.rstrip('\n')

            (typenow, inf) = lin.split(':')

            if typenow == 'updown':
                updown = inf.split(',')

            if typenow == 'overlap':
                overlap = inf.split(',')

            if typenow == 'skip':
                skip = inf.split(',')

            if typenow == 'other':
                other = inf.split(',')

            if typenow == 'makeintron':
                mkintron = inf


    else:

        for featuretype in gffd:

            while True:

                print("#" * 36)

                print("Find ", featuretype, gffd[featuretype], "in genome")

                print("please choose model: \n"
                      "1) calculate up and downstream, \n"
                      "2) overlap, \n"
                      "3) skip, \n"
                      "4) count this type as other")

                # eval() on an empty reply raises SyntaxError, so the original
                # "eval(input(...)) or default" never fell back; plain input()
                # returns '' on an empty reply and the default then applies
                if featuretype == 'chromosome':

                    choose = input("suggest 3: ") or 3

                elif featuretype in derivesfeature:

                    choose = input("suggest 3: ") or 3

                elif featuretype in featurenoparent:

                    choose = input("suggest 1: ") or 1

                elif featuretype in ['exon',
                                     'CDS',
                                     'intron',
                                     'five_prime_UTR',
                                     'three_prime_UTR']:

                    choose = input("suggest 2: ") or 2

                else:

                    choose = input("suggest 2: ") or 2

                # choose = input()

                choose = int(choose)

                if choose == 1:

                    updown.append(featuretype)

                    print(featuretype, "calculate up and downstream")

                    print()

                    break

                elif choose == 2:

                    overlap.append(featuretype)

                    print(featuretype, "overlap")

                    print()

                    break

                elif choose == 3:

                    skip.append(featuretype)

                    print(featuretype, "skip this featuretype")

                    print()

                    break

                elif choose == 4:

                    other.append(featuretype)

                    print(featuretype, " count this featuretype as other")

                    print()

                    break

                else:

                    print("Please input 1,2,3,4 model")

                    print()

    if not mkintron:

        if 'intron' not in gffd:

            while True:

                print("Do not find intron annotation, suggest make intron annotation. y(es) or n(o)")

                intronyes = input("suggest yes: ") or 'yes'  # eval('yes') would raise NameError

                if intronyes == 'yes' or intronyes == 'y':

                    mkintron = True

                    break

                elif intronyes == 'no' or intronyes == 'n':

                    mkintron = False

                    break

                else:

                    continue

    updownfile = gfffile + "updown"

    updownio = open(updownfile, 'w')

    gffinio = open(gfffile, 'r')

    for line in gffinio:

        if line.startswith("#"):
            continue

        line = line.rstrip('\n')

        linecontain = line.split("\t")

        if linecontain[2] in updown:
            print(line, file=updownio)

    updownio.close()

    if mkintron:

        gffinio1 = open(gfffile, 'r')

        print("make intron file")

        nointronfile = gfffile + "notinron"

        genefile = gfffile + "gene"

        nointronio = open(nointronfile, 'w')

        geneio = open(genefile, 'w')

        for line in gffinio1:

            if line.startswith("#"):
                continue

            line = line.rstrip('\n')

            linecontain = line.split("\t")

            if linecontain[2] in ['exon',
                                  'CDS',
                                  'five_prime_UTR',
                                  'three_prime_UTR']:
                print(line, file=nointronio)

            if linecontain[2] in ['gene']:
                print(line, file=geneio)

        genefn = pybedtools.BedTool(genefile)

        nointronfn = pybedtools.BedTool(nointronfile)

        intronfn = genefn.subtract(nointronfn)

        intronfn.saveas('tmp_intron.gff')

        # intronfile = gfffile+"intron"

        intronin = open('tmp_intron.gff', 'r')

        # intronout = open (intronfile, 'w')



        gffinio2 = open(gfffile, 'r')

        overlapfile = gfffile + "overlap"

        overlapio = open(overlapfile, 'w')

        for line in gffinio2:

            if line.startswith("#"):
                continue

            line = line.rstrip('\n')

            linecontain = line.split("\t")

            if linecontain[2] in overlap:
                print(line, file=overlapio)

            if linecontain[2] in updown:
                print(line, file=overlapio)

            if linecontain[2] in other:
                print(line, file=overlapio)

        for line in intronin:
            line = line.rstrip('\n')

            b = line.replace('\tgene\t', '\tintron\t')

            print(b, file=overlapio)

        overlapio.close()

        overlap.append('intron')

        # os.remove('tmp_intron.gff')

        os.remove(genefile)

        # os.remove(nointronfile)

    else:

        gffinio2 = open(gfffile, 'r')

        overlapfile = gfffile + "overlap"

        overlapio = open(overlapfile, 'w')

        for line in gffinio2:

            if line.startswith("#"):
                continue

            line = line.rstrip('\n')

            linecontain = line.split("\t")

            if linecontain[2] in overlap:
                print(line, file=overlapio)

            if linecontain[2] in updown:
                print(line, file=overlapio)

            if linecontain[2] in other:
                print(line, file=overlapio)

        overlapio.close()

    print("updown", updown, file=outio)

    print("overlap", overlap, file=outio)

    print("skip", skip, file=outio)

    print("other", other, file=outio)

    overlapfn = pybedtools.BedTool(overlapfile).sort()

    updownfn = pybedtools.BedTool(updownfile).sort()

    summitfn = pybedtools.BedTool(summitbedoutfile).sort()

    intergenic_summitfn = summitfn.subtract(overlapfn)

    intergenic_summitfn.saveas("intergenic_summitfn.txt")

    nearbyfn = intergenic_summitfn.closest(updownfn, d=True, stream=True)

    # nearbyfn.saveas("nearby.txt")

    d = defaultdict(set)

    bedfields = summitfn.field_count()

    type_idx = bedfields + 2

    bedintersectgff = summitfn.intersect(overlapfn, wao=True)

    for feature in bedintersectgff:

        featuretype = feature[type_idx]

        key = '\t'.join(feature[:bedfields])

        if featuretype in overlap:

            d[key].update([featuretype])
            # print ("overlap")

        elif featuretype in updown:

            d[key].update([featuretype])
            # print ("updown")

        elif featuretype in other:

            d[key].update(['other'])
            # print ("other")

        elif featuretype in skip:
            print(featuretype, "skip")
            # d[key].update(['.'])
            continue

        else:

            continue

            # d[key].update(['.'])

    npeaks = float(len(d))

    count_d = defaultdict(int)

    for peak, featuretypes in list(d.items()):

        if featuretypes == set('.'):

            featuretype = 'unannotated'

            continue

        else:

            featuretype = labelfilter(featuretypes)

        count_d[featuretype] += 1

    results = list(count_d.items())

    # results.sort(key=lambda x: x[1])
    results = sorted(results)
    labels, counts = list(zip(*results))

    labels = []
    counts_to_use = []

    nearpeakd = defaultdict(set)

    for nearpeak in nearbyfn:
        # Chr,peakstart, peakend, genechr,genestart,geneend, genestrand
        # print (nearpeak[0],nearpeak[1], nearpeak[2],nearpeak[bedfields], nearpeak[bedfields+3],nearpeak[bedfields+4],nearpeak[bedfields+6])

        peakkey = '\t'.join(nearpeak[:bedfields])

        if peakkey in d:
            continue

        genestrand = nearpeak[bedfields + 6]

        distance = int(nearpeak[-1])

        typenow = 'error'

        if distance == 0:
            continue

        if int(nearpeak[bedfields + 3]) <= int(nearpeak[1]) <= int(nearpeak[2]) <= int(nearpeak[bedfields + 4]):
            print("error")

            print(nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3],
                  nearpeak[bedfields + 4], nearpeak[bedfields + 6])

        if genestrand == '+':

            if int(nearpeak[1]) >= int(nearpeak[bedfields + 4]):

                # typenow = 'downstrand'
                if distance <= 1000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TTS_1000'

                elif distance <= 3000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TTS_3000'

                else:

                    typenow = 'intergenic'

            elif int(nearpeak[2]) <= int(nearpeak[bedfields + 3]):

                if distance <= 1000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TSS_1000'

                elif distance <= 3000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TSS_3000'

                else:

                    typenow = 'intergenic'

            else:

                print("error", nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3],
                      nearpeak[bedfields + 4], nearpeak[bedfields + 6])

        elif genestrand == '-':

            if int(nearpeak[1]) >= int(nearpeak[bedfields + 4]):

                # typenow = 'downstrand'
                if distance <= 1000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TSS_1000'

                elif distance <= 3000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TSS_3000'

                else:

                    typenow = 'intergenic'

            elif int(nearpeak[2]) <= int(nearpeak[bedfields + 3]):

                if distance <= 1000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TTS_1000'

                elif distance <= 3000:

                    typenow = nearpeak[bedfields + 2] + "_" + 'TTS_3000'

                else:

                    typenow = 'intergenic'
            else:

                print("error", nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3],
                      nearpeak[bedfields + 4], nearpeak[bedfields + 6])
        else:

            print("error", nearpeak[0], nearpeak[1], nearpeak[2], nearpeak[bedfields], nearpeak[bedfields + 3],
                  nearpeak[bedfields + 4], nearpeak[bedfields + 6])

        nearpeakd[peakkey].update([typenow])

    for peakid in nearpeakd:

        if peakid in d:
            print("error peakid in nearpeakd", peakid, nearpeakd[peakid], d[peakid])

    for peakid in d:

        if peakid in nearpeakd:
            print("error peakid in d", peakid, nearpeakd[peakid], d[peakid])

    discount = defaultdict(int)

    for peak, distypes in list(nearpeakd.items()):
        distype = labelfilter(distypes)

        discount[distype] += 1

    disres = list(discount.items())

    for label, count in results:
        print(label, count, file=outio)

    for label, count in disres:
        print(label, count, file=outio)

    outio.close()