def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
    """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    Every accepted line is appended to self.gene_list as a list laid out as
    [GenomicRegion, field 2, field 3, field 8, gene_id, transcript_id,
    gene_type, gene_status, gene_name, transcript_type, transcript_status,
    transcript_name, level, [], []]. "field N" is the N-th tab-separated
    column of the line (source, feature and frame in the usual GTF layout).
    Attributes that are absent from the line are stored as None.

    *Keyword arguments:*

        - file_name -- The gencode .gtf file name.
        - filter_havana -- If True, lines whose second column equals "HAVANA" are skipped.
        - protein_coding -- If True, lines are skipped unless gene_type is "protein_coding";
          lines that carry a transcript_type other than "protein_coding" are skipped as well.
        - known_only -- If True, lines are skipped unless gene_status is "KNOWN";
          lines that carry a transcript_status other than "KNOWN" are skipped as well.

    If the file cannot be opened, an error message is printed and the process
    exits with status 1. A record without a gene_id attribute raises KeyError;
    a non-comment line with fewer than nine columns raises IndexError.
    """
    # Opening GTF file
    try:
        gtf_file = open(file_name, "r")
    except (IOError, OSError):
        print("Error: Cannot find the annotation file: " + file_name)
        print("Please check the path in ~/rgtdata/data.config")
        sys.exit(1)

    # Attribute keys copied (in this order) into every gene_list entry
    attribute_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                      "transcript_type", "transcript_status", "transcript_name", "level")

    # Reading GTF file; the context manager closes it even when a line fails to parse
    with gtf_file:
        for line in gtf_file:

            # Processing line: blank lines and "#" comment lines carry no record
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options (the ";"-separated ninth column)
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                addt_element_list = [_f for _f in addt_element.split(" ") if _f]
                # Empty fragments (e.g. after the trailing ";") hold no key/value pair
                if len(addt_element_list) < 2:
                    continue
                # Removing " symbol from string options
                addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

            # filter non-protein-coding sequences, if required
            if protein_coding:
                if addt_dict.get("gene_type") != "protein_coding":
                    continue
                if addt_dict.get("transcript_type", "protein_coding") != "protein_coding":
                    continue

            # filter unknown sequences, if required
            if known_only:
                if addt_dict.get("gene_status") != "KNOWN":
                    continue
                if addt_dict.get("transcript_status", "KNOWN") != "KNOWN":
                    continue

            # Removing dot (version suffix) from IDs; transcript_id is optional
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments (None for missing keys)
            final_addt_list = [addt_dict.get(addt_key) for addt_key in attribute_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion; the start column is decremented by one
            # (presumably 1-based GTF start -> 0-based start; confirm against GenomicRegion)
            genomic_region = GenomicRegion(chrom=line_list[0],
                                           initial=int(line_list[3]) - 1,
                                           final=int(line_list[4]),
                                           orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)
def load_gene_list(self, file_name, filter_havana=True):
    """
    Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    Every accepted line is appended to self.gene_list as a list laid out as
    [GenomicRegion, field 2, field 3, field 8, gene_id, transcript_id,
    gene_type, gene_status, gene_name, transcript_type, transcript_status,
    transcript_name, level, [], []]. "field N" is the N-th tab-separated
    column of the line. Attributes that are absent from the line are stored
    as None.

    Keyword arguments:
    file_name -- The gencode .gtf file name.
    filter_havana -- If True, lines whose second column equals "HAVANA" are skipped.

    Return:
    void.

    Raises:
    IOError/OSError -- if the annotation file cannot be opened.
    KeyError -- if a record has no gene_id attribute.
    IndexError -- if a non-comment line has fewer than nine columns.
    """

    # Attribute keys copied (in this order) into every gene_list entry
    attribute_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                      "transcript_type", "transcript_status", "transcript_name", "level")

    # Opening GTF file; a failure to open propagates to the caller instead of
    # being swallowed, and the handle is closed even when a line fails to parse
    with open(file_name, "r") as gtf_file:

        # Reading GTF file
        for line in gtf_file:

            # Processing line: blank lines and "#" comment lines carry no record
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options (the ";"-separated ninth column).
            # List comprehensions are used instead of filter() so the tokens stay
            # indexable on Python 3 as well as Python 2.
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                addt_element_list = [token for token in addt_element.split(" ") if token]
                # Empty fragments (e.g. after the trailing ";") hold no key/value pair
                if len(addt_element_list) < 2:
                    continue
                # Removing " symbol from string options
                addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

            # Removing dot (version suffix) from IDs; transcript_id is optional
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments (None for missing keys)
            final_addt_list = [addt_dict.get(addt_key) for addt_key in attribute_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion; the start column is decremented by one
            # (presumably 1-based GTF start -> 0-based start; confirm against GenomicRegion)
            genomic_region = GenomicRegion(chrom=line_list[0],
                                           initial=int(line_list[3]) - 1,
                                           final=int(line_list[4]),
                                           orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)
def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
    """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    Every accepted line is appended to self.gene_list as a list laid out as
    [GenomicRegion, field 2, field 3, field 8, gene_id, transcript_id,
    gene_type, gene_status, gene_name, transcript_type, transcript_status,
    transcript_name, level, [], []]. "field N" is the N-th tab-separated
    column of the line. Attributes that are absent from the line are stored
    as None.

    *Keyword arguments:*

        - file_name -- The gencode .gtf file name.
        - filter_havana -- If True, lines whose second column equals "HAVANA" are skipped.
        - protein_coding -- If True, lines are skipped unless gene_type is "protein_coding";
          lines that carry a transcript_type other than "protein_coding" are skipped as well.
        - known_only -- If True, lines are skipped unless gene_status is "KNOWN";
          lines that carry a transcript_status other than "KNOWN" are skipped as well.

    If the file cannot be opened, an error message is printed and the process
    exits with status 1. A record without a gene_id attribute raises KeyError;
    a non-comment line with fewer than nine columns raises IndexError.
    """

    # Opening GTF file
    try:
        gtf_file = open(file_name, "r")
    except (IOError, OSError):
        print("Error: Cannot find the annotation file: "+file_name)
        print("Please check the path in ~/rgtdata/data.config")
        sys.exit(1)

    # Attribute keys copied (in this order) into every gene_list entry
    wanted_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                   "transcript_type", "transcript_status", "transcript_name", "level")

    # Reading GTF file; "with" guarantees the handle is released on any exit path
    with gtf_file:
        for line in gtf_file:

            # Processing line: skip blank lines and "#" comment lines
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options (the ";"-separated ninth column).
            # filter() returns a non-indexable iterator on Python 3, so the
            # non-empty tokens are collected with a list comprehension instead.
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                pieces = [piece for piece in addt_element.split(" ") if piece]
                # Empty fragments (e.g. after the trailing ";") hold no key/value pair
                if len(pieces) < 2:
                    continue
                # Removing " symbol from string options
                addt_dict[pieces[0]] = pieces[1].replace("\"", "")

            # filter non-protein-coding sequences, if required
            if protein_coding:
                if addt_dict.get("gene_type") != "protein_coding":
                    continue
                if addt_dict.get("transcript_type", "protein_coding") != "protein_coding":
                    continue

            # filter unknown sequences, if required
            if known_only:
                if addt_dict.get("gene_status") != "KNOWN":
                    continue
                if addt_dict.get("transcript_status", "KNOWN") != "KNOWN":
                    continue

            # Removing dot (version suffix) from IDs; transcript_id is optional
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments (None for missing keys)
            final_addt_list = [addt_dict.get(key) for key in wanted_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion; the start column is decremented by one
            # (presumably 1-based GTF start -> 0-based start; confirm against GenomicRegion)
            genomic_region = GenomicRegion(chrom=line_list[0],
                                           initial=int(line_list[3])-1,
                                           final=int(line_list[4]),
                                           orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)
def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
    """
    Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    Every accepted line is appended to self.gene_list as a list laid out as
    [GenomicRegion, field 2, field 3, field 8, gene_id, transcript_id,
    gene_type, gene_status, gene_name, transcript_type, transcript_status,
    transcript_name, level, [], []]. "field N" is the N-th tab-separated
    column of the line. Attributes that are absent from the line are stored
    as None.

    Keyword arguments:
    file_name -- The gencode .gtf file name.
    filter_havana -- If True, lines whose second column equals "HAVANA" are skipped.
    protein_coding -- If True, lines are skipped unless gene_type is "protein_coding";
                      lines that carry a transcript_type other than "protein_coding"
                      are skipped as well.
    known_only -- If True, lines are skipped unless gene_status is "KNOWN"; lines that
                  carry a transcript_status other than "KNOWN" are skipped as well.

    Return:
    void.

    Raises:
    IOError/OSError -- if the annotation file cannot be opened.
    KeyError -- if a record has no gene_id attribute.
    IndexError -- if a non-comment line has fewer than nine columns.
    """

    # Attribute keys copied (in this order) into every gene_list entry
    attribute_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                      "transcript_type", "transcript_status", "transcript_name", "level")

    # Opening GTF file; a failure to open propagates to the caller instead of
    # being swallowed, and the handle is closed even when a line fails to parse
    with open(file_name, "r") as gtf_file:

        # Reading GTF file
        for line in gtf_file:

            # Processing line: blank lines and "#" comment lines carry no record
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options (the ";"-separated ninth column).
            # List comprehensions are used instead of filter() so the tokens stay
            # indexable on Python 3 as well as Python 2.
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                addt_element_list = [token for token in addt_element.split(" ") if token]
                # Empty fragments (e.g. after the trailing ";") hold no key/value pair
                if len(addt_element_list) < 2:
                    continue
                # Removing " symbol from string options
                addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

            # Filtering is done by attribute key rather than by attribute position,
            # so it does not depend on the order or number of attributes in the line.
            if protein_coding:
                if addt_dict.get("gene_type") != "protein_coding":
                    continue
                if addt_dict.get("transcript_type", "protein_coding") != "protein_coding":
                    continue
            if known_only:
                if addt_dict.get("gene_status") != "KNOWN":
                    continue
                if addt_dict.get("transcript_status", "KNOWN") != "KNOWN":
                    continue

            # Removing dot (version suffix) from IDs; transcript_id is optional
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments (None for missing keys)
            final_addt_list = [addt_dict.get(addt_key) for addt_key in attribute_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion; the start column is decremented by one
            # (presumably 1-based GTF start -> 0-based start; confirm against GenomicRegion)
            genomic_region = GenomicRegion(chrom=line_list[0],
                                           initial=int(line_list[3])-1,
                                           final=int(line_list[4]),
                                           orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)