def ccds_parser(self, ccds_file_handle, splice_padding):
    """
    Parse a file in the ccds format.

    Lines starting with '#' and empty lines are skipped, and so are
    records whose start/stop columns (8 and 9) are not numeric.

    Arguments:
        ccds_file_handle (iterable): Lines of a tab separated ccds file
        splice_padding (int): Forwarded unchanged to ``self.add_exon``

    Returns:
        genes (dict): The dictionary that was handed to ``self.add_gene``
        exons (dict): The dictionary that was handed to ``self.add_exon``
    """
    genes = {}
    exons = {}

    for line in ccds_file_handle:
        if line.startswith('#') or len(line) <= 1:
            continue

        fields = line.split('\t')

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        transcript_id = fields[1]
        gene_id = fields[2]

        # TODO raise exception?
        if not (is_number(fields[7]) and is_number(fields[8])):
            continue

        feature_start = int(fields[7])
        feature_stop = int(fields[8])
        self.add_gene(genes, chrom, feature_start, feature_stop, gene_id)

        # Column 10 is sliced and split as '[start-stop, start-stop, ...]'
        for interval in fields[9][1:-1].split(','):
            boundaries = interval.split('-')
            exon_start = int(boundaries[0].lstrip())
            exon_stop = int(boundaries[1].lstrip())
            # The intervals used to be parsed and thrown away, leaving
            # ``exons`` empty. Register them with the same argument order
            # the gff/gtf parser methods use for ``add_exon``.
            self.add_exon(
                exons, chrom, exon_start, exon_stop,
                transcript_id, splice_padding
            )

    return genes, exons
def ccds_parser(self, ccds_file_handle, splice_padding):
    """
    Parse a file in the ccds format.

    Lines starting with '#' and empty lines are skipped, and so are
    records whose start/stop columns (8 and 9) are not numeric.

    Arguments:
        ccds_file_handle (iterable): Lines of a tab separated ccds file
        splice_padding (int): Forwarded unchanged to ``self.add_exon``

    Returns:
        genes (dict): The dictionary that was handed to ``self.add_gene``
        exons (dict): The dictionary that was handed to ``self.add_exon``
    """
    genes = {}
    exons = {}

    for line in ccds_file_handle:
        if line.startswith('#') or len(line) <= 1:
            continue

        fields = line.split('\t')

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        transcript_id = fields[1]
        gene_id = fields[2]

        # TODO raise exception?
        if not (is_number(fields[7]) and is_number(fields[8])):
            continue

        feature_start = int(fields[7])
        feature_stop = int(fields[8])
        self.add_gene(genes, chrom, feature_start, feature_stop, gene_id)

        # Column 10 is sliced and split as '[start-stop, start-stop, ...]'
        for interval in fields[9][1:-1].split(','):
            boundaries = interval.split('-')
            exon_start = int(boundaries[0].lstrip())
            exon_stop = int(boundaries[1].lstrip())
            # The intervals used to be parsed and thrown away, leaving
            # ``exons`` empty. Register them with the same argument order
            # the gff/gtf parser methods use for ``add_exon``.
            self.add_exon(
                exons, chrom, exon_start, exon_stop,
                transcript_id, splice_padding
            )

    return genes, exons
def gff_parser(self, gff_file_handle, splice_padding, id_key='ID'):
    """
    Parse a file in the gff format.

    Lines starting with '#' and empty lines are skipped; parsing stops at
    the first line starting with '>'. 'gene' and 'CDS' records are handed
    to ``self.add_gene``, 'exon' records to ``self.add_exon``; every other
    feature type is ignored.

    Arguments:
        gff_file_handle (iterable): Lines of a gff file
        splice_padding (int): Forwarded unchanged to ``self.add_exon``
        id_key (str): The attribute key that holds the gene id

    Returns:
        genes (dict): The dictionary that was handed to ``self.add_gene``
        exons (dict): The dictionary that was handed to ``self.add_exon``
    """
    genes = {}
    exons = {}

    for line in gff_file_handle:
        line = line.rstrip()
        if line.startswith('#') or len(line) <= 1:
            continue
        # Presumably the start of an embedded FASTA section: nothing after
        # this point is treated as a feature record.
        if line.startswith('>'):
            break

        transcript_id = ''
        gene_id = ''
        # Coordinates stay at 0 when columns 4/5 are not numeric.
        feature_start = 0
        feature_stop = 0

        fields = line.split('\t')
        if len(fields) < 5:
            # Not tab separated: split the raw line on any whitespace. The
            # previous code called .split() on the already split list,
            # which raises AttributeError. At most 8 splits, so that the
            # attribute column stays in one piece.
            fields = line.split(None, 8)

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]

        if is_number(fields[3]) and is_number(fields[4]):
            feature_start = int(fields[3])
            feature_stop = int(fields[4])
        # TODO Raise exception?

        for information in fields[8].split(';'):
            entry = information.split('=')
            if len(entry) < 2:
                # A key without '=value' cannot carry an identifier
                continue
            key = entry[0]
            value = entry[1]
            # strip('"') removes surrounding quotes when present but, unlike
            # [1:-1], leaves unquoted values intact.
            if key == 'transcript_id':
                transcript_id = value.strip('"')
            if key == 'gene_id':
                gene_id = value.strip('"')
            # The value of id_key is stored verbatim
            if key == id_key:
                gene_id = value

        feature_type = fields[2]
        if feature_type in ('gene', 'CDS'):
            self.add_gene(
                genes, chrom, feature_start, feature_stop, gene_id
            )
        elif feature_type == 'exon':
            self.add_exon(
                exons, chrom, feature_start, feature_stop,
                transcript_id, splice_padding
            )

    return genes, exons
def ccds_parser(ccds_file_handle, splice_padding=2):
    """
    Parse a file in the ccds format.

    Lines starting with '#' and empty lines are skipped. A record whose
    start/stop columns (8 and 9) are not numeric only creates the
    (possibly empty) chromosome entries and contributes nothing else.

    Arguments:
        ccds_file_handle (file_handle): An opened file in ccds format
        splice_padding (int): The number of positions every exon is
                              extended with in both directions

    Returns:
        genes (dict): A dictionary with chromosome ids as keys and gene
                      dictionaries as values. A gene dictionary maps a
                      gene id to the value returned by get_coordinates.
        exons (dict): A dictionary with chromosome ids as keys and exon
                      dictionaries as values. An exon dictionary maps an
                      exon id to [padded start, padded stop, exon id].
    """
    genes = {}
    exons = {}

    for line in ccds_file_handle:
        if line.startswith('#') or len(line) <= 1:
            continue

        fields = line.split('\t')

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        gene_id = fields[2]

        # The chromosome entries are created even if the record turns out
        # to have unusable coordinates.
        chrom_genes = genes.setdefault(chrom, {})
        chrom_exons = exons.setdefault(chrom, {})

        # TODO raise exception?
        if not (is_number(fields[7]) and is_number(fields[8])):
            continue

        feature_start = int(fields[7])
        feature_stop = int(fields[8])
        chrom_genes[gene_id] = get_coordinates(
            chrom_genes, feature_start, feature_stop, gene_id
        )

        # Column 10 is sliced and split as '[start-stop, start-stop, ...]'
        for interval in fields[9][1:-1].split(','):
            boundaries = interval.split('-')
            exon_start = int(boundaries[0].lstrip())
            exon_stop = int(boundaries[1].lstrip())
            # NOTE(review): plain concatenation is ambiguous (1-234 and
            # 12-34 both give '1234'). Left unchanged because the id is
            # part of the returned data.
            exon_id = str(exon_start) + str(exon_stop)
            if exon_id not in chrom_exons:
                chrom_exons[exon_id] = [
                    exon_start - splice_padding,
                    exon_stop + splice_padding,
                    exon_id
                ]

    return genes, exons
def gtf_parser(self, gtf_file_handle, splice_padding):
    """
    Parse a file in the gtf format.

    Lines starting with '#' and empty lines are skipped. 'gene' records
    are handed to ``self.add_gene``, 'exon' records to ``self.add_exon``;
    every other feature type is ignored.

    Arguments:
        gtf_file_handle (iterable): Lines of a gtf file
        splice_padding (int): Forwarded unchanged to ``self.add_exon``

    Returns:
        genes (dict): The dictionary that was handed to ``self.add_gene``
        exons (dict): The dictionary that was handed to ``self.add_exon``
    """
    genes = {}
    exons = {}

    for line in gtf_file_handle:
        if line.startswith('#') or len(line) <= 1:
            continue

        transcript_id = ''
        gene_id = ''
        # Reset for every record. Without this a record with non numeric
        # coordinates either hit an unbound name or silently reused the
        # coordinates of the previous record.
        feature_start = 0
        feature_stop = 0

        fields = line.split('\t')
        if len(fields) < 5:
            # Not tab separated: split the raw line on any whitespace. The
            # previous code called .split() on the already split list,
            # which raises AttributeError. At most 8 splits, so that the
            # attribute column (which contains spaces) stays in one piece.
            fields = line.split(None, 8)

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]

        if is_number(fields[3]) and is_number(fields[4]):
            feature_start = int(fields[3])
            feature_stop = int(fields[4])
        # TODO Raise exception?

        for information in fields[8].split(';'):
            entry = information.split()
            if len(entry) < 2:
                # Skips the empty piece after the final ';'. Checking the
                # length instead of dropping the last piece keeps the last
                # attribute when the trailing ';' is missing.
                continue
            key = entry[0]
            # Values are written as "value"; remove the quotes
            value = entry[1].strip('"')
            if key == 'transcript_id':
                transcript_id = value
            elif key == 'gene_id':
                gene_id = value

        feature_type = fields[2]
        if feature_type == 'gene':
            self.add_gene(
                genes, chrom, feature_start, feature_stop, gene_id
            )
        elif feature_type == 'exon':
            self.add_exon(
                exons, chrom, feature_start, feature_stop,
                transcript_id, splice_padding
            )

    return genes, exons
def gff_parser(self, gff_file_handle, splice_padding, id_key='ID'):
    """
    Parse a file in the gff format.

    Lines starting with '#' and empty lines are skipped; parsing stops at
    the first line starting with '>'. 'gene' and 'CDS' records are handed
    to ``self.add_gene``, 'exon' records to ``self.add_exon``; every other
    feature type is ignored.

    Arguments:
        gff_file_handle (iterable): Lines of a gff file
        splice_padding (int): Forwarded unchanged to ``self.add_exon``
        id_key (str): The attribute key that holds the gene id

    Returns:
        genes (dict): The dictionary that was handed to ``self.add_gene``
        exons (dict): The dictionary that was handed to ``self.add_exon``
    """
    genes = {}
    exons = {}

    for line in gff_file_handle:
        line = line.rstrip()
        if line.startswith('#') or len(line) <= 1:
            continue
        # Presumably the start of an embedded FASTA section: nothing after
        # this point is treated as a feature record.
        if line.startswith('>'):
            break

        transcript_id = ''
        gene_id = ''
        # Coordinates stay at 0 when columns 4/5 are not numeric.
        feature_start = 0
        feature_stop = 0

        fields = line.split('\t')
        if len(fields) < 5:
            # Not tab separated: split the raw line on any whitespace. The
            # previous code called .split() on the already split list,
            # which raises AttributeError. At most 8 splits, so that the
            # attribute column stays in one piece.
            fields = line.split(None, 8)

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]

        if is_number(fields[3]) and is_number(fields[4]):
            feature_start = int(fields[3])
            feature_stop = int(fields[4])
        # TODO Raise exception?

        for information in fields[8].split(';'):
            entry = information.split('=')
            if len(entry) < 2:
                # A key without '=value' cannot carry an identifier
                continue
            key = entry[0]
            value = entry[1]
            # strip('"') removes surrounding quotes when present but, unlike
            # [1:-1], leaves unquoted values intact.
            if key == 'transcript_id':
                transcript_id = value.strip('"')
            if key == 'gene_id':
                gene_id = value.strip('"')
            # The value of id_key is stored verbatim
            if key == id_key:
                gene_id = value

        feature_type = fields[2]
        if feature_type in ('gene', 'CDS'):
            self.add_gene(
                genes, chrom, feature_start, feature_stop, gene_id
            )
        elif feature_type == 'exon':
            self.add_exon(
                exons, chrom, feature_start, feature_stop,
                transcript_id, splice_padding
            )

    return genes, exons
def gtf_parser(self, gtf_file_handle, splice_padding):
    """
    Parse a file in the gtf format.

    Lines starting with '#' and empty lines are skipped. 'gene' records
    are handed to ``self.add_gene``, 'exon' records to ``self.add_exon``;
    every other feature type is ignored.

    Arguments:
        gtf_file_handle (iterable): Lines of a gtf file
        splice_padding (int): Forwarded unchanged to ``self.add_exon``

    Returns:
        genes (dict): The dictionary that was handed to ``self.add_gene``
        exons (dict): The dictionary that was handed to ``self.add_exon``
    """
    genes = {}
    exons = {}

    for line in gtf_file_handle:
        if line.startswith('#') or len(line) <= 1:
            continue

        transcript_id = ''
        gene_id = ''
        # Reset for every record. Without this a record with non numeric
        # coordinates either hit an unbound name or silently reused the
        # coordinates of the previous record.
        feature_start = 0
        feature_stop = 0

        fields = line.split('\t')
        if len(fields) < 5:
            # Not tab separated: split the raw line on any whitespace. The
            # previous code called .split() on the already split list,
            # which raises AttributeError. At most 8 splits, so that the
            # attribute column (which contains spaces) stays in one piece.
            fields = line.split(None, 8)

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]

        if is_number(fields[3]) and is_number(fields[4]):
            feature_start = int(fields[3])
            feature_stop = int(fields[4])
        # TODO Raise exception?

        for information in fields[8].split(';'):
            entry = information.split()
            if len(entry) < 2:
                # Skips the empty piece after the final ';'. Checking the
                # length instead of dropping the last piece keeps the last
                # attribute when the trailing ';' is missing.
                continue
            key = entry[0]
            # Values are written as "value"; remove the quotes
            value = entry[1].strip('"')
            if key == 'transcript_id':
                transcript_id = value
            elif key == 'gene_id':
                gene_id = value

        feature_type = fields[2]
        if feature_type == 'gene':
            self.add_gene(
                genes, chrom, feature_start, feature_stop, gene_id
            )
        elif feature_type == 'exon':
            self.add_exon(
                exons, chrom, feature_start, feature_stop,
                transcript_id, splice_padding
            )

    return genes, exons
def test_int():
    """is_number should report a plain integer as a number"""
    result = is_number(2)
    assert result == True
def test_str_float():
    """is_number should report a string holding a float as a number"""
    result = is_number('1.3')
    assert result == True
def test_non_number():
    """is_number should reject a string that does not hold a number"""
    result = is_number('a')
    assert result == False
def test_float():
    """is_number should report a plain float as a number"""
    result = is_number(2.5)
    assert result == True
def gtf_parser(gtf_file_handle, splice_padding=2):
    """
    Parse a file in the gtf format.

    Lines starting with '#' and empty lines are skipped. Only 'gene' and
    'exon' records contribute to the result.

    Arguments:
        gtf_file_handle (file_handle): An opened file in gtf file format
        splice_padding (int): The number of positions every exon is
                              extended with in both directions

    Returns:
        genes (dict): A dictionary with chromosome ids as keys and gene
                      dictionaries as values. A gene dictionary maps a
                      gene id to the value returned by get_coordinates.
        exons (dict): A dictionary with chromosome ids as keys and exon
                      dictionaries as values. An exon dictionary maps an
                      exon id to [padded start, padded stop, exon id].
    """
    genes = {}
    exons = {}

    for line in gtf_file_handle:
        if line.startswith('#') or len(line) <= 1:
            continue

        transcript_id = ''
        gene_id = ''
        # Reset for every record. Without this a record with non numeric
        # coordinates either hit an unbound name or silently reused the
        # coordinates of the previous record.
        feature_start = 0
        feature_stop = 0

        fields = line.split('\t')
        if len(fields) < 5:
            # Not tab separated: split the raw line on any whitespace. The
            # previous code called .split() on the already split list,
            # which raises AttributeError. At most 8 splits, so that the
            # attribute column (which contains spaces) stays in one piece.
            fields = line.split(None, 8)

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        feature_type = fields[2]

        if is_number(fields[3]) and is_number(fields[4]):
            feature_start = int(fields[3])
            feature_stop = int(fields[4])

        for information in fields[8].split(';'):
            entry = information.split()
            if len(entry) < 2:
                # Skips the empty piece after the final ';'. Checking the
                # length instead of dropping the last piece keeps the last
                # attribute when the trailing ';' is missing.
                continue
            key = entry[0]
            value = entry[1].strip('"')
            # transcript_id is collected but not used by this function
            if key == 'transcript_id':
                transcript_id = value
            elif key == 'gene_id':
                gene_id = value

        if feature_type == 'gene':
            chrom_genes = genes.setdefault(chrom, {})
            chrom_genes[gene_id] = get_coordinates(
                chrom_genes, feature_start, feature_stop, gene_id
            )
        elif feature_type == 'exon':
            # NOTE(review): plain concatenation is ambiguous (1-234 and
            # 12-34 both give '1234'). Left unchanged because the id is
            # part of the returned data.
            exon_id = str(feature_start) + str(feature_stop)
            exons.setdefault(chrom, {})[exon_id] = [
                feature_start - splice_padding,
                feature_stop + splice_padding,
                exon_id
            ]

    return genes, exons
def gff_parser(gff_file_handle, splice_padding=2, id_key='ID'):
    """
    Parse a file in the gff format.

    Lines starting with '#' and empty lines are skipped; parsing stops at
    the first line starting with '>'. Only 'gene', 'CDS' and 'exon'
    records contribute to the result.

    Arguments:
        gff_file_handle (file_handle): An opened file in gff file format
        splice_padding (int): The number of positions every exon is
                              extended with in both directions
        id_key (str): The key that defines the gene id in the info string

    Returns:
        genes (dict): A dictionary with chromosome ids as keys and gene
                      dictionaries as values. A gene dictionary maps a
                      gene id to the value returned by get_coordinates.
        exons (dict): A dictionary with chromosome ids as keys and exon
                      dictionaries as values. An exon dictionary maps an
                      exon id to [padded start, padded stop, exon id].
    """
    genes = {}
    exons = {}

    for line in gff_file_handle:
        line = line.rstrip()
        if line.startswith('#') or len(line) <= 1:
            continue
        # Presumably the start of an embedded FASTA section: nothing after
        # this point is treated as a feature record.
        if line.startswith('>'):
            break

        transcript_id = ''
        gene_id = ''
        # Coordinates stay at 0 when columns 4/5 are not numeric.
        feature_start = 0
        feature_stop = 0

        fields = line.split('\t')
        if len(fields) < 5:
            # Not tab separated: split the raw line on any whitespace. The
            # previous code called .split() on the already split list,
            # which raises AttributeError. At most 8 splits, so that the
            # attribute column stays in one piece.
            fields = line.split(None, 8)

        chrom = fields[0]
        # Remove a literal 'chr' prefix only. str.lstrip('chr') strips any
        # run of leading 'c', 'h' and 'r' characters and therefore mangles
        # names such as 'hs37d5' or 'random'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        feature_type = fields[2]

        if is_number(fields[3]) and is_number(fields[4]):
            feature_start = int(fields[3])
            feature_stop = int(fields[4])
        # TODO Raise exception?

        for information in fields[8].split(';'):
            entry = information.split('=')
            if len(entry) < 2:
                # A key without '=value' cannot carry an identifier
                continue
            key = entry[0]
            value = entry[1]
            # strip('"') removes surrounding quotes when present but, unlike
            # [1:-1], leaves unquoted values intact.
            # transcript_id is collected but not used by this function
            if key == 'transcript_id':
                transcript_id = value.strip('"')
            if key == 'gene_id':
                gene_id = value.strip('"')
            # The value of id_key is stored verbatim
            if key == id_key:
                gene_id = value

        if feature_type not in ('gene', 'CDS', 'exon'):
            continue

        # Both chromosome entries are created for every relevant record
        chrom_genes = genes.setdefault(chrom, {})
        chrom_exons = exons.setdefault(chrom, {})

        if feature_type == 'exon':
            # NOTE(review): plain concatenation is ambiguous (1-234 and
            # 12-34 both give '1234'). Left unchanged because the id is
            # part of the returned data.
            exon_id = str(feature_start) + str(feature_stop)
            # The leftover debug print of every exon was removed here
            chrom_exons[exon_id] = [
                feature_start - splice_padding,
                feature_stop + splice_padding,
                exon_id
            ]
        else:
            chrom_genes[gene_id] = get_coordinates(
                chrom_genes, feature_start, feature_stop, gene_id
            )

    return genes, exons