Example #1
0
    def ccds_parser(self, ccds_file_handle, splice_padding):
        """Parse a tab separated file in the ccds format.

        Every data line with integer coordinates registers a gene through
        ``self.add_gene`` and each interval of its location column through
        ``self.add_exon``.

        Arguments:
            ccds_file_handle (iterable): Yields the lines of a ccds file
            splice_padding (int): Forwarded to ``self.add_exon`` for every
                                  exon interval

        Returns:
            genes (dict), exons (dict): The dictionaries that were handed to
            ``self.add_gene`` and ``self.add_exon`` respectively.
        """
        genes = {}
        exons = {}

        for line in ccds_file_handle:
            # Skip header/comment lines and (near) empty lines
            if line.startswith('#') or len(line.rstrip()) <= 1:
                continue

            columns = line.split('\t')

            # Remove only a literal 'chr' prefix. str.lstrip('chr') strips
            # every leading 'c', 'h' and 'r' character and would mangle
            # names such as 'hs37d5'.
            chrom = columns[0]
            if chrom.startswith('chr'):
                chrom = chrom[3:]
            transcript_id = columns[1]
            gene_id = columns[2]

            # Convert directly instead of guarding with is_number(): that
            # helper also accepts float strings such as '1.3', which int()
            # rejects with a ValueError.
            try:
                feature_start = int(columns[7])
                feature_stop = int(columns[8])
            except ValueError:
                #TODO raise exception?
                # No usable coordinates on this line, nothing to register
                continue

            self.add_gene(genes, chrom, feature_start, feature_stop, gene_id)

            # The location column is expected to look like
            # '[start-stop, start-stop]'
            for interval in columns[9].strip()[1:-1].split(','):
                boundaries = interval.split('-')
                exon_start = int(boundaries[0])
                exon_stop = int(boundaries[1])
                # The boundaries used to be parsed and then dropped, which
                # left the returned exons empty.
                self.add_exon(exons, chrom, exon_start, exon_stop,
                              transcript_id, splice_padding)

        return genes, exons
Example #2
0
 def ccds_parser(self, ccds_file_handle, splice_padding):
     """Parse a tab separated file in the ccds format.

     Each data line with integer coordinates registers one gene via
     ``self.add_gene`` and every interval of its location column via
     ``self.add_exon``.

     Arguments:
         ccds_file_handle (iterable): Yields the lines of a ccds file
         splice_padding (int): Forwarded to ``self.add_exon`` for every
                               exon interval

     Returns:
         genes (dict), exons (dict): The dictionaries that were handed to
         ``self.add_gene`` and ``self.add_exon`` respectively.
     """
     genes = {}
     exons = {}

     for line in ccds_file_handle:
         # Skip header/comment lines and (near) empty lines
         if line.startswith('#') or len(line.rstrip()) <= 1:
             continue

         columns = line.split('\t')

         # Only a literal 'chr' prefix is removed; str.lstrip('chr') would
         # strip any leading 'c', 'h' or 'r' and mangle e.g. 'hs37d5'.
         chrom = columns[0]
         if chrom.startswith('chr'):
             chrom = chrom[3:]
         transcript_id = columns[1]
         gene_id = columns[2]

         # int() is tried directly: is_number() also accepts float strings
         # like '1.3', for which int() raises ValueError.
         try:
             feature_start = int(columns[7])
             feature_stop = int(columns[8])
         except ValueError:
             #TODO raise exception?
             # Without integer coordinates there is nothing to register
             continue

         self.add_gene(
                     genes,
                     chrom,
                     feature_start,
                     feature_stop,
                     gene_id
                     )

         # The location column is expected to look like
         # '[start-stop, start-stop]'
         for interval in columns[9].strip()[1:-1].split(','):
             boundaries = interval.split('-')
             exon_start = int(boundaries[0])
             exon_stop = int(boundaries[1])
             # These boundaries used to be parsed and then discarded, so
             # the returned exons were always empty.
             self.add_exon(
                         exons,
                         chrom,
                         exon_start,
                         exon_stop,
                         transcript_id,
                         splice_padding
                         )

     return genes, exons
Example #3
0
    def gff_parser(self, gff_file_handle, splice_padding, id_key='ID'):
        """Parse a file in the gff format.

        'gene' and 'CDS' features are registered through ``self.add_gene``
        and 'exon' features through ``self.add_exon``; other feature types
        are ignored. Parsing stops at the first line that starts with '>'.

        Arguments:
            gff_file_handle (iterable): Yields the lines of a gff file
            splice_padding (int): Forwarded to ``self.add_exon``
            id_key (str): Attribute key whose value is used as gene id

        Returns:
            genes (dict), exons (dict): The dictionaries that were handed to
            ``self.add_gene`` and ``self.add_exon`` respectively.
        """
        genes = {}
        exons = {}

        for line in gff_file_handle:
            line = line.rstrip()
            if line.startswith('#') or len(line) <= 1:
                continue
            if line.startswith('>'):
                break

            columns = line.split('\t')
            if len(columns) < 5:
                # Not tab separated: split the line (not the list, which has
                # no split()) on any whitespace, at most 8 times so that the
                # attribute column stays in one piece.
                columns = line.split(None, 8)

            # Remove only a literal 'chr' prefix; str.lstrip('chr') strips
            # any leading 'c', 'h' or 'r' and would mangle e.g. 'hs37d5'.
            chrom = columns[0]
            if chrom.startswith('chr'):
                chrom = chrom[3:]

            feature_start = 0
            feature_stop = 0
            # int() is tried directly: is_number() also accepts float
            # strings like '1.3', for which int() raises ValueError.
            try:
                feature_start, feature_stop = int(columns[3]), int(columns[4])
            except ValueError:
                #TODO Raise exception?
                # As before, fall back to the 0/0 placeholders.
                pass

            transcript_id = ''
            gene_id = ''
            for information in columns[8].split(';'):
                # partition() never fails on an attribute without '='
                key, _, value = information.strip().partition('=')
                # Strip optional quotes only; slicing with [1:-1] cut real
                # characters off unquoted values.
                if key == 'transcript_id':
                    transcript_id = value.strip('"')
                if key == 'gene_id':
                    gene_id = value.strip('"')
                if key == id_key:
                    gene_id = value

            feature_type = columns[2]
            if feature_type in ('gene', 'CDS'):
                self.add_gene(
                            genes,
                            chrom,
                            feature_start,
                            feature_stop,
                            gene_id
                            )

            elif feature_type == 'exon':
                self.add_exon(
                            exons,
                            chrom,
                            feature_start,
                            feature_stop,
                            transcript_id,
                            splice_padding
                            )

        return genes, exons
Example #4
0
def ccds_parser(ccds_file_handle, splice_padding=2):
    """
    Parse a file in the ccds format.

    Every chromosome seen on a data line gets an entry in both returned
    dictionaries. Lines whose coordinate columns are not integers add nothing
    beyond that.

    Arguments:
        ccds_file_handle (file_handle): An opened file in ccds format
        splice_padding (int): An integer that describes how we should expand
                              the exons
    Returns:
        genes (dict): A dictionary with chromosome ids as keys and gene
                      dictionaries as values.
        exons (dict): A dictionary with chromosome ids as keys and exon
                      dictionaries as values. Each exon is stored as
                      [start - splice_padding, stop + splice_padding, exon_id].
    """
    genes = {}
    exons = {}

    for line in ccds_file_handle:
        # Skip header/comment lines and (near) empty lines
        if line.startswith('#') or len(line.rstrip()) <= 1:
            continue

        columns = line.split('\t')

        # Remove only a literal 'chr' prefix. str.lstrip('chr') strips every
        # leading 'c', 'h' and 'r' character and would mangle names such as
        # 'hs37d5'.
        chrom = columns[0]
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        gene_id = columns[2]

        chrom_genes = genes.setdefault(chrom, {})
        chrom_exons = exons.setdefault(chrom, {})

        # Convert directly instead of guarding with is_number(): that helper
        # also accepts float strings such as '1.3', which int() rejects.
        try:
            feature_start = int(columns[7])
            feature_stop = int(columns[8])
        except ValueError:
            #TODO raise exception?
            continue

        chrom_genes[gene_id] = get_coordinates(
            chrom_genes,
            feature_start,
            feature_stop,
            gene_id
        )

        # The location column is expected to look like
        # '[start-stop, start-stop]'
        for interval in columns[9].strip()[1:-1].split(','):
            boundaries = interval.split('-')
            exon_start = int(boundaries[0])
            exon_stop = int(boundaries[1])
            # NOTE(review): plain concatenation is not unique in general
            # (9/1234 and 91/234 give the same id); kept because callers may
            # rely on this id format.
            exon_id = str(exon_start) + str(exon_stop)

            if exon_id not in chrom_exons:
                chrom_exons[exon_id] = [
                    exon_start - splice_padding,
                    exon_stop + splice_padding,
                    exon_id
                ]
    return genes, exons
Example #5
0
 def gtf_parser(self, gtf_file_handle, splice_padding):
     """Parse a file in the gtf format.

     'gene' features are registered through ``self.add_gene`` and 'exon'
     features through ``self.add_exon``; other feature types are ignored.
     Lines whose start/stop columns are not integers are skipped.

     Arguments:
         gtf_file_handle (iterable): Yields the lines of a gtf file
         splice_padding (int): Forwarded to ``self.add_exon``

     Returns:
         genes (dict), exons (dict): The dictionaries that were handed to
         ``self.add_gene`` and ``self.add_exon`` respectively.
     """
     genes = {}
     exons = {}

     for line in gtf_file_handle:
         # Skip comments and (near) empty lines
         if line.startswith('#') or len(line.rstrip()) <= 1:
             continue

         columns = line.split('\t')
         if len(columns) < 5:
             # Not tab separated: split the line (not the list, which has
             # no split()) on any whitespace, at most 8 times so that the
             # attribute column, which itself holds spaces, stays whole.
             columns = line.split(None, 8)

         # Only a literal 'chr' prefix is removed; str.lstrip('chr') would
         # strip any leading 'c', 'h' or 'r' and mangle e.g. 'hs37d5'.
         chrom = columns[0]
         if chrom.startswith('chr'):
             chrom = chrom[3:]

         # Without this guard a bad line either raised NameError or reused
         # the coordinates of the previous line.
         try:
             feature_start = int(columns[3])
             feature_stop = int(columns[4])
         except ValueError:
             #TODO Raise exception?
             continue

         transcript_id = ''
         gene_id = ''
         # Walk every attribute; dropping the last split element lost the
         # final attribute whenever the column did not end with ';'.
         for information in columns[8].split(';'):
             entry = information.split(None, 1)
             if len(entry) < 2:
                 continue
             value = entry[1].strip().strip('"')
             if entry[0] == 'transcript_id':
                 transcript_id = value
             elif entry[0] == 'gene_id':
                 gene_id = value

         feature_type = columns[2]
         if feature_type == 'gene':
             self.add_gene(
                         genes,
                         chrom,
                         feature_start,
                         feature_stop,
                         gene_id
                         )

         elif feature_type == 'exon':
             self.add_exon(
                         exons,
                         chrom,
                         feature_start,
                         feature_stop,
                         transcript_id,
                         splice_padding
                         )

     return genes, exons
Example #6
0
    def gff_parser(self, gff_file_handle, splice_padding, id_key='ID'):
        """Parse a file in the gff format.

        'gene' and 'CDS' features are registered through ``self.add_gene``
        and 'exon' features through ``self.add_exon``; other feature types
        are ignored. Parsing stops at the first line that starts with '>'.

        Arguments:
            gff_file_handle (iterable): Yields the lines of a gff file
            splice_padding (int): Forwarded to ``self.add_exon``
            id_key (str): Attribute key whose value is used as gene id

        Returns:
            genes (dict), exons (dict): The dictionaries that were handed to
            ``self.add_gene`` and ``self.add_exon`` respectively.
        """
        genes = {}
        exons = {}

        for line in gff_file_handle:
            line = line.rstrip()
            if line.startswith('#') or len(line) <= 1:
                continue
            if line.startswith('>'):
                break

            columns = line.split('\t')
            if len(columns) < 5:
                # Not tab separated: split the line itself on any whitespace
                # (a list has no split()), at most 8 times so the attribute
                # column stays in one piece.
                columns = line.split(None, 8)

            # Remove only a literal 'chr' prefix; str.lstrip('chr') strips
            # any leading 'c', 'h' or 'r' and would mangle e.g. 'hs37d5'.
            chrom = columns[0]
            if chrom.startswith('chr'):
                chrom = chrom[3:]

            feature_start = 0
            feature_stop = 0
            # int() is tried directly: is_number() also accepts float
            # strings like '1.3', for which int() raises ValueError.
            try:
                feature_start, feature_stop = int(columns[3]), int(columns[4])
            except ValueError:
                #TODO Raise exception?
                # As before, fall back to the 0/0 placeholders.
                pass

            transcript_id = ''
            gene_id = ''
            for information in columns[8].split(';'):
                # partition() never fails on an attribute without '='
                key, _, value = information.strip().partition('=')
                # Strip optional quotes only; slicing with [1:-1] cut real
                # characters off unquoted values.
                if key == 'transcript_id':
                    transcript_id = value.strip('"')
                if key == 'gene_id':
                    gene_id = value.strip('"')
                if key == id_key:
                    gene_id = value

            feature_type = columns[2]
            if feature_type in ('gene', 'CDS'):
                self.add_gene(genes, chrom, feature_start, feature_stop,
                              gene_id)

            elif feature_type == 'exon':
                self.add_exon(exons, chrom, feature_start, feature_stop,
                              transcript_id, splice_padding)

        return genes, exons
Example #7
0
    def gtf_parser(self, gtf_file_handle, splice_padding):
        """Parse a file in the gtf format.

        'gene' features are registered through ``self.add_gene`` and 'exon'
        features through ``self.add_exon``; other feature types are ignored.
        Lines whose start/stop columns are not integers are skipped.

        Arguments:
            gtf_file_handle (iterable): Yields the lines of a gtf file
            splice_padding (int): Forwarded to ``self.add_exon``

        Returns:
            genes (dict), exons (dict): The dictionaries that were handed to
            ``self.add_gene`` and ``self.add_exon`` respectively.
        """
        genes = {}
        exons = {}

        for line in gtf_file_handle:
            # Skip comments and (near) empty lines
            if line.startswith('#') or len(line.rstrip()) <= 1:
                continue

            columns = line.split('\t')
            if len(columns) < 5:
                # Not tab separated: split the line itself on any whitespace
                # (a list has no split()), at most 8 times so the attribute
                # column, which itself holds spaces, stays whole.
                columns = line.split(None, 8)

            # Remove only a literal 'chr' prefix; str.lstrip('chr') strips
            # any leading 'c', 'h' or 'r' and would mangle e.g. 'hs37d5'.
            chrom = columns[0]
            if chrom.startswith('chr'):
                chrom = chrom[3:]

            # Without this guard a bad line either raised NameError or
            # reused the coordinates of the previous line.
            try:
                feature_start = int(columns[3])
                feature_stop = int(columns[4])
            except ValueError:
                #TODO Raise exception?
                continue

            transcript_id = ''
            gene_id = ''
            # Walk every attribute; dropping the last split element lost the
            # final attribute whenever the column did not end with ';'.
            for information in columns[8].split(';'):
                entry = information.split(None, 1)
                if len(entry) < 2:
                    continue
                value = entry[1].strip().strip('"')
                if entry[0] == 'transcript_id':
                    transcript_id = value
                elif entry[0] == 'gene_id':
                    gene_id = value

            feature_type = columns[2]
            if feature_type == 'gene':
                self.add_gene(genes, chrom, feature_start, feature_stop,
                              gene_id)

            elif feature_type == 'exon':
                self.add_exon(exons, chrom, feature_start, feature_stop,
                              transcript_id, splice_padding)

        return genes, exons
Example #8
0
def test_int():
    """An integer is reported as a number by is_number."""
    value = 2
    assert is_number(value) == True
Example #9
0
def test_str_float():
    """A string holding a float is reported as a number by is_number."""
    text = '1.3'
    assert is_number(text) == True
Example #10
0
def test_non_number():
    """A non numeric string is not reported as a number by is_number."""
    text = 'a'
    assert is_number(text) == False
Example #11
0
def test_float():
    """A float is reported as a number by is_number."""
    value = 2.5
    assert is_number(value) == True
Example #12
0
def test_str_float():
    """is_number accepts a float written as a string."""
    candidate = '1.3'
    assert is_number(candidate) == True
Example #13
0
def test_non_number():
    """is_number rejects a string that holds no number."""
    candidate = 'a'
    assert is_number(candidate) == False
Example #14
0
def gtf_parser(gtf_file_handle, splice_padding=2):
    """
    Parse a file in the gtf format.

    Only 'gene' and 'exon' features are collected. Lines whose start/stop
    columns are not integers are skipped.

    Arguments:
        gtf_file_handle (file_handle): An opened file in gtf file format
        splice_padding (int): An integer that describes how we should expand
                              the exons

    Returns:
        genes (dict): A dictionary with chromosome ids as keys and gene
                      dictionaries as values.
        exons (dict): A dictionary with chromosome ids as keys and exon
                      dictionaries as values. Each exon is stored as
                      [start - splice_padding, stop + splice_padding, exon_id].

    """
    genes = {}
    exons = {}

    for line in gtf_file_handle:
        # Skip comments and (near) empty lines
        if line.startswith('#') or len(line.rstrip()) <= 1:
            continue

        columns = line.split('\t')
        if len(columns) < 5:
            # Not tab separated: split the line itself on any whitespace (a
            # list has no split()), at most 8 times so the attribute column,
            # which itself holds spaces, stays whole.
            columns = line.split(None, 8)

        # Remove only a literal 'chr' prefix; str.lstrip('chr') strips any
        # leading 'c', 'h' or 'r' and would mangle e.g. 'hs37d5'.
        chrom = columns[0]
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        feature_type = columns[2]

        # Without this guard a bad line either raised NameError or reused
        # the coordinates of the previous line.
        try:
            feature_start = int(columns[3])
            feature_stop = int(columns[4])
        except ValueError:
            continue

        gene_id = ''
        # Walk every attribute; dropping the last split element lost the
        # final attribute whenever the column did not end with ';'.
        for information in columns[8].split(';'):
            entry = information.split(None, 1)
            if len(entry) == 2 and entry[0] == 'gene_id':
                gene_id = entry[1].strip().strip('"')

        if feature_type == 'gene':
            chrom_genes = genes.setdefault(chrom, {})
            chrom_genes[gene_id] = get_coordinates(
                chrom_genes,
                feature_start,
                feature_stop,
                gene_id
            )

        elif feature_type == 'exon':
            exon_id = str(feature_start) + str(feature_stop)
            exons.setdefault(chrom, {})[exon_id] = [
                feature_start - splice_padding,
                feature_stop + splice_padding,
                exon_id
            ]

    return genes, exons
Example #15
0
def gff_parser(gff_file_handle, splice_padding=2, id_key='ID'):
    """
    Parse a file in the gff format.

    'gene' and 'CDS' features end up in genes, 'exon' features in exons;
    other feature types are ignored. Parsing stops at the first line that
    starts with '>'.

    Arguments:
        gff_file_handle (file_handle): An opened file in gff file format
        splice_padding (int): An integer that describes how we should expand
                              the exons
        id_key (str): The key that defines the gene id in the info string

    Returns:
        genes (dict): A dictionary with chromosome ids as keys and gene
                      dictionaries as values.
        exons (dict): A dictionary with chromosome ids as keys and exon
                      dictionaries as values. Each exon is stored as
                      [start - splice_padding, stop + splice_padding, exon_id].
    """
    genes = {}
    exons = {}

    for line in gff_file_handle:
        line = line.rstrip()
        if line.startswith('#') or len(line) <= 1:
            continue
        if line.startswith('>'):
            break

        columns = line.split('\t')
        if len(columns) < 5:
            # Not tab separated: split the line itself on any whitespace (a
            # list has no split()), at most 8 times so the attribute column
            # stays in one piece.
            columns = line.split(None, 8)

        # Remove only a literal 'chr' prefix; str.lstrip('chr') strips any
        # leading 'c', 'h' or 'r' and would mangle e.g. 'hs37d5'.
        chrom = columns[0]
        if chrom.startswith('chr'):
            chrom = chrom[3:]

        feature_type = columns[2]

        feature_start = 0
        feature_stop = 0
        # int() is tried directly: is_number() also accepts float strings
        # like '1.3', for which int() raises ValueError.
        try:
            feature_start, feature_stop = int(columns[3]), int(columns[4])
        except ValueError:
            #TODO Raise exception?
            # As before, fall back to the 0/0 placeholders.
            pass

        gene_id = ''
        for information in columns[8].split(';'):
            # partition() never fails on an attribute without '='
            key, _, value = information.strip().partition('=')
            # Strip optional quotes only; slicing with [1:-1] cut real
            # characters off unquoted values.
            if key == 'gene_id':
                gene_id = value.strip('"')
            if key == id_key:
                gene_id = value

        if feature_type not in ('gene', 'CDS', 'exon'):
            continue

        # Both dictionaries get an entry for the chromosome, whichever of
        # the three feature types was seen.
        chrom_genes = genes.setdefault(chrom, {})
        chrom_exons = exons.setdefault(chrom, {})

        if feature_type == 'exon':
            exon_id = str(feature_start) + str(feature_stop)
            # The leftover debug print of every exon was removed here.
            chrom_exons[exon_id] = [
                feature_start - splice_padding,
                feature_stop + splice_padding,
                exon_id
            ]
        else:
            chrom_genes[gene_id] = get_coordinates(
                chrom_genes,
                feature_start,
                feature_stop,
                gene_id
            )

    return genes, exons