Esempio n. 1
0
    def get_annotated_cds(cls, protein_sequence_handler, seq_region,
                          protein_id, cds_list):
        """Build the translation record described by a transcript's CDS entries.

        Args:
            protein_sequence_handler: object whose ``get_fasta_seq_by_id``
                is called with the versioned protein id to obtain the
                translation sequence.
            seq_region: value stored unchanged as ``loc_region``.
            protein_id: not used as passed; it is overwritten with the
                ``protein_id`` of the first entry of ``cds_list``.
            cds_list: sequence of CDS dicts. The first entry supplies
                ``cds_strand`` and ``protein_id``; the whole list is handed
                to ``cls.get_translation_loc`` for the start/end location.

        Returns:
            dict holding assembly, stable id, location, sequence and
            checksum fields of the translation.

        Raises:
            IndexError: if ``cds_list`` is empty.
            ValueError: if the protein id does not split into exactly two
                parts on ".".
        """
        first_cds = cds_list[0]
        strand = first_cds['cds_strand']
        # The caller-supplied protein_id is replaced by the first CDS entry's.
        protein_id = first_cds['protein_id']
        stable_id, stable_id_version = protein_id.split(".")
        loc_start, loc_end = cls.get_translation_loc(cds_list)

        translation = {
            'assembly_id': cls.ASSEMBLY_ID,
            'assembly_name': cls.ASSEMBLY_NAME,
            'stable_id': stable_id,
            'stable_id_version': stable_id_version,
            'loc_start': loc_start,
            'loc_end': loc_end,
            'loc_strand': strand,
            'loc_region': seq_region,
            'translation_seq':
                protein_sequence_handler.get_fasta_seq_by_id(protein_id),
        }

        # Each checksum helper receives the record as populated up to that
        # point, so the order of the assignments below is kept deliberately.
        translation['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            translation, 'translation_seq')
        translation['session_id'] = None
        translation['loc_checksum'] = ChecksumHandler.get_location_checksum(
            translation)
        translation['translation_checksum'] = (
            ChecksumHandler.get_translation_checksum(translation))
        return translation
Esempio n. 2
0
 def get_annotated_transcript(cls, sequence_handler, chrom, mRNA_feature):
     """Build the transcript record for one mRNA feature.

     Args:
         sequence_handler: object whose ``get_sequence_by_id`` is called
             with the versioned transcript id.
         chrom: value stored (as a string) in ``loc_region``.
         mRNA_feature: feature exposing ``location`` (start/end/strand)
             and ``qualifiers['transcript_id']``.

     Returns:
         dict with assembly, location, stable id, biotype, sequence and
         checksum fields. ``transcript_checksum`` and
         ``exon_set_checksum`` are left as ``None`` here.

     Raises:
         ValueError: if the transcript id does not split into exactly two
             parts on ".".
     """
     transcript = {
         'assembly_id': cls.ASSEMBLY_ID,
         'assembly_name': cls.ASSEMBLY_NAME,
         # The start is shifted by one base; the end is stored as-is.
         'loc_start': str(mRNA_feature.location.start + 1),
         'loc_end': str(mRNA_feature.location.end),
         'loc_strand': str(mRNA_feature.location.strand),
         'loc_region': str(chrom),
     }

     versioned_id = mRNA_feature.qualifiers['transcript_id'][0]
     accession, version = versioned_id.split(".")
     transcript['stable_id'] = accession
     transcript['stable_id_version'] = version

     # The biotype is derived from the three-character accession prefix;
     # unknown prefixes yield None.
     biotype_by_prefix = {
         "NM_": "protein_coding",
         "NR_": "non_protein_coding",
         "XM_": "predicted_protein_coding",
         "XR_": "predicted_non_protein_coding",
     }
     transcript['biotype'] = biotype_by_prefix.get(accession[0:3])

     transcript['session_id'] = None
     transcript['transcript_checksum'] = None
     transcript['exon_set_checksum'] = None
     transcript['loc_checksum'] = ChecksumHandler.get_location_checksum(
         transcript)
     transcript['sequence'] = sequence_handler.get_sequence_by_id(
         versioned_id)
     transcript['seq_checksum'] = ChecksumHandler.get_seq_checksum(
         transcript, 'sequence')
     return transcript
    def load_release_set(self, assembly_id, session_id, data_release_set=None):
        """Insert a ``release_set`` row and return the result of the insert.

        Args:
            assembly_id: stored (as a string) in the default mapping.
            session_id: stored (as a string) in the default mapping.
            data_release_set: optional mapping providing the statement
                parameters. When omitted, one is assembled from the default
                configuration section, today's date and the given ids.

        Returns:
            Whatever ``self.insert_data`` returns for the statement
            (presumably the release id -- confirm against ``insert_data``).

        Note:
            The mapping, whether supplied or built here, gains a
            ``release_checksum`` entry computed from all of its values at
            that moment; a supplied mapping is therefore modified in place.
        """
        if data_release_set is None:
            release_date = datetime.now().date()
            section = ConfigHandler().getInstance().get_section_config()
            data_release_set = collections.OrderedDict([
                ("shortname", section["shortname"]),
                ("description", section["description"]),
                ("assembly_id", str(assembly_id)),
                ("release_date", str(release_date)),
                ("session_id", str(session_id)),
                ("source_id", section["source"]),
            ])

        # Snapshot the values before the checksum entry itself is added.
        field_values = list(data_release_set.values())
        data_release_set["release_checksum"] = ChecksumHandler.checksum_list(
            field_values)

        insert_release_set = (
            "INSERT INTO release_set (shortname, description, assembly_id, release_date, session_id, \
                                release_checksum, source_id) VALUES \
                                (%(shortname)s,  %(description)s, %(assembly_id)s, %(release_date)s,  %(session_id)s, \
                                X%(release_checksum)s, %(source_id)s)\
                                ON DUPLICATE KEY UPDATE release_id=LAST_INSERT_ID(release_id)"
        )

        return self.insert_data(insert_release_set, data_release_set)
Esempio n. 4
0
    def get_annotated_exon(cls, seq_region, exon_feature, exon_sequence):
        """Build the exon record for one exon description.

        Args:
            seq_region: value stored (as a string) in ``loc_region``.
            exon_feature: mapping with the keys ``exon_start``,
                ``exon_end``, ``exon_strand``, ``exon_order``,
                ``exon_stable_id`` and ``exon_stable_id_version``.
            exon_sequence: value stored unchanged as ``exon_seq``.

        Returns:
            dict with assembly, location, ordering, stable id, sequence and
            checksum fields of the exon.
        """
        exon = {
            'assembly_id': cls.ASSEMBLY_ID,
            'assembly_name': cls.ASSEMBLY_NAME,
            'loc_start': exon_feature["exon_start"],
            'loc_end': exon_feature["exon_end"],
            'loc_strand': exon_feature["exon_strand"],
            'loc_region': str(seq_region),
        }
        # The location checksum is taken before the remaining fields exist.
        exon['loc_checksum'] = ChecksumHandler.get_location_checksum(exon)

        exon.update({
            'exon_order': exon_feature["exon_order"],
            'stable_id': exon_feature["exon_stable_id"],
            'stable_id_version': exon_feature["exon_stable_id_version"],
            'session_id': None,
            'exon_seq': exon_sequence,
        })
        exon['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            exon, 'exon_seq')
        exon['exon_checksum'] = ChecksumHandler.get_exon_checksum(exon)
        return exon
Esempio n. 5
0
    def get_annotated_gene(cls, chrom, gene_feature):
        """Build the gene record for one gene feature.

        Args:
            chrom: value stored (as a string) in ``loc_region``.
            gene_feature: feature exposing ``location`` (start/end/strand)
                and ``qualifiers``; the ``Dbxref`` qualifier is searched for
                the ``GeneID`` and ``HGNC:HGNC`` entries through
                ``cls.parse_qualifiers``.

        Returns:
            dict with location, stable id, assembly, HGNC id and checksum
            fields of the gene. ``stable_id_version`` is always ``1``.
        """
        # NOTE(review): unlike the transcript and exon coordinates built
        # elsewhere in this file, the start is stored without a +1 shift --
        # confirm this is intentional.
        gene = {
            'loc_start': str(gene_feature.location.start),
            'loc_end': str(gene_feature.location.end),
            'loc_strand': str(gene_feature.location.strand),
            'loc_region': str(chrom),
            'stable_id': cls.parse_qualifiers(gene_feature.qualifiers,
                                              "Dbxref", "GeneID"),
            'stable_id_version': 1,
            'assembly_id': cls.ASSEMBLY_ID,
            'assembly_name': cls.ASSEMBLY_NAME,
        }

        # NOTE(review): an earlier comment here said the HGNC id was kept as
        # None to avoid an integrity exception; the code stores the parsed
        # value whenever one is found -- confirm which is intended.
        hgnc_accession = cls.parse_qualifiers(gene_feature.qualifiers,
                                              "Dbxref", "HGNC:HGNC")
        gene['hgnc_id'] = (None if hgnc_accession is None
                           else "HGNC:" + hgnc_accession)

        gene['session_id'] = None
        gene['loc_checksum'] = ChecksumHandler.get_location_checksum(gene)
        gene['gene_checksum'] = ChecksumHandler.get_gene_checksum(gene)
        return gene
Esempio n. 6
0
 def get_annotated_transcript(cls, sequence_handler, chrom, mRNA_feature):
     """Build the transcript record for one mRNA feature.

     Args:
         sequence_handler: object whose ``get_sequence_by_id`` is called
             with the versioned transcript id.
         chrom: value stored (as a string) in ``loc_region``.
         mRNA_feature: feature exposing ``location`` (start/end/strand)
             and ``qualifiers['transcript_id']``.

     Returns:
         dict with assembly, location, stable id, sequence and checksum
         fields. ``transcript_checksum`` and ``exon_set_checksum`` are
         left as ``None`` here.

     Raises:
         ValueError: if the transcript id does not split into exactly two
             parts on ".".
     """
     transcript = {
         'assembly_id': cls.ASSEMBLY_ID,
         'assembly_name': cls.ASSEMBLY_NAME,
         # The start is shifted by one base; the end is stored as-is.
         'loc_start': str(mRNA_feature.location.start + 1),
         'loc_end': str(mRNA_feature.location.end),
         'loc_strand': str(mRNA_feature.location.strand),
         'loc_region': str(chrom),
     }

     versioned_id = mRNA_feature.qualifiers['transcript_id'][0]
     accession, version = versioned_id.split(".")
     transcript.update({
         'stable_id': accession,
         'stable_id_version': version,
         'session_id': None,
         'transcript_checksum': None,
         'exon_set_checksum': None,
     })

     transcript['loc_checksum'] = ChecksumHandler.get_location_checksum(
         transcript)
     transcript['sequence'] = sequence_handler.get_sequence_by_id(
         versioned_id)
     transcript['seq_checksum'] = ChecksumHandler.get_seq_checksum(
         transcript, 'sequence')
     return transcript
    def run(self):
        """Load the annotations of one sequence region and record completion.

        Parses the GFF file named by ``self.downloaded_files['gff']``
        (restricted by ``self.limits``), builds an annotated gene with its
        transcripts, exons and translation for every top-level feature that
        is not of type "region", and, unless ``self.dryrun`` is set, saves
        each gene that has a stable id through ``FeatureHandler``.

        On success a status file containing "Done" is written under
        ``<self.download_dir>/status_logs`` and its path is stored in
        ``self.status_file``.

        The database connection and the GFF file handle are released even
        when parsing or saving raises; in that case the exception propagates
        and no status file is written.
        """
        mydb_config = ConfigHandler().getInstance().get_section_config(section_name="DATABASE")
        dbh = DatabaseHandler(db_config=mydb_config,
                              mypool_name="mypool_" + str(self.seq_region))
        dbc = dbh.get_connection()

        try:
            sequence_handler = FastaHandler(self.downloaded_files['fasta'])

            print("Loading protein.....")
            print(self.downloaded_files['protein'])
            protein_sequence_handler = FastaHandler(self.downloaded_files['protein'])

            print("Working on Seq region limit " + str(self.seq_region))

            with open(self.downloaded_files['gff']) as gff_handle:
                # Chromosome seq level
                for rec in GFF.parse(gff_handle, limit_info=self.limits, target_lines=1000):
                    for gene_feature in rec.features:
                        # skip regions
                        if gene_feature.type == "region":
                            continue

                        annotated_gene = AnnotationHandler.get_annotated_gene(self.seq_region, gene_feature)

                        # gene level: only sub-features carrying a
                        # transcript_id are treated as transcripts
                        annotated_transcripts = []
                        for mRNA_feature in gene_feature.sub_features:
                            if 'transcript_id' not in mRNA_feature.qualifiers:
                                continue
                            annotated_transcripts.append(
                                self._build_annotated_transcript(sequence_handler,
                                                                 protein_sequence_handler,
                                                                 mRNA_feature))

                        annotated_gene['transcripts'] = annotated_transcripts
                        feature_object_to_save = {"gene": annotated_gene}

                        if not self.dryrun and annotated_gene is not None and annotated_gene['stable_id'] is not None:
                            print("About to load gene => " + str(annotated_gene['stable_id']))
                            feature_handler = FeatureHandler(parent_ids=self.parent_ids, dbc=dbc)
                            feature_handler.save_features_to_database(feature_object_to_save)
        finally:
            # Always hand the connection back, including on failure.
            dbc.close()

        print("About to write to the status file")
        status_dir = self.download_dir + '/' + 'status_logs'
        # exist_ok avoids failing when the directory is created between a
        # separate existence check and the makedirs call (e.g. by another
        # worker writing its own status file -- presumably one per region).
        os.makedirs(status_dir, exist_ok=True)
        self.status_file = status_dir + '/' + 'status_file_chr' + str(self.seq_region)
        with open(self.status_file, "w") as status_handle:
            status_handle.write("Done")

    @staticmethod
    def _collect_exon_and_cds_dicts(mRNA_feature):
        """Return ``(exon dicts, CDS dicts)`` for one transcript's sub-features."""
        refseq_exon_list = []
        refseq_cds_list = []
        for mRNA_sub_feature in mRNA_feature.sub_features:
            if 'exon' in mRNA_sub_feature.type:
                refseq_exon_list.append({
                    'exon_stable_id': str(mRNA_sub_feature.id),
                    'exon_stable_id_version': 1,  # dummy version
                    # 1-based position among the exons seen so far
                    'exon_order': len(refseq_exon_list) + 1,
                    # note that we are shifting one base here
                    'exon_start': str(mRNA_sub_feature.location.start + 1),
                    'exon_end': str(mRNA_sub_feature.location.end),
                    'exon_strand': str(mRNA_sub_feature.location.strand),
                })

            if 'CDS' in mRNA_sub_feature.type:
                refseq_cds_list.append({
                    # 1-based position among the CDS entries seen so far
                    'cds_order': len(refseq_cds_list) + 1,
                    # note that we are shifting one base here
                    'cds_start': str(mRNA_sub_feature.location.start + 1),
                    'cds_end': str(mRNA_sub_feature.location.end),
                    'cds_strand': str(mRNA_sub_feature.location.strand),
                    'cds_id': str(mRNA_sub_feature.id),
                    'protein_id': str(mRNA_sub_feature.qualifiers['protein_id'][0]),
                })
        return refseq_exon_list, refseq_cds_list

    def _build_annotated_transcript(self, sequence_handler, protein_sequence_handler, mRNA_feature):
        """Return the annotated transcript (exons, translation, checksum) for one mRNA feature."""
        transcript_id = mRNA_feature.qualifiers['transcript_id'][0]
        refseq_exon_list, refseq_cds_list = self._collect_exon_and_cds_dicts(mRNA_feature)

        annotated_transcript = AnnotationHandler.get_annotated_transcript(sequence_handler,
                                                                          self.seq_region,
                                                                          mRNA_feature)

        # add sequence and other annotations
        # NOTE(review): when the transcript has no exon sub-features, no
        # 'exons' key is set at all -- confirm downstream code tolerates it.
        if refseq_exon_list:
            annotated_exons = AnnotationHandler.get_annotated_exons(sequence_handler, self.seq_region,
                                                                    transcript_id,
                                                                    refseq_exon_list)
            if annotated_exons is not None and len(annotated_exons) > 0:
                annotated_transcript['exon_set_checksum'] = ChecksumHandler.get_exon_set_checksum(annotated_exons)
                annotated_transcript['exons'] = annotated_exons
            else:
                annotated_transcript['exons'] = []

        if refseq_cds_list:
            protein_id = refseq_cds_list[0]['protein_id']
            annotated_transcript['translation'] = AnnotationHandler.get_annotated_cds(protein_sequence_handler,
                                                                                      self.seq_region,
                                                                                      protein_id,
                                                                                      refseq_cds_list)
        else:
            annotated_transcript['translation'] = []

        # The transcript checksum is computed last, once exons and
        # translation have been attached.
        annotated_transcript['transcript_checksum'] = ChecksumHandler.get_transcript_checksum(annotated_transcript)
        return annotated_transcript