def load_release_set(self, assembly_id, session_id, data_release_set=None):
        """Insert a ``release_set`` row and return what ``insert_data`` yields.

        When ``data_release_set`` is omitted, the row is assembled from the
        default configuration section, today's date and the ids passed in.
        A checksum of the row values is then stored under
        ``"release_checksum"`` and the row is handed to ``self.insert_data``
        together with an ``INSERT ... ON DUPLICATE KEY UPDATE`` statement.

        Args:
            assembly_id: Assembly identifier; only used when the default row
                is built.
            session_id: Session identifier; only used when the default row
                is built.
            data_release_set: Optional mapping supplying the ``shortname``,
                ``description``, ``assembly_id``, ``release_date``,
                ``session_id`` and ``source_id`` statement parameters.  It is
                updated in place with ``release_checksum``.

        Returns:
            The value returned by ``self.insert_data`` (presumably the
            ``release_id`` of the new or already existing row -- confirm
            against ``insert_data``).
        """
        if data_release_set is None:
            default_config = ConfigHandler().getInstance().get_section_config()
            data_release_set = collections.OrderedDict()
            data_release_set["shortname"] = default_config["shortname"]
            data_release_set["description"] = default_config["description"]
            data_release_set["assembly_id"] = str(assembly_id)
            data_release_set["release_date"] = str(datetime.now().date())
            data_release_set["session_id"] = str(session_id)
            data_release_set["source_id"] = default_config["source"]

        # Leave any checksum already present out of the checksum input.
        # Otherwise re-submitting the same mapping would hash the previous
        # checksum too, yield a different value and miss the duplicate-key
        # lookup below.
        checksum_input = [
            value for key, value in data_release_set.items()
            if key != "release_checksum"
        ]
        data_release_set["release_checksum"] = ChecksumHandler.checksum_list(
            checksum_input)

        # X%(release_checksum)s renders as a MySQL hex literal once the
        # driver has quoted the parameter.
        insert_release_set = (
            "INSERT INTO release_set (shortname, description, assembly_id, release_date, session_id, \
                                release_checksum, source_id) VALUES \
                                (%(shortname)s,  %(description)s, %(assembly_id)s, %(release_date)s,  %(session_id)s, \
                                X%(release_checksum)s, %(source_id)s)\
                                ON DUPLICATE KEY UPDATE release_id=LAST_INSERT_ID(release_id)"
        )

        return self.insert_data(insert_release_set, data_release_set)
    def __init__(self, db_config=None, mypool_name="mypool", pool_size=32):
        """Create a MySQL connection pool and keep one connection from it.

        Args:
            db_config: Mapping with the ``user``, ``pass``, ``port``, ``host``
                and ``database`` entries.  When omitted, the ``DATABASE``
                section of the configuration is used.
            mypool_name: Name given to the connection pool.
            pool_size: Number of connections in the pool.  Defaults to 32,
                the value that used to be hard-coded (presumably the
                connector's upper limit -- confirm before raising it).

        The connection taken from the pool is stored on ``self.db_con``.
        """
        if db_config is None:
            db_config = ConfigHandler().getInstance().get_section_config(
                section_name="DATABASE")

        # Lazy %-formatting: a missing "database" entry is logged as None
        # instead of raising TypeError from str + None concatenation.
        logger.info("loading in to  %s", db_config.get("database"))

        mydbconfig = {
            "user": db_config.get("user"),
            "password": db_config.get("pass"),
            "port": db_config.get("port"),
            "host": db_config.get("host"),
            "database": db_config.get("database"),
        }

        connection_pool = mysql.connector.pooling.MySQLConnectionPool(
            pool_name=mypool_name,
            pool_size=pool_size,
            pool_reset_session=True,
            **mydbconfig)
        print(connection_pool)

        self.db_con = connection_pool.get_connection()
    def populate_parent_tables(self, init_table_list=None):
        """Load the parent rows that feature rows refer to and return their ids.

        Args:
            init_table_list: Names of the parent tables to populate.  Defaults
                to ``session``, ``genome``, ``assembly``, ``assembly_alias``
                and ``release_source``.  Ids of tables that are left out stay
                ``None`` and are passed on as the string ``"None"`` to the
                loaders that depend on them.

        Returns:
            dict: One entry per populated table (``session_id``,
            ``genome_id``, ``assembly_id``, ``assembly_alias_id``,
            ``release_source_ensembl``, ``release_source_refseq``) plus
            ``release_id`` for the release set, which is always loaded.
        """
        if init_table_list is None:
            init_table_list = [
                "session", "genome", "assembly", "assembly_alias",
                "release_source"
            ]

        session_id = None
        genome_id = None
        assembly_id = None
        # Bound up front so the release-set step below cannot hit an
        # UnboundLocalError when "release_source" is not requested.
        release_source_refseq = None

        parent_ids = {}
        if "session" in init_table_list:
            session_id = self.start_session("Refseq Client " +
                                            str(time.time()))
            parent_ids['session_id'] = session_id
            print(".........Popultating SESSION table.........\n")

        if "genome" in init_table_list:
            genome_data = {
                "name": "homo_sapiens",
                "tax_id": str(9606),
                "session_id": str(session_id)
            }
            genome_id = self.load_genome(genome_data)
            parent_ids['genome_id'] = genome_id
            print(".........Popultating GENOME table.........\n")

        if "assembly" in init_table_list:
            assembly_data = {
                "genome_id": str(genome_id),
                "assembly_name": "GRCh38",
                "session_id": str(session_id)
            }
            assembly_id = self.load_assembly(assembly_data)
            parent_ids['assembly_id'] = assembly_id
            logger.info(".........Popultating ASSEMBLY table.........\n")

        if "assembly_alias" in init_table_list:
            assembly_alias_data = {
                "alias": "GCA_000001405.25",
                "genome_id": str(genome_id),
                "assembly_id": str(assembly_id),
                "session_id": str(session_id)
            }
            assembly_alias_id = self.load_assembly_alias(assembly_alias_data)
            parent_ids['assembly_alias_id'] = assembly_alias_id
            logger.info(".........Popultating ASSEMBLY ALIAS table.........\n")

        if "release_source" in init_table_list:
            release_source = {
                "shortname": "Ensembl",
                "description": "Ensembl data imports from Human Core DBs"
            }
            release_source_ensembl = self.load_release_source(release_source)
            parent_ids['release_source_ensembl'] = release_source_ensembl
            logger.info(".........Popultating RELEASE SOURCE table.........\n")

            release_source = {
                "shortname": "RefSeq",
                "description": "RefSeq data imports from RefSeq GFF"
            }
            release_source_refseq = self.load_release_source(release_source)
            parent_ids['release_source_refseq'] = release_source_refseq
            logger.info(".........Popultating REFSEQ table.........\n")

        # load data_release_set
        today = datetime.now().date()
        default_config = ConfigHandler().getInstance().get_section_config()
        if release_source_refseq is not None:
            source_id = str(release_source_refseq)
        else:
            # No RefSeq source row was loaded in this call: use the configured
            # source, the same default load_release_set applies on its own.
            source_id = default_config["source"]

        data_release_set = collections.OrderedDict()
        data_release_set["shortname"] = default_config["shortname"]
        data_release_set["description"] = default_config["description"]
        data_release_set["assembly_id"] = str(assembly_id)
        data_release_set["release_date"] = str(today)
        data_release_set["session_id"] = str(session_id)
        data_release_set["source_id"] = source_id
        release_set_id = self.load_release_set(assembly_id, session_id,
                                               data_release_set)
        parent_ids['release_id'] = release_set_id

        return parent_ids
Exemple #4
0
class AnnotationHandler(object):
    """Turn parsed GFF features into plain annotation dictionaries.

    Every ``get_annotated_*`` method returns a ``dict`` carrying the location
    (``loc_start``, ``loc_end``, ``loc_strand``, ``loc_region``), the stable
    id and the checksums computed by ``ChecksumHandler``.  ``session_id`` is
    always left as ``None`` by this class.
    """

    # Read once, when the class body is executed.
    ASSEMBLY_ID = ConfigHandler().getInstance().get_section_config(
    )["assembly_id"]
    ASSEMBLY_NAME = ConfigHandler().getInstance().get_section_config(
    )["assembly_name"]

    @classmethod
    def get_annotated_gene(cls, chrom, gene_feature):
        """Build the gene dictionary for one gene feature.

        Args:
            chrom: Sequence region the gene lies on; stored as a string.
            gene_feature: Feature exposing ``location`` (``start``, ``end``,
                ``strand``) and ``qualifiers``.

        Returns:
            dict: Gene annotation.  ``stable_id`` is the ``GeneID`` entry of
            the ``Dbxref`` qualifier (``None`` when absent) and ``hgnc_id``
            is ``"HGNC:<id>"`` or ``None``.
        """
        gene = {}
        # NOTE(review): unlike transcripts and exons, the gene start is not
        # shifted by one base here -- confirm whether that is intended.
        gene['loc_start'] = str(gene_feature.location.start)
        gene['loc_end'] = str(gene_feature.location.end)
        gene['loc_strand'] = str(gene_feature.location.strand)
        gene['loc_region'] = str(chrom)
        gene['stable_id'] = cls.parse_qualifiers(gene_feature.qualifiers,
                                                 "Dbxref", "GeneID")
        gene['stable_id_version'] = 1
        gene['assembly_id'] = cls.ASSEMBLY_ID
        gene['assembly_name'] = cls.ASSEMBLY_NAME
        # Original note: make it None for the moment, otherwise you will get
        # an integrity exception (it is unclear whether this refers to
        # hgnc_id or session_id -- confirm).
        hgnc_id = cls.parse_qualifiers(gene_feature.qualifiers, "Dbxref",
                                       "HGNC:HGNC")
        if hgnc_id is not None:
            hgnc_id = "HGNC:" + hgnc_id
        gene['hgnc_id'] = hgnc_id

        gene['session_id'] = None
        gene['loc_checksum'] = ChecksumHandler.get_location_checksum(gene)
        gene['gene_checksum'] = ChecksumHandler.get_gene_checksum(gene)
        return gene

    @classmethod
    def get_annotated_transcript(cls, sequence_handler, chrom, mRNA_feature):
        """Build the transcript dictionary for one mRNA feature.

        Args:
            sequence_handler: Object whose ``get_sequence_by_id`` returns the
                transcript sequence for a versioned transcript id.
            chrom: Sequence region the transcript lies on.
            mRNA_feature: Feature exposing ``location`` and a
                ``transcript_id`` qualifier of the form ``<id>.<version>``.

        Returns:
            dict: Transcript annotation.  ``transcript_checksum`` and
            ``exon_set_checksum`` are left as ``None`` here.

        Raises:
            ValueError: If the transcript id does not contain exactly one
                ``"."``.
        """
        versioned_id = mRNA_feature.qualifiers['transcript_id'][0]
        (transcript_stable_id,
         transcript_stable_id_version) = versioned_id.split(".")

        transcript = {}
        transcript['assembly_id'] = cls.ASSEMBLY_ID
        transcript['assembly_name'] = cls.ASSEMBLY_NAME
        # Note we have shifted one base here
        transcript['loc_start'] = str(mRNA_feature.location.start + 1)
        transcript['loc_end'] = str(mRNA_feature.location.end)
        transcript['loc_strand'] = str(mRNA_feature.location.strand)
        transcript['loc_region'] = str(chrom)
        transcript['stable_id'] = transcript_stable_id
        transcript['stable_id_version'] = transcript_stable_id_version
        transcript['session_id'] = None
        transcript['transcript_checksum'] = None
        transcript['exon_set_checksum'] = None
        transcript['loc_checksum'] = ChecksumHandler.get_location_checksum(
            transcript)
        transcript['sequence'] = sequence_handler.get_sequence_by_id(
            versioned_id)
        transcript['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            transcript, 'sequence')
        return transcript

    @classmethod
    def get_annotated_exons(cls, sequence_handler, seq_region,
                            transcript_identifier, refseq_exon_list):
        """Attach sequences to the exons of one transcript.

        The exon sequences are fetched with the coordinates returned by
        ``ExonUtils.compute_exon_coordinates``, while the annotated exons
        keep the coordinates from ``refseq_exon_list``.

        Ref: BioSeqFeature -- start and end location numbering follow
        Python's scheme, thus a GenBank entry of 123..150 (one based
        counting) becomes a location of [122:150] (zero based counting).

        Args:
            sequence_handler: Object providing
                ``get_seq_record_by_id_location``.
            seq_region: Sequence region the exons lie on.
            transcript_identifier: Id of the transcript owning the exons.
            refseq_exon_list: List of exon dictionaries with the
                ``exon_start``, ``exon_end``, ``exon_strand``,
                ``exon_order``, ``exon_stable_id`` and
                ``exon_stable_id_version`` keys.

        Returns:
            list of dict, or ``None`` when the number of fetched sequences
            differs from the number of exons.
        """
        relative_exons = ExonUtils.compute_exon_coordinates(
            refseq_exon_list.copy())

        exon_sequences = [
            str(sequence_handler.get_seq_record_by_id_location(
                transcript_identifier, exon['exon_start'], exon['exon_end'],
                int(exon['exon_strand'])))
            for exon in relative_exons
        ]

        # zip() below would silently drop the surplus, so refuse mismatches.
        if len(refseq_exon_list) != len(exon_sequences):
            return None

        return [
            cls.get_annotated_exon(seq_region, exon_feature, exon_sequence)
            for exon_feature, exon_sequence in zip(refseq_exon_list,
                                                   exon_sequences)
        ]

    @classmethod
    def get_annotated_exon(cls, seq_region, exon_feature, exon_sequence):
        """Build the exon dictionary for one exon and its sequence.

        Args:
            seq_region: Sequence region the exon lies on.
            exon_feature: Exon dictionary as described in
                ``get_annotated_exons``.
            exon_sequence: Sequence of the exon.

        Returns:
            dict: Exon annotation including ``seq_checksum`` and
            ``exon_checksum``.
        """
        exon = {}
        exon['assembly_id'] = cls.ASSEMBLY_ID
        exon['assembly_name'] = cls.ASSEMBLY_NAME
        exon['loc_start'] = exon_feature["exon_start"]
        exon['loc_end'] = exon_feature["exon_end"]
        exon['loc_strand'] = exon_feature["exon_strand"]
        exon['loc_region'] = str(seq_region)
        # Computed before the remaining keys are added, as in the original.
        exon['loc_checksum'] = ChecksumHandler.get_location_checksum(exon)
        exon['exon_order'] = exon_feature["exon_order"]
        exon['stable_id'] = exon_feature["exon_stable_id"]
        exon['stable_id_version'] = exon_feature["exon_stable_id_version"]
        exon['session_id'] = None
        exon['exon_seq'] = exon_sequence
        exon['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            exon, 'exon_seq')
        exon['exon_checksum'] = ChecksumHandler.get_exon_checksum(exon)

        return exon

    @classmethod
    def get_annotated_cds(cls, protein_sequence_handler, seq_region,
                          protein_id, cds_list):
        """Build the translation dictionary for the CDS parts of a transcript.

        Args:
            protein_sequence_handler: Object whose ``get_fasta_seq_by_id``
                returns the protein sequence for a protein id.
            seq_region: Sequence region the translation lies on.
            protein_id: Kept for interface compatibility; the id actually
                used is the ``protein_id`` of the first entry of
                ``cds_list``.
            cds_list: Non-empty list of CDS dictionaries with the
                ``cds_start``, ``cds_end``, ``cds_strand``, ``cds_order`` and
                ``protein_id`` keys.

        Returns:
            dict: Translation annotation including ``seq_checksum``,
            ``loc_checksum`` and ``translation_checksum``.

        Raises:
            IndexError: If ``cds_list`` is empty.
            ValueError: If the protein id does not contain exactly one
                ``"."``.
        """
        first_cds = cds_list[0]
        cds_strand = first_cds['cds_strand']
        protein_id = first_cds['protein_id']
        (stable_id, stable_id_version) = protein_id.split(".")

        (translation_start,
         translation_end) = cls.get_translation_loc(cds_list)

        translation = {}
        translation['assembly_id'] = cls.ASSEMBLY_ID
        translation['assembly_name'] = cls.ASSEMBLY_NAME
        translation['stable_id'] = stable_id
        translation['stable_id_version'] = stable_id_version
        translation['loc_start'] = translation_start
        translation['loc_end'] = translation_end
        translation['loc_strand'] = cds_strand
        translation['loc_region'] = seq_region
        translation['translation_seq'] = (
            protein_sequence_handler.get_fasta_seq_by_id(protein_id))
        translation['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            translation, 'translation_seq')
        translation['session_id'] = None
        translation['loc_checksum'] = ChecksumHandler.get_location_checksum(
            translation)
        translation['translation_checksum'] = (
            ChecksumHandler.get_translation_checksum(translation))

        return translation

    @classmethod
    def get_translation_loc(cls, cds_list):
        """Return the ``(start, end)`` spanned by the CDS parts.

        On strand ``1`` the start comes from the CDS with order 1 and the end
        from the CDS with the highest order; on strand ``-1`` it is the other
        way round.  The strand is taken from the first list entry.

        Args:
            cds_list: List of CDS dictionaries with the ``cds_start``,
                ``cds_end``, ``cds_strand`` and ``cds_order`` keys.

        Returns:
            tuple: ``(start, end)``, or ``(0, 0)`` when the list is empty,
            the strand is neither ``1`` nor ``-1``, or the boundary CDS
            entries cannot be found.
        """
        if not cds_list:
            return (0, 0)

        last_order = len(cds_list)
        # str() lets an integer strand behave like its string form.
        strand = str(cds_list[0]['cds_strand'])
        if strand == '1':
            start_order, end_order = 1, last_order
        elif strand == '-1':
            start_order, end_order = last_order, 1
        else:
            # Previously this fell through to an UnboundLocalError.
            return (0, 0)

        cds_start = [
            part['cds_start'] for part in cds_list
            if part['cds_order'] == start_order
        ]
        cds_end = [
            part['cds_end'] for part in cds_list
            if part['cds_order'] == end_order
        ]

        if cds_start and cds_end:
            return (cds_start[0], cds_end[0])
        return (0, 0)

    @classmethod
    def parse_qualifiers(cls, qualifiers, key_qualifier, attr=None):
        """Return the value of ``attr`` within a multi-valued qualifier.

        Each value of ``qualifiers[key_qualifier]`` is matched, ignoring
        case, against ``<attr>:<value>`` from its beginning; the first
        non-empty ``<value>`` wins.

        Args:
            qualifiers: Mapping from qualifier name to a list of strings.
            key_qualifier: Qualifier to look into, e.g. ``"Dbxref"``.
            attr: Attribute prefix to look for, e.g. ``"GeneID"``.  It is
                matched literally.  With ``None`` nothing is searched.

        Returns:
            str or None: The matched value, or ``None`` when there is none.
        """
        if attr is None or key_qualifier not in qualifiers:
            return None

        # Escape attr so characters such as "." or "(" are not interpreted
        # as regular-expression syntax.
        pattern = re.compile(re.escape(attr) + ":" + "(.*)", re.M | re.I)
        for cur_qualifier in qualifiers[key_qualifier]:
            match_obj = pattern.match(cur_qualifier)
            if match_obj and match_obj.group(1):
                return str(match_obj.group(1))
        return None

    @classmethod
    def get_seq_region_from_refseq_accession(cls, refseq_accession):
        """Map an ``NC_<number>.<version>`` accession to a sequence region.

        Args:
            refseq_accession: Accession such as ``"NC_000001.11"``.

        Returns:
            ``"X"`` for number 23, ``"Y"`` for number 24, the number as an
            ``int`` for anything else, and ``None`` when the accession does
            not match.  NOTE(review): other accessions such as
            ``NC_012920.1`` therefore come back as their bare number.
        """
        match_obj = re.match(r'NC_(\d+)\.\d+', refseq_accession,
                             re.M | re.I)  # @IgnorePep8

        if not (match_obj and match_obj.group(1)):
            return None

        chrom = int(match_obj.group(1))
        if chrom == 23:
            return "X"
        if chrom == 24:
            return "Y"
        return chrom

    @classmethod
    def add_feature_sequence(cls,
                             fasta_handler,
                             feature_locations,
                             feature_id,
                             feature_type='exon'):
        """Add a ``<feature_type>_seq`` entry to every feature dictionary.

        Args:
            fasta_handler: Object whose ``get_fasta_seq_by_id`` accepts an
                id, a start and an end.
            feature_locations: Iterable of dictionaries holding
                ``<feature_type>_start`` and ``<feature_type>_end``.  The
                dictionaries are updated in place.
            feature_id: Id of the sequence the features are cut from.
            feature_type: Key prefix, ``'exon'`` by default.

        Returns:
            list: The same dictionaries, in input order.
        """
        start_key = feature_type + '_start'
        end_key = feature_type + '_end'
        seq_key = feature_type + '_seq'

        features_with_seq = []
        for feature in feature_locations:
            feature[seq_key] = fasta_handler.get_fasta_seq_by_id(
                feature_id, feature[start_key], feature[end_key])
            features_with_seq.append(feature)
        return features_with_seq
    def run(self):
        """Parse the GFF records of ``self.seq_region`` and store their genes.

        For every non-``region`` feature of the parsed records a gene
        annotation is built, together with the transcripts that carry a
        ``transcript_id`` qualifier, their exons and their translation.
        Unless ``self.dryrun`` is set, genes with a stable id are saved
        through ``FeatureHandler``.  Finally a status file containing
        ``Done`` is written to ``<download_dir>/status_logs`` and its path is
        stored on ``self.status_file``.

        The database connection and the GFF file handle are released even
        when parsing or saving raises.
        """
        mydb_config = ConfigHandler().getInstance().get_section_config(section_name="DATABASE")
        dbh = DatabaseHandler(db_config=mydb_config,
                              mypool_name="mypool_" + str(self.seq_region))
        dbc = dbh.get_connection()

        try:
            sequence_handler = FastaHandler(self.downloaded_files['fasta'])

            print("Loading protein.....")
            print(self.downloaded_files['protein'])
            protein_sequence_handler = FastaHandler(self.downloaded_files['protein'])

            print("Working on Seq region limit " + str(self.seq_region))

            with open(self.downloaded_files['gff']) as gff_handle:
                # Chromosome seq level
                for rec in GFF.parse(gff_handle, limit_info=self.limits, target_lines=1000):

                    for gene_feature in rec.features:

                        # skip regions
                        if gene_feature.type == "region":
                            continue

                        annotated_gene = AnnotationHandler.get_annotated_gene(self.seq_region, gene_feature)

                        # gene level
                        annotated_transcripts = []
                        for mRNA_feature in gene_feature.sub_features:

                            # Sub-features without a transcript id are skipped.
                            if 'transcript_id' not in mRNA_feature.qualifiers:
                                continue
                            transcript_id = mRNA_feature.qualifiers['transcript_id'][0]

                            refseq_exon_list = []
                            refseq_cds_list = []
                            for mRNA_sub_feature in mRNA_feature.sub_features:
                                if 'exon' in mRNA_sub_feature.type:
                                    refseq_exon_list.append({
                                        'exon_stable_id': str(mRNA_sub_feature.id),
                                        'exon_stable_id_version': 1,  # dummy version
                                        'exon_order': len(refseq_exon_list) + 1,
                                        # note that we are shifting one base here
                                        'exon_start': str(mRNA_sub_feature.location.start + 1),
                                        'exon_end': str(mRNA_sub_feature.location.end),
                                        'exon_strand': str(mRNA_sub_feature.location.strand),
                                    })

                                if 'CDS' in mRNA_sub_feature.type:
                                    refseq_cds_list.append({
                                        'cds_order': len(refseq_cds_list) + 1,
                                        # note that we are shifting one base here
                                        'cds_start': str(mRNA_sub_feature.location.start + 1),
                                        'cds_end': str(mRNA_sub_feature.location.end),
                                        'cds_strand': str(mRNA_sub_feature.location.strand),
                                        'cds_id': str(mRNA_sub_feature.id),
                                        'protein_id': str(mRNA_sub_feature.qualifiers['protein_id'][0]),
                                    })

                            annotated_transcript = AnnotationHandler.get_annotated_transcript(sequence_handler,
                                                                                              self.seq_region,
                                                                                              mRNA_feature)

                            # add sequence and other annotations
                            # NOTE(review): a transcript without any exon
                            # sub-feature gets no 'exons' key at all, as before.
                            if refseq_exon_list:
                                annotated_exons = AnnotationHandler.get_annotated_exons(sequence_handler,
                                                                                        self.seq_region,
                                                                                        transcript_id,
                                                                                        refseq_exon_list)
                                if annotated_exons:
                                    annotated_transcript['exon_set_checksum'] = \
                                        ChecksumHandler.get_exon_set_checksum(annotated_exons)
                                    annotated_transcript['exons'] = annotated_exons
                                else:
                                    annotated_transcript['exons'] = []

                            if refseq_cds_list:
                                protein_id = refseq_cds_list[0]['protein_id']
                                annotated_transcript['translation'] = AnnotationHandler.get_annotated_cds(
                                    protein_sequence_handler,
                                    self.seq_region,
                                    protein_id,
                                    refseq_cds_list)
                            else:
                                annotated_transcript['translation'] = []

                            annotated_transcript['transcript_checksum'] = \
                                ChecksumHandler.get_transcript_checksum(annotated_transcript)
                            annotated_transcripts.append(annotated_transcript)

                        annotated_gene['transcripts'] = annotated_transcripts
                        feature_object_to_save = {"gene": annotated_gene}

                        if not self.dryrun and annotated_gene is not None and annotated_gene['stable_id'] is not None:
                            print("About to load gene => " + str(annotated_gene['stable_id']))
                            feature_handler = FeatureHandler(parent_ids=self.parent_ids, dbc=dbc)
                            feature_handler.save_features_to_database(feature_object_to_save)
        finally:
            # Always hand the connection back, also on failure.
            dbc.close()

        print("About to write to the status file")
        status_dir = self.download_dir + '/' + 'status_logs'
        # exist_ok avoids a failure when another task creates the directory
        # between a separate existence check and the makedirs call.
        os.makedirs(status_dir, exist_ok=True)
        self.status_file = status_dir + '/' + 'status_file_chr' + str(self.seq_region)
        with open(self.status_file, "w") as status_handle:
            status_handle.write("Done")
    def requires(self):
        """Yield one ``ParseRecord`` task per selected ``NC_`` accession.

        The paths of the GFF, FASTA and protein files are derived from the
        ``DEFAULT`` configuration section by dropping the last file
        extension and prefixing ``self.download_dir``.  Unless
        ``self.dryrun`` is set, the parent tables are populated first and
        their ids are passed on to every task.

        ``self.limit_chr`` may hold a comma separated list of sequence
        regions; accessions whose region is not listed are skipped.

        Yields:
            ParseRecord: A task restricted to one accession through its
            ``limits`` argument.
        """
        mydefault_config = ConfigHandler().getInstance().get_section_config(section_name="DEFAULT")

        # Only the name without its last extension is needed.
        gff_filename = os.path.splitext(mydefault_config['gff_file'])[0]
        fasta_filename = os.path.splitext(mydefault_config['fasta_file'])[0]
        protein_filename = os.path.splitext(mydefault_config['protein_file'])[0]

        downloaded_files = {}
        downloaded_files['gff'] = self.download_dir + "/" + gff_filename
        downloaded_files['fasta'] = self.download_dir + "/" + fasta_filename
        downloaded_files['protein'] = self.download_dir + "/" + protein_filename

        # load the parent tables
        parent_ids = None

        if not self.dryrun:
            mydb_config = ConfigHandler().getInstance().get_section_config(section_name="DATABASE")
            dbh = DatabaseHandler(db_config=mydb_config,
                                  mypool_name="mypool_parentids")
            print(dbh)
            feature_handler = FeatureHandler(dbc=dbh.get_connection())
            parent_ids = feature_handler.populate_parent_tables()

        print(downloaded_files['gff'])

        # The accessions could also be discovered with GFF.GFFExaminer
        # (available_limits(...)["gff_id"]); they are listed here to save
        # the time needed to scan the file.
        chromosomes = [
            ('NC_000001.11',),
            ('NC_000002.12',),
            ('NC_000003.12',),
            ('NC_000004.12',),
            ('NC_000005.10',),
            ('NC_000006.12',),
            ('NC_000007.14',),
            ('NC_000008.11',),
            ('NC_000009.12',),
            ('NC_000010.11',),
            ('NC_000011.10',),
            ('NC_000012.12',),
            ('NC_000013.11',),
            ('NC_000014.9',),
            ('NC_000015.10',),
            ('NC_000016.10',),
            ('NC_000017.11',),
            ('NC_000018.10',),
            ('NC_000019.10',),
            ('NC_000020.11',),
            ('NC_000021.9',),
            ('NC_000022.11',),
            ('NC_000023.11',),
            ('NC_000024.10',),
            ('NC_012920.1',),
        ]

        # Restrict only for filter_region.  The filter does not depend on the
        # accession, so it is computed once; splitting a value without a
        # comma yields the single-element list used before.
        filter_regions = None
        if self.limit_chr is not None:
            filter_regions = self.limit_chr.split(',')

        for chrom_tuple in chromosomes:
            chrom = chrom_tuple[0]
            if not chrom.startswith("NC_"):
                continue
            print(chrom_tuple)

            seq_region = self.get_seq_region_from_refseq_accession(chrom)

            if filter_regions is not None and str(seq_region) not in filter_regions:
                print(" Skipping " + str(seq_region))
                continue

            # A fresh dict per task: sharing one dict and overwriting its
            # "gff_id" entry could leave earlier tasks pointing at the
            # accession of a later one.
            yield ParseRecord(
                download_dir=self.download_dir,
                downloaded_files=downloaded_files,
                seq_region=str(seq_region),
                parent_ids=parent_ids,
                limits={"gff_id": chrom_tuple},
                dryrun=self.dryrun
            )
class DownloadRefSeqSourceFiles(luigi.WrapperTask):
    """
    Wrapper task requiring the download and the unzip task of every
    configured RefSeq source file (GFF, FASTA and protein file).

    The configuration is read, and echoed to stdout, while the class body is
    executed.
    """

    download_dir = luigi.Parameter()
    task_namespace = 'DownloadRefSeqSourceFiles'

    # Release description taken from the default configuration section.
    assembly_id = ConfigHandler().getInstance().get_section_config()["assembly_id"]
    assembly_name = ConfigHandler().getInstance().get_section_config()["assembly_name"]
    source = ConfigHandler().getInstance().get_section_config()["source"]
    shortname = ConfigHandler().getInstance().get_section_config()["shortname"]
    description = ConfigHandler().getInstance().get_section_config()["description"]
    # print() joins its arguments with a single space, which reproduces the
    # "<label> <value>" lines exactly.
    print("Assembly ID", assembly_id)
    print("Assembly Name", assembly_name)
    print("source name", source)
    print("shortname", shortname)
    print("description ", description)

    # Location and names of the files to fetch, e.g.
    #   ftp_root     https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.38_GRCh38.p12/
    #   gff_file     GCF_000001405.38_GRCh38.p12_genomic.gff.gz
    #   fasta_file   GCF_000001405.38_GRCh38.p12_rna.fna.gz
    #   protein_file GCF_000001405.38_GRCh38.p12_protein.faa.gz
    ftp_root = ConfigHandler().getInstance().get_section_config()["ftp_root"]
    gff_file = ConfigHandler().getInstance().get_section_config()["gff_file"]
    fasta_file = ConfigHandler().getInstance().get_section_config()["fasta_file"]
    protein_file = ConfigHandler().getInstance().get_section_config()["protein_file"]

    print("ftp_root", ftp_root)
    print("gff_file", gff_file)
    print("fasta_file", fasta_file)
    print("protein_file", protein_file)

    files_to_download = [gff_file, fasta_file, protein_file]

    def complete(self):
        """Return True when every file exists both zipped and unzipped.

        The zipped file is looked up as ``<download_dir>/<file>``, the
        unzipped one as ``<download_dir>/<basename without last extension>``.
        """
        def is_present(file_):
            zipped_path = self.download_dir + '/' + file_
            unzipped_name = os.path.splitext(os.path.basename(file_))[0]
            unzipped_path = self.download_dir + '/' + unzipped_name
            return os.path.exists(zipped_path) and os.path.exists(unzipped_path)

        return all(is_present(file_) for file_ in self.files_to_download)

    def requires(self):
        """Yield, per file, its download task followed by its unzip task."""
        for file_ in self.files_to_download:
            task_kwargs = {
                "download_dir": self.download_dir,
                "file_to_download": file_,
                "ftp_root": self.ftp_root,
            }
            for task_class in (DownloadRefSeqSourceFile, UnzipRefSeqFile):
                yield task_class(**task_kwargs)