def get_interpro_scan_last_row_id(db_config):
    """Look up the largest ``interpro_scan_ID`` stored in InterProScan.

    A connection to the dots database (name derived from
    ``db_config.db_prefix``) is opened and the MAX query is handed to
    ``get_max_table_value``.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :return: the value produced by ``get_max_table_value`` for the query.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    max_id_query = "SELECT MAX(interpro_scan_ID) AS LAST_ID FROM InterProScan"
    return get_max_table_value(connection, max_id_query)
def get_gene_name_map(db_config, taxonomy_id, org_version):
    """Build a gene-name -> gene_instance_id lookup for one organism version.

    The query joins Protein, GeneInstance, NAFeatureImp (mRNA features only)
    and NASequenceImp, restricted to the given taxon id and sequence version.
    Every result row contributes two keys that point at the same
    ``gene_instance_id``: the protein name after ``modify_gene_name`` and the
    feature name as stored.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param taxonomy_id: value substituted for ``na.taxon_id``.
    :param org_version: value substituted for ``na.sequence_version``.
    :return: dict mapping names to gene instance ids.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )

    # NOTE(review): the two values are interpolated straight into the SQL
    # text; callers are assumed to pass trusted identifiers.
    query_template = (
        "select p.name as 'name1',naf.name as 'name2', gi.gene_instance_id "
        "from Protein p, GeneInstance gi, NAFeatureImp naf, NASequenceImp na "
        "where gi.gene_instance_id = p.gene_instance_id "
        "and naf.na_feature_id = gi.na_feature_id "
        "and naf.feature_type='mRNA' "
        "and na.na_sequence_id = naf.na_sequence_id "
        "and na.taxon_id = %s and na.sequence_version = %s"
    )
    statement = query_template % (taxonomy_id, org_version)

    name_to_instance = {}
    for row in connection.query(statement):
        protein_name, feature_name, instance_id = (
            row['name1'],
            row['name2'],
            row['gene_instance_id'],
        )
        # Both spellings resolve to the same gene instance.
        name_to_instance[modify_gene_name(protein_name)] = instance_id
        name_to_instance[feature_name] = instance_id
    return name_to_instance
def get_sam_alignment_last_row_id(db_config):
    """Look up the largest ``SAM_ALIGNMENT_ID`` stored in SamAlignment.

    A connection to the dots database (name derived from
    ``db_config.db_prefix``) is opened and the MAX query is handed to
    ``get_max_table_value``.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :return: the value produced by ``get_max_table_value`` for the query.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    max_id_query = "SELECT MAX(SAM_ALIGNMENT_ID) AS LAST_ID FROM SamAlignment"
    return get_max_table_value(connection, max_id_query)
def upload_protein_feature_table_data(db_config, upload_dir_names):
    """Bulk-load the PFam, SignalP and TMHMM upload files into the dots DB.

    One ``LOAD DATA LOCAL INFILE`` statement is issued per table, in the
    order HmmPfam, SignalP, Tmhmm. HmmPfam and Tmhmm name their target
    columns explicitly; SignalP is loaded without a column list.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param upload_dir_names: object whose ``PFam``, ``SignalP`` and ``TmHmm``
        attributes hold the paths of the files to load.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    # Fragments shared by all three statements.
    statement_head = "LOAD DATA LOCAL INFILE '%s' INTO TABLE "
    field_format = (
        " FIELDS TERMINATED BY '\t' OPTIONALLY ENCLOSED BY '\"'"
        " LINES TERMINATED BY '\n'"
    )

    # (table, trailing column clause, file to load)
    load_jobs = (
        (
            "HmmPfam",
            " (`PFAM_ID`, `GENE_INSTANCE_ID`, `E_VALUE`, `SCORE`, `BIAS`,"
            " `ACCESSION_ID`, `DOMAIN_NAME`, `DOMAIN_DESCRIPTION`) ",
            upload_dir_names.PFam,
        ),
        (
            "SignalP",
            "",
            upload_dir_names.SignalP,
        ),
        (
            "Tmhmm",
            " (`TMHMM_ID`, `GENE_INSTANCE_ID`, `INSIDE`, `OUTSIDE`,"
            " `TMHELIX`)",
            upload_dir_names.TmHmm,
        ),
    )

    for table, column_clause, upload_file in load_jobs:
        template = statement_head + table + field_format + column_clause
        connection.insert(template % upload_file)
def create_sequence_file_for_gal_using_dct(db_config, path):
    """Write one FASTA file per (taxon id, sequence version) pair.

    All NASequenceImp rows with ``SEQUENCE_TYPE_ID = 1`` are fetched and
    grouped by ``TAXON_ID`` and ``SEQUENCE_VERSION``. Each group is written
    to ``<taxon>_<version>.fasta`` inside ``path``; every record header has
    the form ``>{na_sequence_id}_{taxon}_{version}``.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param path: directory in which the FASTA files are created.
    :return: list of the file names written (bare names, not joined with
        ``path``).
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    rows = connection.query(
        "SELECT * FROM NASequenceImp WHERE SEQUENCE_TYPE_ID =1 "
        "order by taxon_id, sequence_version"
    )

    # taxon id -> sequence version -> na_sequence_id -> sequence
    grouped = {}
    for row in rows:
        by_version = grouped.setdefault(row['TAXON_ID'], {})
        by_sequence = by_version.setdefault(row['SEQUENCE_VERSION'], {})
        by_sequence[row['NA_SEQUENCE_ID']] = row['SEQUENCE']

    written = []
    for taxon, by_version in grouped.items():
        for version, by_sequence in by_version.items():
            fasta_name = "{}_{}.fasta".format(taxon, version)
            fasta_path = os.path.join(path, fasta_name)
            written.append(fasta_name)
            with open(fasta_path, 'w') as handle:
                handle.writelines(
                    ">{}_{}_{}\n{}\n".format(seq_id, taxon, version, seq)
                    for seq_id, seq in by_sequence.items()
                )
    return written
def get_organism_hierarchy_map(db_config):
    """Collect the taxonomic ranks of every row in the Organism table.

    Results are keyed by ``"<TAXON_ID>_<VERSION>.fasta"``. Each entry stores
    GENUS and PHYLUM as read, plus FAMILY, ORDERS and CLASS, where the
    literal string ``'None'`` is converted to a real ``None``.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :return: nested ``defaultdict`` mapping file-style key -> rank -> value.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    rows = connection.query("SELECT * FROM Organism")

    # Ranks whose database value may be the text 'None'.
    nullable_ranks = ('FAMILY', 'ORDERS', 'CLASS')

    hierarchy = defaultdict(lambda: defaultdict())
    for row in rows:
        entry = hierarchy["{}_{}.fasta".format(row['TAXON_ID'], row['VERSION'])]
        entry['GENUS'] = row['GENUS']
        entry['PHYLUM'] = row['PHYLUM']
        for rank in nullable_ranks:
            rank_value = row[rank]
            entry[rank] = None if rank_value == 'None' else rank_value
    return hierarchy
def upload_interpro_data(db_config, interpro_data):
    """Bulk-load an InterProScan upload file into the InterProScan table.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param interpro_data: path of the tab-separated file to load.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    # Target table: InterProScan
    load_statement = (
        "LOAD DATA LOCAL INFILE '%s' INTO TABLE InterProScan "
        "FIELDS TERMINATED BY '\t' "
        "OPTIONALLY ENCLOSED BY '\"' "
        "LINES TERMINATED BY '\n';"
    ) % interpro_data
    connection.insert(load_statement)
def upload_sam_data(db_config, sam_data):
    """Bulk-load a SAM alignment upload file into the SamAlignment table.

    The generated statement is echoed to stdout before it is executed.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param sam_data: path of the tab-separated file to load.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    # Target table: SamAlignment
    load_statement = (
        "LOAD DATA LOCAL INFILE '%s' INTO TABLE SamAlignment "
        "FIELDS TERMINATED BY '\t' "
        "OPTIONALLY ENCLOSED BY '\"' "
        "LINES TERMINATED BY '\n';"
    ) % sam_data
    print(load_statement)
    connection.insert(load_statement)
def __init__(self, db_config, content_dir):
    """Initialise schema paths, create the three databases and connect.

    After the ``DefaultSchemaPath`` initialiser runs, ``create`` is invoked
    for the core, dots and shared-resource database names (in that order)
    and the results are kept on the instance. Connections to the dots and
    shared-resource databases are then opened.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param content_dir: forwarded unchanged to ``DefaultSchemaPath``.
    """
    DefaultSchemaPath.__init__(self, content_dir)

    host = db_config.host
    user = db_config.db_username
    password = db_config.db_password

    self.db = db_function.DatabaseCreate(host, user, password)
    self.db_name = db_function.DbNames(db_config.db_prefix)

    # Create each database; the order matches the original implementation.
    self.core = self.db.create(self.db_name.core)
    self.dots = self.db.create(self.db_name.dots)
    self.shared_resource = self.db.create(self.db_name.sres)

    self.db_dots = db_function.Database(
        host, user, password, self.db_name.dots, 0)
    self.db_shared_resource = db_function.Database(
        host, user, password, self.db_name.sres, 0)
def get_protein_feature_table_status(db_config):
    """Return the current maximum ids of the protein-feature tables.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :return: three-element list with the ``get_max_table_value`` results for
        HmmPfam (``PFAM_ID``), Tmhmm (``TMHMM_ID``) and SignalP
        (``SIGNALP_ID``), in that order.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    max_id_queries = (
        "SELECT MAX(PFAM_ID) as LAST_ID FROM HmmPfam",
        "SELECT MAX(TMHMM_ID) as LAST_ID FROM Tmhmm",
        "SELECT MAX(SIGNALP_ID) as LAST_ID FROM SignalP",
    )
    return [get_max_table_value(connection, query) for query in max_id_queries]
def find_uploaded_sam_files(db_config, path):
    """List the SAM output file names already present in SamAlignment.

    Each distinct (query taxon, query version, target taxon, target version)
    combination in the table is turned into a name of the form
    ``<q_taxon>_<q_version>.fasta__<t_taxon>_<t_version>.fasta.out``.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param path: accepted for interface compatibility; not used here.
    :return: list of SAM file names, one per result row.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    grouping_columns = (
        "query_taxon_id, query_organism_version, "
        "target_taxon_id, target_organism_version"
    )
    statement = (
        "SELECT " + grouping_columns + " "
        "FROM SamAlignment "
        "group by " + grouping_columns
    )

    fasta_name = "{}_{}.fasta".format
    return [
        '{}__{}.out'.format(
            fasta_name(row['query_taxon_id'], row['query_organism_version']),
            fasta_name(row['target_taxon_id'], row['target_organism_version']),
        )
        for row in connection.query(statement)
    ]
def get_table_status(db_config, log_filename):
    """Fetch and log the current maximum primary-key value of each GAL table.

    The maxima of NASequenceImp, NAFeatureImp, NALocation, GeneInstance and
    Protein are obtained through ``get_max_table_value`` and written to the
    log in a single message.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param log_filename: passed to ``logging_utility.logger_function``.
    :return: list ``[na_sequence, na_feature, na_location, gene_instance,
        protein]`` holding each table's own maximum id.
    """
    logger = logging_utility.logger_function(__name__, log_filename)

    db_name = db_function.DbNames(db_config.db_prefix)
    db_dots = db_function.Database(db_config.host, db_config.db_username,
                                   db_config.db_password, db_name.dots, 0)

    sql_1 = "SELECT MAX(NA_SEQUENCE_ID) as LAST_ID FROM NASequenceImp"
    sql_2 = "SELECT MAX(NA_FEATURE_ID) as LAST_ID FROM NAFeatureImp"
    sql_3 = "SELECT MAX(NA_LOCATION_ID) as LAST_ID FROM NALocation"
    sql_4 = "SELECT MAX(GENE_INSTANCE_ID) as LAST_ID FROM GeneInstance"
    sql_5 = "SELECT MAX(PROTEIN_ID) as LAST_ID FROM Protein"

    row_na_sequence = get_max_table_value(db_dots, sql_1)
    row_na_feature = get_max_table_value(db_dots, sql_2)
    row_na_location = get_max_table_value(db_dots, sql_3)
    row_gene_instance = get_max_table_value(db_dots, sql_4)
    row_protein = get_max_table_value(db_dots, sql_5)

    print_str = """Getting Max IDs of each table.. NASequenceImp ID: {} NAFeatureImp ID: {} NALocation ID: {} GeneInstance ID: {} Protein ID: {} """.format(row_na_sequence, row_na_feature, row_na_location,
               row_gene_instance, row_protein)
    logger.info(print_str)

    # Return each table's own maximum. The previous version repeated
    # row_na_feature in the last three slots, so the NALocation,
    # GeneInstance and Protein maxima were logged but never returned.
    row_list = [
        row_na_sequence,
        row_na_feature,
        row_na_location,
        row_gene_instance,
        row_protein,
    ]
    return row_list
def upload_gal_table_data(db_config, upload_dir, logger):
    """Bulk-load the five GAL upload files into their dots tables.

    File locations come from ``directory_utility.GalFileName(upload_dir)``.
    Tables are loaded in the order NASequenceImp, NAFeatureImp, NALocation,
    GeneInstance, Protein; each statement is logged at debug level
    immediately before it is executed.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :param upload_dir: directory handed to ``GalFileName``.
    :param logger: logger that receives each generated statement.
    """
    names = db_function.DbNames(db_config.db_prefix)
    file_names = directory_utility.GalFileName(upload_dir)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    statement_head = "LOAD DATA LOCAL INFILE '%s' INTO TABLE "
    statement_tail = (
        " FIELDS TERMINATED BY '\t' OPTIONALLY ENCLOSED BY '\"'"
        " LINES TERMINATED BY '\n';"
    )

    # (target table, attribute of file_names holding the upload file)
    load_order = (
        ("NASequenceImp", "NaSequenceImp"),
        ("NAFeatureImp", "NaFeatureImp"),
        ("NALocation", "NaLocation"),
        ("GeneInstance", "GeneInstance"),
        ("Protein", "Protein"),
    )

    for table, file_attribute in load_order:
        # The attribute is read only when its table's turn comes.
        upload_file = getattr(file_names, file_attribute)
        statement = (statement_head + table + statement_tail) % upload_file
        logger.debug(statement)
        connection.insert(statement)
def create_db_connection(db_config):
    """Open and return a connection to the dots database.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :return: the ``db_function.Database`` instance for the dots database.
    """
    dots_name = db_function.DbNames(db_config.db_prefix).dots
    return db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        dots_name,
        0,
    )
def create_db_connection_shared_resource(db_config):
    """Open and return a connection to the shared-resource database.

    :param db_config: object providing ``host``, ``db_username``,
        ``db_password`` and ``db_prefix``.
    :return: the ``db_function.Database`` instance for the sres database.
    """
    sres_name = db_function.DbNames(db_config.db_prefix).sres
    return db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        sres_name,
        0,
    )