def get_interpro_scan_last_row_id(db_config):
    """Look up the last row id of the InterProScan table.

    A connection to the "dots" database (name derived from
    ``db_config.db_prefix``) is opened and a ``MAX(interpro_scan_ID)`` query
    is handed to ``get_max_table_value``; its result is returned unchanged.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    max_id_query = "SELECT MAX(interpro_scan_ID) AS LAST_ID FROM InterProScan"
    return get_max_table_value(connection, max_id_query)
Esempio n. 2
0
def get_gene_name_map(db_config, taxonomy_id, org_version):
    """Map gene names to gene instance ids for one taxon/version pair.

    The query joins Protein, GeneInstance, NAFeatureImp (mRNA features only)
    and NASequenceImp, filtered by ``taxonomy_id`` and ``org_version``.

    Each result row contributes two keys pointing at the same
    ``gene_instance_id``: the protein name after ``modify_gene_name`` and the
    feature name as stored. Later rows overwrite earlier ones on key clashes.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )

    # NOTE(review): values are interpolated straight into the SQL text;
    # callers are presumably trusted -- confirm before exposing to user input.
    sql_query = """select p.name as 'name1',naf.name as 'name2', gi.gene_instance_id from 
Protein p, GeneInstance gi, NAFeatureImp naf, NASequenceImp na where 
gi.gene_instance_id = p.gene_instance_id and
naf.na_feature_id = gi.na_feature_id and
naf.feature_type='mRNA' and
na.na_sequence_id = naf.na_sequence_id and 
na.taxon_id = %s and
na.sequence_version = %s""" % (taxonomy_id, org_version)

    rows = connection.query(sql_query)

    gene_name_dct = {}
    for row in rows:
        protein_name = row['name1']
        feature_name = row['name2']
        instance_id = row['gene_instance_id']

        gene_name_dct[modify_gene_name(protein_name)] = instance_id
        gene_name_dct[feature_name] = instance_id

    return gene_name_dct
def get_sam_alignment_last_row_id(db_config):
    """Look up the last row id of the SamAlignment table.

    Runs a ``MAX(SAM_ALIGNMENT_ID)`` query against the "dots" database via
    ``get_max_table_value`` and returns that helper's result as-is.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    max_id_query = "SELECT MAX(SAM_ALIGNMENT_ID) AS LAST_ID FROM SamAlignment"
    return get_max_table_value(connection, max_id_query)
def upload_protein_feature_table_data(db_config, upload_dir_names):
    """Bulk-load the HmmPfam, SignalP and Tmhmm tables from upload files.

    ``upload_dir_names`` supplies the file paths through its ``PFam``,
    ``SignalP`` and ``TmHmm`` attributes. Each file is loaded with a
    ``LOAD DATA LOCAL INFILE`` statement (tab-separated fields, optional
    double-quote enclosure, newline-terminated rows), in that order.
    """
    names = db_function.DbNames(db_config.db_prefix)
    # NOTE(review): the trailing 1 (vs. 0 in the read-only helpers) presumably
    # enables LOCAL INFILE on the connection -- confirm in db_function.
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    pfam_path = upload_dir_names.PFam
    signalp_path = upload_dir_names.SignalP
    tmhmm_path = upload_dir_names.TmHmm

    # HmmPfam: explicit column list
    load_pfam = """LOAD DATA LOCAL INFILE '%s' INTO TABLE HmmPfam FIELDS TERMINATED BY '\t' OPTIONALLY
       ENCLOSED BY '"' LINES TERMINATED BY '\n'
       (`PFAM_ID`, `GENE_INSTANCE_ID`, `E_VALUE`, `SCORE`, `BIAS`, `ACCESSION_ID`, `DOMAIN_NAME`, `DOMAIN_DESCRIPTION`)
       """ % pfam_path
    connection.insert(load_pfam)

    # SignalP: no column list, file columns follow the table definition
    load_signalp = """LOAD DATA LOCAL INFILE '%s' INTO TABLE SignalP FIELDS TERMINATED BY '\t' OPTIONALLY
           ENCLOSED BY '"' LINES TERMINATED BY '\n'""" % signalp_path
    connection.insert(load_signalp)

    # Tmhmm: explicit column list
    load_tmhmm = """LOAD DATA LOCAL INFILE '%s' INTO TABLE Tmhmm FIELDS TERMINATED BY '\t' OPTIONALLY
           ENCLOSED BY '"' LINES TERMINATED BY '\n'
           (`TMHMM_ID`, `GENE_INSTANCE_ID`, `INSIDE`, `OUTSIDE`, `TMHELIX`)""" % tmhmm_path
    connection.insert(load_tmhmm)
Esempio n. 5
0
def create_sequence_file_for_gal_using_dct(db_config, path):
    """
    Write one fasta file per (taxon id, sequence version) found in
    NASequenceImp (rows with SEQUENCE_TYPE_ID = 1) into directory ``path``.

    Files are named ``<taxon_id>_<version>.fasta``; every record header is
    ``>`` followed by ``<na_sequence_id>_<taxon_id>_<version>``.
    Returns the list of file names (not full paths) that were written.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )

    sql_query = "SELECT * FROM NASequenceImp WHERE SEQUENCE_TYPE_ID =1 order by taxon_id, sequence_version"
    rows = connection.query(sql_query)

    # taxon id -> sequence version -> na_sequence_id -> sequence
    grouped = {}
    for row in rows:
        sequence = row['SEQUENCE']
        by_version = grouped.setdefault(row['TAXON_ID'], {})
        by_sequence_id = by_version.setdefault(row['SEQUENCE_VERSION'], {})
        by_sequence_id[row['NA_SEQUENCE_ID']] = sequence

    file_list = []
    for taxonomy_id, version_dct in grouped.items():
        for version, sequence_dct in version_dct.items():
            filename = "{}_{}.fasta".format(taxonomy_id, version)
            file_path = os.path.join(path, filename)
            file_list.append(filename)
            with open(file_path, 'w') as fh:
                fh.writelines(
                    ">{}_{}_{}\n{}\n".format(
                        na_sequence_id, taxonomy_id, version, sequence)
                    for na_sequence_id, sequence in sequence_dct.items()
                )
    return file_list
Esempio n. 6
0
def get_organism_hierarchy_map(db_config):
    """Return the taxonomic hierarchy of every row in the Organism table.

    The result is keyed by ``<TAXON_ID>_<VERSION>.fasta``; each entry holds
    GENUS, PHYLUM, FAMILY, ORDERS and CLASS. For FAMILY, ORDERS and CLASS the
    literal string ``'None'`` is converted to a real ``None``.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
    rows = connection.query("SELECT * FROM Organism")

    # Ranks whose textual 'None' marker is normalised to None.
    nullable_ranks = ('FAMILY', 'ORDERS', 'CLASS')

    hierarchy_dct = defaultdict(lambda: defaultdict())
    for row in rows:
        fasta_key = "{}_{}.fasta".format(row['TAXON_ID'], row['VERSION'])
        genus = row['GENUS']
        entry = hierarchy_dct[fasta_key]
        entry['GENUS'] = genus
        entry['PHYLUM'] = row['PHYLUM']

        for rank in nullable_ranks:
            rank_value = row[rank]
            entry[rank] = None if rank_value == 'None' else rank_value

    return hierarchy_dct
Esempio n. 7
0
def upload_interpro_data(db_config, interpro_data):
    """Bulk-load the file ``interpro_data`` into the InterProScan table.

    Issues a single ``LOAD DATA LOCAL INFILE`` statement (tab-separated
    fields, optional double-quote enclosure, newline-terminated rows)
    through the "dots" database connection.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    load_statement = """LOAD DATA LOCAL INFILE '%s' INTO TABLE InterProScan FIELDS TERMINATED BY '\t' OPTIONALLY
        ENCLOSED BY '"' LINES TERMINATED BY '\n';""" % interpro_data
    connection.insert(load_statement)
Esempio n. 8
0
def upload_sam_data(db_config, sam_data):
    """Bulk-load the file ``sam_data`` into the SamAlignment table.

    The generated ``LOAD DATA LOCAL INFILE`` statement is printed to stdout
    before it is executed on the "dots" database connection.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    load_statement = """LOAD DATA LOCAL INFILE '%s' INTO TABLE SamAlignment FIELDS TERMINATED BY '\t' OPTIONALLY
        ENCLOSED BY '"' LINES TERMINATED BY '\n';""" % sam_data
    print(load_statement)
    connection.insert(load_statement)
def get_protein_feature_table_status(db_config):
    """Return the last ids of the protein feature tables.

    The list holds the ``get_max_table_value`` results for, in order,
    HmmPfam (PFAM_ID), Tmhmm (TMHMM_ID) and SignalP (SIGNALP_ID).
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )

    # Order matters: callers receive [pfam, tmhmm, signalp].
    max_id_queries = (
        "SELECT MAX(PFAM_ID) as LAST_ID FROM HmmPfam",
        "SELECT MAX(TMHMM_ID) as LAST_ID FROM Tmhmm",
        "SELECT MAX(SIGNALP_ID) as LAST_ID FROM SignalP",
    )
    return [get_max_table_value(connection, query) for query in max_id_queries]
Esempio n. 10
0
def find_uploaded_sam_files(db_config, path):
    """List the SAM output file names already represented in SamAlignment.

    For every distinct (query taxon/version, target taxon/version)
    combination in the table, a name of the form
    ``<q_taxon>_<q_version>.fasta__<t_taxon>_<t_version>.fasta.out`` is
    produced. ``path`` is accepted for interface compatibility but unused.
    """
    names = db_function.DbNames(db_config.db_prefix)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )

    sql_query = (
        "SELECT query_taxon_id, query_organism_version, target_taxon_id, target_organism_version "
        "FROM SamAlignment "
        "group by query_taxon_id, query_organism_version, target_taxon_id, target_organism_version"
    )
    rows = connection.query(sql_query)

    return [
        '{}__{}.out'.format(
            "{}_{}.fasta".format(row['query_taxon_id'],
                                 row['query_organism_version']),
            "{}_{}.fasta".format(row['target_taxon_id'],
                                 row['target_organism_version']),
        )
        for row in rows
    ]
    def __init__(self, db_config, content_dir):
        """Set up schema paths, database handles and connections.

        After delegating ``content_dir`` to ``DefaultSchemaPath.__init__``,
        ``self.db.create`` is called for the core, dots and sres database
        names, and connections to the dots and sres databases are stored on
        ``self.db_dots`` and ``self.db_shared_resource``.
        """
        DefaultSchemaPath.__init__(self, content_dir)

        host = db_config.host
        username = db_config.db_username
        password = db_config.db_password

        self.db = db_function.DatabaseCreate(host, username, password)
        self.db_name = db_function.DbNames(db_config.db_prefix)

        # The create() calls run before any connection is opened.
        self.core = self.db.create(self.db_name.core)
        self.dots = self.db.create(self.db_name.dots)
        self.shared_resource = self.db.create(self.db_name.sres)

        self.db_dots = db_function.Database(
            host, username, password, self.db_name.dots, 0)
        self.db_shared_resource = db_function.Database(
            host, username, password, self.db_name.sres, 0)
def get_table_status(db_config, log_filename):
    """Fetch and log the last ids of the five core sequence/feature tables.

    Args:
        db_config: configuration object providing ``host``, ``db_username``,
            ``db_password`` and ``db_prefix``.
        log_filename: file name passed to ``logging_utility.logger_function``.

    Returns:
        list: the ``get_max_table_value`` results for, in order,
        NASequenceImp, NAFeatureImp, NALocation, GeneInstance and Protein.
    """
    logger = logging_utility.logger_function(__name__, log_filename)

    db_name = db_function.DbNames(db_config.db_prefix)
    db_dots = db_function.Database(db_config.host, db_config.db_username,
                                   db_config.db_password, db_name.dots, 0)

    sql_1 = "SELECT MAX(NA_SEQUENCE_ID) as LAST_ID FROM NASequenceImp"
    sql_2 = "SELECT MAX(NA_FEATURE_ID) as LAST_ID FROM NAFeatureImp"
    sql_3 = "SELECT MAX(NA_LOCATION_ID) as LAST_ID FROM NALocation"
    sql_4 = "SELECT MAX(GENE_INSTANCE_ID) as LAST_ID FROM GeneInstance"
    sql_5 = "SELECT MAX(PROTEIN_ID) as LAST_ID FROM Protein"

    row_na_sequence = get_max_table_value(db_dots, sql_1)
    row_na_feature = get_max_table_value(db_dots, sql_2)
    row_na_location = get_max_table_value(db_dots, sql_3)
    row_gene_instance = get_max_table_value(db_dots, sql_4)
    row_protein = get_max_table_value(db_dots, sql_5)

    # Fix: the list previously repeated row_na_feature in the last four slots,
    # so the NALocation, GeneInstance and Protein ids that were queried and
    # logged never reached the caller.
    row_list = [
        row_na_sequence, row_na_feature, row_na_location, row_gene_instance,
        row_protein
    ]

    print_str = """Getting Max IDs of each table..
        NASequenceImp ID: {}
        NAFeatureImp ID: {}
        NALocation ID: {}
        GeneInstance ID: {}
        Protein ID: {}
        """.format(*row_list)

    logger.info(print_str)
    return row_list
def upload_gal_table_data(db_config, upload_dir, logger):
    """Bulk-load the five core tables from the files in ``upload_dir``.

    File paths come from ``directory_utility.GalFileName(upload_dir)``.
    The tables are loaded in the order NASequenceImp, NAFeatureImp,
    NALocation, GeneInstance, Protein; every statement is written to
    ``logger`` at debug level right before it is executed.
    """
    names = db_function.DbNames(db_config.db_prefix)
    upload_files = directory_utility.GalFileName(upload_dir)
    connection = db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        1,
    )

    # NASequenceImp
    load_na_sequence = """LOAD DATA LOCAL INFILE '%s' INTO TABLE NASequenceImp FIELDS TERMINATED BY '\t' OPTIONALLY
    ENCLOSED BY '"' LINES TERMINATED BY '\n';""" % upload_files.NaSequenceImp
    logger.debug(load_na_sequence)
    connection.insert(load_na_sequence)

    # NAFeatureImp
    load_na_feature = """LOAD DATA LOCAL INFILE '%s' INTO TABLE NAFeatureImp FIELDS TERMINATED BY '\t' 
    OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n';""" % upload_files.NaFeatureImp
    logger.debug(load_na_feature)
    connection.insert(load_na_feature)

    # NALocation
    load_na_location = """LOAD DATA LOCAL INFILE '%s' INTO TABLE NALocation FIELDS TERMINATED BY '\t' 
    OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n';""" % upload_files.NaLocation
    logger.debug(load_na_location)
    connection.insert(load_na_location)

    # GeneInstance
    load_gene_instance = """LOAD DATA LOCAL INFILE '%s' INTO TABLE GeneInstance FIELDS TERMINATED BY '\t' OPTIONALLY
    ENCLOSED BY '"' LINES TERMINATED BY '\n';""" % upload_files.GeneInstance
    logger.debug(load_gene_instance)
    connection.insert(load_gene_instance)

    # Protein
    load_protein = """LOAD DATA LOCAL INFILE '%s' INTO TABLE Protein FIELDS TERMINATED BY '\t' OPTIONALLY
     ENCLOSED BY '"' LINES TERMINATED BY '\n';""" % upload_files.Protein
    logger.debug(load_protein)
    connection.insert(load_protein)
Esempio n. 14
0
def create_db_connection(db_config):
    """Open and return a connection to the "dots" database of ``db_config``."""
    names = db_function.DbNames(db_config.db_prefix)
    return db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.dots,
        0,
    )
Esempio n. 15
0
def create_db_connection_shared_resource(db_config):
    """Open and return a connection to the "sres" database of ``db_config``."""
    names = db_function.DbNames(db_config.db_prefix)
    return db_function.Database(
        db_config.host,
        db_config.db_username,
        db_config.db_password,
        names.sres,
        0,
    )