def load_table_species(conn, species_file):
    '''
    Load the table "species" of the TOA database from the species data file.
    '''

    # drop table "species" (if it exists)
    xlib.Message.print('verbose', 'Dropping the table "species" ...\n')
    xsqlite.drop_species(conn)
    xlib.Message.print('verbose', 'The table is dropped.\n')

    # create table "species"
    xlib.Message.print('verbose', 'Creating the table "species" ...\n')
    xsqlite.create_species(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of species data
    if species_file.endswith('.gz'):
        try:
            species_file_id = gzip.open(species_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', species_file)
    else:
        try:
            species_file_id = open(species_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', species_file)

    # set the pattern of the data records
    # format: "species_name";"plaza_id"
    record_pattern = re.compile(r'^"(.*)";"(.*)"$')

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # read the first record
    record = species_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records
        if not record.lstrip().startswith('#') and record.strip() != '':

            # initialize the row data dictionary
            row_dict = {}

            # extract data
            try:
                mo = record_pattern.match(record)
                row_dict['species_name'] = mo.group(1).strip().capitalize()
                row_dict['plaza_species_id'] = mo.group(2).strip().lower()
            except Exception as e:
                raise xlib.ProgramException('F006', os.path.basename(species_file), record_counter)

            # get the taxonomy dictionary of the species name from the taxonomy server
            taxonomy_dict = xlib.get_taxonomy_dict('name', row_dict['species_name'])
            if taxonomy_dict == {}:
                row_dict['family_name'] = xlib.get_na()
                row_dict['phylum_name'] = xlib.get_na()
                row_dict['kingdom_name'] = xlib.get_na()
                row_dict['superkingdom_name'] = xlib.get_na()
                row_dict['tax_id'] = xlib.get_na()
            else:
                row_dict['family_name'] = taxonomy_dict['family']['name']
                row_dict['phylum_name'] = taxonomy_dict['phylum']['name']
                row_dict['kingdom_name'] = taxonomy_dict['kingdom']['name']
                row_dict['superkingdom_name'] = taxonomy_dict['superkingdom']['name']
                row_dict['tax_id'] = taxonomy_dict['tax_id']

            # insert data into table "species"
            xsqlite.insert_species_row(conn, row_dict)
            inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rProcessed records of species file: {record_counter} - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = species_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # create the index on the table "species"
    xlib.Message.print('verbose', 'Creating the index on the table "species" ...\n')
    xsqlite.create_species_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close species file
    species_file_id.close()
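#-------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): how the record pattern
# used by load_table_species() splits one data line of the species file. The
# species name and PLAZA identifier below are made-up example values, assuming
# the '"species_name";"plaza_id"' format documented above.

def _demo_species_record_parsing():
    '''Parse a sample '"species_name";"plaza_id"' record (hypothetical values).'''
    import re
    record_pattern = re.compile(r'^"(.*)";"(.*)"$')
    record = '"arabidopsis thaliana";"ath"\n'
    mo = record_pattern.match(record.strip('\n'))
    species_name = mo.group(1).strip().capitalize()    # -> 'Arabidopsis thaliana'
    plaza_species_id = mo.group(2).strip().lower()     # -> 'ath'
    return species_name, plaza_species_id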
def load_table_blast_5(conn, dataset_id, blast_file):
    '''
    Load the table "blast" of the TOA database from a BLAST XML (outfmt 5) file.
    '''

    # check that the BLAST file is not empty
    try:
        blast_file_id = open(blast_file, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException('F001', blast_file)
    record = blast_file_id.readline()
    blast_file_id.close()
    if record == '':
        return

    # initialize the iteration counter
    iteration_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # create table "blast"
    xlib.Message.print('verbose', 'Creating the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create the index on the table "blast"
    xlib.Message.print('verbose', 'Creating the index on the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete rows from table "blast" corresponding to the dataset identification
    xlib.Message.print('verbose', 'Deleting previous rows from the table "blast" ...\n')
    xsqlite.delete_blast_rows(conn, dataset_id)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # build the complete element tree from the BLAST XML file
    tree = xml.etree.ElementTree.parse(blast_file)
    root = tree.getroot()

    # walk the tree and insert data into table "blast" for each iteration-hit-hsp
    for item_blastoutput_iterations in root.iter(tag='BlastOutput_iterations'):
        xlib.Message.print('verbose', f'-> tag: {item_blastoutput_iterations.tag} - attrib: {item_blastoutput_iterations.attrib} - text: {item_blastoutput_iterations.text}\n')

        # get items "Iteration"
        for item_iteration in item_blastoutput_iterations.iter(tag='Iteration'):
            xlib.Message.print('verbose', f'---> tag: {item_iteration.tag} - attrib: {item_iteration.attrib} - text: {item_iteration.text}\n')

            # initialize the row data dictionary
            row_dict = {}
            row_dict['dataset_id'] = dataset_id

            # add 1 to iteration counter
            iteration_counter += 1

            # initialize iteration data
            iteration_iter_num = 0
            iteration_query_def = ''

            # get data of item "Iteration_iter-num"
            for item_iteration_iter_num in item_iteration.iter(tag='Iteration_iter-num'):
                xlib.Message.print('verbose', f'-----> tag: {item_iteration_iter_num.tag} - attrib: {item_iteration_iter_num.attrib} - text: {item_iteration_iter_num.text}\n')
                row_dict['iteration_iter_num'] = int(item_iteration_iter_num.text)

            # get data of item "Iteration_query-def"
            for item_iteration_query_def in item_iteration.iter(tag='Iteration_query-def'):
                xlib.Message.print('verbose', f'-----> tag: {item_iteration_query_def.tag} - attrib: {item_iteration_query_def.attrib} - text: {item_iteration_query_def.text}\n')
                row_dict['iteration_query_def'] = item_iteration_query_def.text

            # get items "Iteration_hits"
            for item_iteration_hits in item_iteration.iter(tag='Iteration_hits'):
                xlib.Message.print('verbose', f'-----> tag: {item_iteration_hits.tag} - attrib: {item_iteration_hits.attrib} - text: {item_iteration_hits.text}\n')

                # get items "Hit"
                for item_hit in item_iteration_hits.iter(tag='Hit'):
                    xlib.Message.print('verbose', f'-------> tag: {item_hit.tag} - attrib: {item_hit.attrib} - text: {item_hit.text}\n')

                    # initialize hit data
                    row_dict['hit_num'] = 0
                    row_dict['hit_id'] = xlib.get_na()
                    row_dict['hit_def'] = xlib.get_na()
                    row_dict['hit_accession'] = xlib.get_na()

                    # get data of item "Hit_num"
                    for item_hit_num in item_hit.iter(tag='Hit_num'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_num.tag} - attrib: {item_hit_num.attrib} - text: {item_hit_num.text}\n')
                        row_dict['hit_num'] = int(item_hit_num.text)

                    # get data of item "Hit_id"
                    for item_hit_id in item_hit.iter(tag='Hit_id'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_id.tag} - attrib: {item_hit_id.attrib} - text: {item_hit_id.text}\n')
                        row_dict['hit_id'] = item_hit_id.text

                    # get data of item "Hit_def"
                    for item_hit_def in item_hit.iter(tag='Hit_def'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_def.tag} - attrib: {item_hit_def.attrib} - text: {item_hit_def.text}\n')
                        try:
                            row_dict['hit_def'] = item_hit_def.text.replace("'", '|').replace(';', ',')
                        except:
                            row_dict['hit_def'] = item_hit_def.text

                    # get data of item "Hit_accession"
                    for item_hit_accession in item_hit.iter(tag='Hit_accession'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_accession.tag} - attrib: {item_hit_accession.attrib} - text: {item_hit_accession.text}\n')
                        row_dict['hit_accession'] = item_hit_accession.text

                    # get items "Hit_hsps"
                    for item_hit_hsps in item_hit.iter(tag='Hit_hsps'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_hsps.tag} - attrib: {item_hit_hsps.attrib} - text: {item_hit_hsps.text}\n')

                        # get items "Hsp"
                        for item_hsp in item_hit.iter(tag='Hsp'):
                            xlib.Message.print('verbose', f'-----------> tag: {item_hsp.tag} - attrib: {item_hsp.attrib} - text: {item_hsp.text}\n')

                            # initialize hsp data
                            row_dict['hsp_num'] = 0
                            row_dict['hsp_evalue'] = 0.
                            row_dict['hsp_identity'] = 0
                            row_dict['hsp_positive'] = 0
                            row_dict['hsp_gaps'] = 0
                            row_dict['hsp_align_len'] = 0
                            row_dict['hsp_qseq'] = ''

                            # get data of item "Hsp_num"
                            for item_hsp_num in item_hsp.iter(tag='Hsp_num'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_num.tag} - attrib: {item_hsp_num.attrib} - text: {item_hsp_num.text}\n')
                                row_dict['hsp_num'] = int(item_hsp_num.text)

                            # get data of item "Hsp_evalue"
                            for item_hsp_evalue in item_hsp.iter(tag='Hsp_evalue'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_evalue.tag} - attrib: {item_hsp_evalue.attrib} - text: {item_hsp_evalue.text}\n')
                                row_dict['hsp_evalue'] = float(item_hsp_evalue.text)

                            # get data of item "Hsp_identity"
                            for item_hsp_identity in item_hsp.iter(tag='Hsp_identity'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_identity.tag} - attrib: {item_hsp_identity.attrib} - text: {item_hsp_identity.text}\n')
                                row_dict['hsp_identity'] = int(item_hsp_identity.text)

                            # get data of item "Hsp_positive"
                            for item_hsp_positive in item_hsp.iter(tag='Hsp_positive'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_positive.tag} - attrib: {item_hsp_positive.attrib} - text: {item_hsp_positive.text}\n')
                                row_dict['hsp_positive'] = int(item_hsp_positive.text)

                            # get data of item "Hsp_gaps"
                            for item_hsp_gaps in item_hsp.iter(tag='Hsp_gaps'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_gaps.tag} - attrib: {item_hsp_gaps.attrib} - text: {item_hsp_gaps.text}\n')
                                row_dict['hsp_gaps'] = int(item_hsp_gaps.text)

                            # get data of item "Hsp_align-len"
                            for item_hsp_align_len in item_hsp.iter(tag='Hsp_align-len'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_align_len.tag} - attrib: {item_hsp_align_len.attrib} - text: {item_hsp_align_len.text}\n')
                                row_dict['hsp_align_len'] = int(item_hsp_align_len.text)

                            # get data of item "Hsp_qseq"
                            for item_hsp_qseq in item_hsp.iter(tag='Hsp_qseq'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_qseq.tag} - attrib: {item_hsp_qseq.attrib} - text: {item_hsp_qseq.text}\n')
                                row_dict['hsp_qseq'] = item_hsp_qseq.text

                            # insert data into table "blast" (one row per HSP)
                            xsqlite.insert_blast_row(conn, row_dict)
                            inserted_row_counter += 1

            # print iteration counter
            xlib.Message.print('verbose', f'\rIterations: {iteration_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
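#-------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the Iteration -> Hit ->
# Hsp nesting that load_table_blast_5() walks in a BLAST XML (outfmt 5) report,
# reduced to a minimal inline document. The query, hit and e-value are made-up
# example values.

def _demo_blast_xml_walk():
    '''Extract (query_def, hit_def, evalue) tuples from a tiny BLAST XML string.'''
    import xml.etree.ElementTree
    xml_text = '''<BlastOutput>
      <BlastOutput_iterations>
        <Iteration>
          <Iteration_iter-num>1</Iteration_iter-num>
          <Iteration_query-def>transcript_0001</Iteration_query-def>
          <Iteration_hits>
            <Hit>
              <Hit_num>1</Hit_num>
              <Hit_def>hypothetical protein</Hit_def>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_evalue>1e-50</Hsp_evalue>
                </Hsp>
              </Hit_hsps>
            </Hit>
          </Iteration_hits>
        </Iteration>
      </BlastOutput_iterations>
    </BlastOutput>'''
    root = xml.etree.ElementTree.fromstring(xml_text)
    rows = []
    for iteration in root.iter(tag='Iteration'):
        query_def = iteration.find('Iteration_query-def').text
        for hit in iteration.iter(tag='Hit'):
            hit_def = hit.find('Hit_def').text
            for hsp in hit.iter(tag='Hsp'):
                evalue = float(hsp.find('Hsp_evalue').text)
                # one tuple per HSP, mirroring the one-row-per-HSP insert above
                rows.append((query_def, hit_def, evalue))
    return rows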
def load_table_datasets(conn, dataset_file):
    '''
    Load the table "datasets" of the TOA database from the dataset data file.
    '''

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # set the pattern of the data records
    # format: "repository_id";"dataset_id";"dataset_name";"ftp_adress"
    record_pattern = re.compile(r'^"(.*)";"(.*)";"(.*)";"(.*)"$')

    # drop table "datasets"
    xlib.Message.print('verbose', 'Dropping the table "datasets" ...\n')
    xsqlite.drop_datasets(conn)
    xlib.Message.print('verbose', 'The table is dropped.\n')

    # create table "datasets"
    xlib.Message.print('verbose', 'Creating the table "datasets" ...\n')
    xsqlite.create_datasets(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of datasets
    if dataset_file.endswith('.gz'):
        try:
            dataset_file_id = gzip.open(dataset_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', dataset_file)
    else:
        try:
            dataset_file_id = open(dataset_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', dataset_file)

    # read the first record
    record = dataset_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records
        if not record.lstrip().startswith('#') and record.strip() != '':

            # initialize the row data dictionary
            row_dict = {}

            # extract data
            try:
                mo = record_pattern.match(record)
                row_dict['dataset_id'] = mo.group(1).strip().lower()
                row_dict['dataset_name'] = mo.group(2).strip()
                row_dict['repository_id'] = mo.group(3).strip().lower()
                row_dict['ftp_adress'] = mo.group(4).strip()
            except Exception as e:
                raise xlib.ProgramException('F006', os.path.basename(dataset_file), record_counter)

            # review null values of "ftp_adress"
            if row_dict['ftp_adress'] == '':
                row_dict['ftp_adress'] = xlib.get_na()

            # insert data into table "datasets"
            xsqlite.insert_datasets_row(conn, row_dict)
            inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rProcessed records of dataset file: {record_counter} - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = dataset_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # create the index on the table "datasets"
    xlib.Message.print('verbose', 'Creating the index on the table "datasets" ...\n')
    xsqlite.create_datasets_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close dataset file
    dataset_file_id.close()
def load_genomic_features(conn, species_name, gff_file, gff_format):
    '''
    Load the table "genomic_features" of the TOA database from a GFF file.
    '''

    # create table "genomic_features" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the table "genomic_features" (if it does not exist) ...\n')
    xsqlite.create_genomic_features(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create the index on the table "genomic_features" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the index on the table "genomic_features" (if it does not exist) ...\n')
    xsqlite.create_genomic_features_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete rows from table "genomic_features" corresponding to the species identification
    xlib.Message.print('verbose', 'Deleting previous rows from the table "genomic_features" ...\n')
    xsqlite.delete_genomic_features_rows(conn, species_name)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # open the GFF file
    if gff_file.endswith('.gz'):
        try:
            gff_file_id = gzip.open(gff_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', gff_file)
    else:
        try:
            gff_file_id = open(gff_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', gff_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # initialize the first header record control
    first_header_record = True

    # read the first record
    record = gff_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process the header records
        if record.startswith('#'):
            if first_header_record == True and gff_format == 'GFF3':
                if not record.startswith('##gff-version 3'):
                    raise xlib.ProgramException('F005', os.path.basename(gff_file), 'GFF3')
            first_header_record = False

        # process data records
        else:

            # initialize the row data dictionary
            row_dict = {}
            row_dict['species_name'] = species_name

            # extract data
            # record format: seqid\tsource\ttype\tstart\tend\tscore\tstrand\tphase\tattributes
            data_list = []
            pos_1 = 0
            for pos_2 in [i for i, chr in enumerate(record) if chr == '\t']:
                data_list.append(record[pos_1:pos_2].strip())
                pos_1 = pos_2 + 1
            data_list.append(record[pos_1:].strip('\n').strip())
            try:
                row_dict['seq_id'] = data_list[0]
                row_dict['type'] = data_list[2]
                row_dict['start'] = data_list[3]
                row_dict['end'] = data_list[4]
                attributes = data_list[8]
            except Exception as e:
                raise xlib.ProgramException('F006', os.path.basename(gff_file), record_counter)

            # only the types "gene", "CDS" and "mRNA" have to be inserted into the table "genomic_features"
            if row_dict['type'] in ['gene', 'CDS', 'mRNA']:

                # check "start"
                try:
                    row_dict['start'] = int(row_dict['start'])
                except Exception as e:
                    raise xlib.ProgramException('D001', 'start', os.path.basename(gff_file), record_counter)

                # check "end"
                try:
                    row_dict['end'] = int(row_dict['end'])
                except Exception as e:
                    raise xlib.ProgramException('D001', 'end', os.path.basename(gff_file), record_counter)

                # get "gene_id" data from "attributes"
                row_dict['gene_id'] = xlib.get_na()
                literal = 'GeneID:'
                pos_1 = attributes.find(literal)
                if pos_1 > -1:
                    pos_comma = attributes.find(',', pos_1 + len(literal) + 1)
                    pos_semicolon = attributes.find(';', pos_1 + len(literal) + 1)
                    if pos_comma == -1:
                        pos_2 = pos_semicolon
                    elif pos_semicolon == -1:
                        pos_2 = pos_comma
                    else:
                        pos_2 = min(pos_comma, pos_semicolon)
                    row_dict['gene_id'] = attributes[pos_1 + len(literal):pos_2]

                # get "genbank_id" data from "attributes"
                row_dict['genbank_id'] = xlib.get_na()
                literal = 'Genbank:'
                pos_1 = attributes.find(literal)
                if pos_1 > -1:
                    pos_comma = attributes.find(',', pos_1 + len(literal) + 1)
                    pos_semicolon = attributes.find(';', pos_1 + len(literal) + 1)
                    if pos_comma == -1:
                        pos_2 = pos_semicolon
                    elif pos_semicolon == -1:
                        pos_2 = pos_comma
                    else:
                        pos_2 = min(pos_comma, pos_semicolon)
                    row_dict['genbank_id'] = attributes[pos_1 + len(literal):pos_2]

                # get "gene" data from "attributes"
                row_dict['gene'] = xlib.get_na()
                literal = 'gene='
                pos_1 = attributes.find(literal)
                if pos_1 > -1:
                    pos_2 = attributes.find(';', pos_1 + len(literal) + 1)
                    row_dict['gene'] = attributes[pos_1 + len(literal):pos_2]

                # get "protein_id" data from "attributes"
                row_dict['protein_id'] = xlib.get_na()
                literal = 'protein_id='
                pos_1 = attributes.find(literal)
                if pos_1 > -1:
                    pos_2 = attributes.find(';', pos_1 + len(literal) + 1)
                    if pos_2 > -1:
                        row_dict['protein_id'] = attributes[pos_1 + len(literal):pos_2]
                    else:
                        row_dict['protein_id'] = attributes[pos_1 + len(literal):]

                # get "transcript_id" data from "attributes"
                row_dict['transcript_id'] = xlib.get_na()
                literal = 'transcript_id='
                pos_1 = attributes.find(literal)
                if pos_1 > -1:
                    pos_2 = attributes.find(';', pos_1 + len(literal) + 1)
                    if pos_2 > -1:
                        row_dict['transcript_id'] = attributes[pos_1 + len(literal):pos_2]
                    else:
                        row_dict['transcript_id'] = attributes[pos_1 + len(literal):]

                # get "product" data from "attributes"
                row_dict['product'] = xlib.get_na()
                literal = 'product='
                pos_1 = attributes.find(literal)
                if pos_1 > -1:
                    pos_2 = attributes.find(';', pos_1 + len(literal) + 1)
                    row_dict['product'] = attributes[pos_1 + len(literal):pos_2]

                # change quotation marks, semicolons and %2C in "product"
                row_dict['product'] = row_dict['product'].replace("'", '|').replace(';', ',').replace('%2C', ',')

                # insert data into table "genomic_features"
                xsqlite.insert_genomic_features_row(conn, row_dict)
                inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rProcessed records of GFF file: {record_counter} - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = gff_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close GFF file
    gff_file_id.close()
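#-------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): how load_genomic_features()
# pulls a Dbxref value such as "GeneID:" out of the GFF attributes column, stopping
# at the first comma or semicolon after the literal. The attributes string is a
# made-up example; 'N/A' stands in for the value returned by xlib.get_na().

def _demo_gene_id_extraction():
    '''Return the GeneID embedded in a hypothetical GFF3 attributes field.'''
    attributes = 'ID=gene-LOC1;Dbxref=GeneID:112233,Genbank:XM_000001.1;gene=LOC1'
    gene_id = 'N/A'
    literal = 'GeneID:'
    pos_1 = attributes.find(literal)
    if pos_1 > -1:
        pos_comma = attributes.find(',', pos_1 + len(literal) + 1)
        pos_semicolon = attributes.find(';', pos_1 + len(literal) + 1)
        if pos_comma == -1:
            pos_2 = pos_semicolon
        elif pos_semicolon == -1:
            pos_2 = pos_comma
        else:
            pos_2 = min(pos_comma, pos_semicolon)
        gene_id = attributes[pos_1 + len(literal):pos_2]
    return gene_id    # -> '112233'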
def load_table_go_cross_references(conn, ec2go_file, kegg2go_file, metacyc2go_file, interpro2go_file):
    '''
    Load the table "go_cross_references" of the TOA database from the ec2go, kegg2go, metacyc2go and interpro2go files.
    '''

    # drop table "go_cross_references" (if it exists)
    xlib.Message.print('verbose', 'Dropping the table "go_cross_references" ...\n')
    xsqlite.drop_go_cross_references(conn)
    xlib.Message.print('verbose', 'The table is dropped.\n')

    # create table "go_cross_references"
    xlib.Message.print('verbose', 'Creating the table "go_cross_references" ...\n')
    xsqlite.create_go_cross_references(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # initialize the row data dictionary and the external database name and description
    row_dict = {}
    row_dict['external_db'] = 'ec'
    row_dict['external_desc'] = xlib.get_na()

    # open the ec2go file
    if ec2go_file.endswith('.gz'):
        try:
            ec2go_file_id = gzip.open(ec2go_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', ec2go_file)
    else:
        try:
            ec2go_file_id = open(ec2go_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', ec2go_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # read the first record
    record = ec2go_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records
        if not record.startswith('!'):

            # extract data
            # record format: ec_id > go_term ; go_id
            gt_position = record.find('>')
            semicolon_position = record.find(';')
            if gt_position == -1 or semicolon_position == -1 or gt_position > semicolon_position:
                raise xlib.ProgramException('F006', os.path.basename(ec2go_file), record_counter)
            row_dict['external_id'] = record[:gt_position].strip()
            row_dict['go_term'] = record[gt_position + 1:semicolon_position].strip()
            row_dict['go_id'] = record[semicolon_position + 1:].strip('\n').strip()

            # remove database name from text
            row_dict['go_id'] = row_dict['go_id'].replace('GO:', '')
            row_dict['go_term'] = row_dict['go_term'].replace('GO:', '')
            row_dict['external_id'] = row_dict['external_id'].replace('EC:', '')

            # change quotation marks and semicolons in "go_term"
            row_dict['go_term'] = row_dict['go_term'].replace("'", '|').replace(';', ',')

            # insert data into table "go_cross_references"
            xsqlite.insert_go_cross_references_row(conn, row_dict)
            inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rec2go file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = ec2go_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # close ec2go file
    ec2go_file_id.close()

    # initialize the row data dictionary and the external database name and description
    row_dict = {}
    row_dict['external_db'] = 'kegg'
    row_dict['external_desc'] = xlib.get_na()

    # open the kegg2go file
    if kegg2go_file.endswith('.gz'):
        try:
            kegg2go_file_id = gzip.open(kegg2go_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', kegg2go_file)
    else:
        try:
            kegg2go_file_id = open(kegg2go_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', kegg2go_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # read the first record
    record = kegg2go_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records
        if not record.startswith('!'):

            # extract data
            # record format: kegg_id > go_term ; go_id
            gt_position = record.find('>')
            semicolon_position = record.find(';')
            if gt_position == -1 or semicolon_position == -1 or gt_position > semicolon_position:
                raise xlib.ProgramException('F006', os.path.basename(kegg2go_file), record_counter)
            row_dict['external_id'] = record[:gt_position].strip()
            row_dict['go_term'] = record[gt_position + 1:semicolon_position].strip()
            row_dict['go_id'] = record[semicolon_position + 1:].strip('\n').strip()

            # remove database name from text
            row_dict['go_id'] = row_dict['go_id'].replace('GO:', '')
            row_dict['go_term'] = row_dict['go_term'].replace('GO:', '')
            row_dict['external_id'] = row_dict['external_id'].replace('KEGG:', '')

            # change quotation marks and semicolons in "go_term"
            row_dict['go_term'] = row_dict['go_term'].replace("'", '|').replace(';', ',')

            # insert data into table "go_cross_references"
            xsqlite.insert_go_cross_references_row(conn, row_dict)
            inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rkegg2go file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = kegg2go_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # close kegg2go file
    kegg2go_file_id.close()

    # initialize the row data dictionary and the external database name and description
    row_dict = {}
    row_dict['external_db'] = 'metacyc'
    row_dict['external_desc'] = xlib.get_na()

    # open the metacyc2go file
    if metacyc2go_file.endswith('.gz'):
        try:
            metacyc2go_file_id = gzip.open(metacyc2go_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', metacyc2go_file)
    else:
        try:
            metacyc2go_file_id = open(metacyc2go_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', metacyc2go_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # read the first record
    record = metacyc2go_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records
        if not record.startswith('!'):

            # extract data
            # record format: metacyc_id > go_term ; go_id
            gt_position = record.find('>')
            semicolon_position = record.find(';')
            if gt_position == -1 or semicolon_position == -1 or gt_position > semicolon_position:
                raise xlib.ProgramException('F006', os.path.basename(metacyc2go_file), record_counter)
            row_dict['external_id'] = record[:gt_position].strip()
            row_dict['go_term'] = record[gt_position + 1:semicolon_position].strip()
            row_dict['go_id'] = record[semicolon_position + 1:].strip('\n').strip()

            # remove database name from text
            row_dict['go_id'] = row_dict['go_id'].replace('GO:', '')
            row_dict['go_term'] = row_dict['go_term'].replace('GO:', '')
            row_dict['external_id'] = row_dict['external_id'].replace('MetaCyc:', '')

            # change quotation marks and semicolons in "go_term"
            row_dict['go_term'] = row_dict['go_term'].replace("'", '|').replace(';', ',')

            # insert data into table "go_cross_references"
            xsqlite.insert_go_cross_references_row(conn, row_dict)
            inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rmetacyc2go file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = metacyc2go_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # close metacyc2go file
    metacyc2go_file_id.close()

    # initialize the row data dictionary and the external database name
    row_dict = {}
    row_dict['external_db'] = 'interpro'

    # open the interpro2go file
    if interpro2go_file.endswith('.gz'):
        try:
            interpro2go_file_id = gzip.open(interpro2go_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', interpro2go_file)
    else:
        try:
            interpro2go_file_id = open(interpro2go_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', interpro2go_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # read the first record
    record = interpro2go_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records
        if not record.startswith('!'):

            # extract data
            # record format: interpro_id interpro_desc > go_term ; go_id
            first_space_position = record.find(' ')
            gt_position = record.find('>')
            semicolon_position = record.find(';')
            if first_space_position == -1 or gt_position == -1 or semicolon_position == -1 or first_space_position > gt_position or gt_position > semicolon_position:
                raise xlib.ProgramException('F006', os.path.basename(interpro2go_file), record_counter)
            row_dict['external_id'] = record[:first_space_position].strip()
            row_dict['external_desc'] = record[first_space_position + 1:gt_position].strip()
            row_dict['go_term'] = record[gt_position + 1:semicolon_position].strip()
            row_dict['go_id'] = record[semicolon_position + 1:].strip('\n').strip()

            # remove database name from text
            row_dict['go_id'] = row_dict['go_id'].replace('GO:', '')
            row_dict['go_term'] = row_dict['go_term'].replace('GO:', '')
            row_dict['external_id'] = row_dict['external_id'].replace('InterPro:', '')

            # change quotation marks and semicolons in "go_term" and "external_desc"
            row_dict['go_term'] = row_dict['go_term'].replace("'", '|').replace(';', ',')
            row_dict['external_desc'] = row_dict['external_desc'].replace("'", '|').replace(';', ',')

            # insert data into table "go_cross_references"
            xsqlite.insert_go_cross_references_row(conn, row_dict)
            inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rinterpro2go file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = interpro2go_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # close interpro2go file
    interpro2go_file_id.close()

    # create the index on the table "go_cross_references"
    xlib.Message.print('verbose', 'Creating the index on the table "go_cross_references" ...\n')
    xsqlite.create_go_cross_references_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
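#-------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the "id > term ; GO id"
# split that load_table_go_cross_references() applies to each ec2go, kegg2go and
# metacyc2go record. The record below is a made-up example in that format.

def _demo_ec2go_record_parsing():
    '''Split a hypothetical ec2go record into external_id, go_term and go_id.'''
    record = 'EC:1.1.1.1 > GO:alcohol dehydrogenase (NAD+) activity ; GO:0004022\n'
    gt_position = record.find('>')
    semicolon_position = record.find(';')
    external_id = record[:gt_position].strip().replace('EC:', '')
    go_term = record[gt_position + 1:semicolon_position].strip().replace('GO:', '')
    go_id = record[semicolon_position + 1:].strip('\n').strip().replace('GO:', '')
    # -> ('1.1.1.1', 'alcohol dehydrogenase (NAD+) activity', '0004022')
    return external_id, go_term, go_id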
def load_table_go_ontology(conn, ontology_file):
    '''
    Load the table "go_ontology" of the TOA database from the Gene Ontology OBO file.
    '''

    # drop table "go_ontology" (if it exists)
    xlib.Message.print('verbose', 'Dropping the table "go_ontology" ...\n')
    xsqlite.drop_go_ontology(conn)
    xlib.Message.print('verbose', 'The table is dropped.\n')

    # create table "go_ontology"
    xlib.Message.print('verbose', 'Creating the table "go_ontology" ...\n')
    xsqlite.create_go_ontology(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the ontology file
    if ontology_file.endswith('.gz'):
        try:
            ontology_file_id = gzip.open(ontology_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', ontology_file)
    else:
        try:
            ontology_file_id = open(ontology_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', ontology_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # read the first record
    record = ontology_file_id.readline()

    # while there are records and they are the header
    while record != '' and not record.startswith('[Term]'):

        # add 1 to record counter
        record_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rOntology file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = ontology_file_id.readline()

    # if there is a first term block
    if record.startswith('[Term]'):

        # while there are records
        while record != '':

            # add 1 to record counter
            record_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rOntology file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

            # read the next record
            record = ontology_file_id.readline()

            # initialize the row dictionary
            row_dict = {}
            row_dict['go_id'] = ''
            row_dict['go_name'] = ''
            row_dict['namespace'] = ''
            alt_id_list = []

            # while there are records and they are term details
            while record != '' and not record.startswith('[Term]'):

                # add 1 to record counter
                record_counter += 1

                # get the GO identification
                if record.startswith('id:'):
                    row_dict['go_id'] = record[len('id: GO:'):].strip()

                # get the GO name
                if record.startswith('name:'):
                    row_dict['go_name'] = record[len('name:'):].strip()
                    # change quotation marks and semicolons in "go_name"
                    row_dict['go_name'] = row_dict['go_name'].replace("'", '|').replace(';', ',')

                # get the namespace
                if record.startswith('namespace:'):
                    row_dict['namespace'] = record[len('namespace:'):].strip()
                    # change quotation marks, semicolons and underscores in "namespace"
                    row_dict['namespace'] = row_dict['namespace'].replace("'", '|').replace(';', ',').replace('_', ' ')

                # get the alternative identification
                if record.startswith('alt_id:'):
                    alt_id_list.append(record[len('alt_id: GO:'):].strip())

                # print record counter
                xlib.Message.print('verbose', f'\rOntology file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

                # read the next record
                record = ontology_file_id.readline()

                # break the loop when the typedef sections start
                if record.startswith('[Typedef]'):
                    break

            # insert data into table "go_ontology"
            xsqlite.insert_go_ontology_row(conn, row_dict)
            inserted_row_counter += 1

            # insert an additional row for each alternative identification
            for alt_id in alt_id_list:
                row_dict['go_id'] = alt_id
                xsqlite.insert_go_ontology_row(conn, row_dict)
                inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rOntology file: {record_counter} processed records - Inserted rows: {inserted_row_counter}')

            # break the loop when the typedef sections start
            if record.startswith('[Typedef]'):
                break

    xlib.Message.print('verbose', '\n')

    # close ontology file
    ontology_file_id.close()

    # create the index on the table "go_ontology"
    xlib.Message.print('verbose', 'Creating the index on the table "go_ontology" ...\n')
    xsqlite.create_go_ontology_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
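#-------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): how a single OBO [Term]
# block is reduced to the values inserted by load_table_go_ontology(). The term
# below is a made-up example; every alt_id line yields one extra row with the
# same name and namespace.

def _demo_obo_term_parsing():
    '''Parse a hypothetical OBO [Term] block into go_id, go_name, namespace and alt_ids.'''
    term_lines = [
        'id: GO:0000001\n',
        'name: mitochondrion inheritance\n',
        'namespace: biological_process\n',
        'alt_id: GO:0999999\n',
    ]
    go_id, go_name, namespace, alt_id_list = '', '', '', []
    for record in term_lines:
        if record.startswith('id:'):
            go_id = record[len('id: GO:'):].strip()
        if record.startswith('name:'):
            go_name = record[len('name:'):].strip().replace("'", '|').replace(';', ',')
        if record.startswith('namespace:'):
            namespace = record[len('namespace:'):].strip().replace("'", '|').replace(';', ',').replace('_', ' ')
        if record.startswith('alt_id:'):
            alt_id_list.append(record[len('alt_id: GO:'):].strip())
    # -> ('0000001', 'mitochondrion inheritance', 'biological process', ['0999999'])
    return go_id, go_name, namespace, alt_id_list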
def load_table_plaza_interpro(conn, dataset_id, species_id, interpro_file, plaza_species_id_list):
    '''
    Load the table "plaza_interpro" of the TOA database from a PLAZA InterPro file.
    '''

    # create table "plaza_interpro" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the table "plaza_interpro" (if it does not exist) ...\n')
    xsqlite.create_plaza_interpro(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create the index on the table "plaza_interpro" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the index on the table "plaza_interpro" (if it does not exist) ...\n')
    xsqlite.create_plaza_interpro_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete rows from table "plaza_interpro" corresponding to the dataset and species identifications
    xlib.Message.print('verbose', 'Deleting previous rows from the table "plaza_interpro" ...\n')
    xsqlite.delete_plaza_interpro_rows(conn, dataset_id, species_id)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # open the InterPro file
    if interpro_file.endswith('.gz'):
        try:
            interpro_file_id = gzip.open(interpro_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', interpro_file)
    else:
        try:
            interpro_file_id = open(interpro_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', interpro_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # initialize the header record control
    header_record = True

    # read the first record
    record = interpro_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process the header record for Gymno PLAZA 1.0
        if dataset_id in ['gymno_01'] and header_record:
            header_record = False

        # process data records
        else:

            # initialize the row data dictionary
            row_dict = {}
            row_dict['dataset_id'] = dataset_id

            # extract data of Gymno PLAZA 1.0
            if dataset_id in ['gymno_01']:

                # record format: "id";"motif_id";"species";"gene_id";"start";"stop";"score";"comment";"desc"
                data_list = []
                begin = 0
                for end in [i for i, chr in enumerate(record) if chr == ';']:
                    data_list.append(record[begin:end].strip('"'))
                    begin = end + 1
                data_list.append(record[begin:].strip('\n').strip('"'))
                try:
                    row_dict['id'] = data_list[0]
                    row_dict['motif_id'] = data_list[1]
                    row_dict['plaza_species_id'] = data_list[2]
                    row_dict['gene_id'] = data_list[3]
                    row_dict['start'] = data_list[4]
                    row_dict['stop'] = data_list[5]
                    row_dict['score'] = data_list[6]
                    comment = data_list[7]
                    row_dict['desc'] = data_list[8]
                except Exception as e:
                    raise xlib.ProgramException('F006', os.path.basename(interpro_file), record_counter)

            # extract data of Dicots PLAZA 4.0 and Monocots PLAZA 4.0 (for non-comment records)
            elif not record.startswith('#') and dataset_id in ['dicots_04', 'monocots_04']:

                # record format: gene_id\tspecies\tmotif_id\tdescription\tstart\tstop\tscore\tcomment
                data_list = []
                start = 0
                for end in [i for i, chr in enumerate(record) if chr == '\t']:
                    data_list.append(record[start:end].strip())
                    start = end + 1
                data_list.append(record[start:].strip('\n').strip())
                try:
                    row_dict['gene_id'] = data_list[0]
                    row_dict['plaza_species_id'] = data_list[1]
                    row_dict['motif_id'] = data_list[2]
                    row_dict['desc'] = data_list[3]
                    row_dict['start'] = data_list[4]
                    row_dict['stop'] = data_list[5]
                    row_dict['score'] = data_list[6]
                    comment = data_list[7]
                    row_dict['id'] = 0
                except Exception as e:
                    raise xlib.ProgramException('F006', os.path.basename(interpro_file), record_counter)

            # if the PLAZA species identification has a non-null value (for non-comment records)
            if not record.startswith('#') and row_dict['plaza_species_id'] != '':

                # check "plaza_species_id"
                if row_dict['plaza_species_id'] not in plaza_species_id_list:
                    raise xlib.ProgramException('L002', 'species', os.path.basename(interpro_file), record_counter)

                # check "start"
                try:
                    row_dict['start'] = int(row_dict['start'])
                except Exception as e:
                    raise xlib.ProgramException('D001', 'start', os.path.basename(interpro_file), record_counter)

                # check "stop"
                try:
                    row_dict['stop'] = int(row_dict['stop'])
                except Exception as e:
                    raise xlib.ProgramException('D001', 'stop', os.path.basename(interpro_file), record_counter)

                # check "score"
                try:
                    row_dict['score'] = float(row_dict['score'])
                except Exception as e:
                    raise xlib.ProgramException('D002', 'score', os.path.basename(interpro_file), record_counter)

                # split "comment" into "source" and "domain_id"
                # "comment" format: source=x,domainId=x
                pos1 = comment.find('source=')
                if pos1 >= 0:
                    pos2 = comment.find(',domainId=')
                    row_dict['source'] = comment[pos1 + 7:pos2].strip()
                    row_dict['domain_id'] = comment[pos2 + 10:].strip()
                else:
                    row_dict['source'] = xlib.get_na()
                    row_dict['domain_id'] = xlib.get_na()

                # change quotation marks and semicolons in "desc"
                row_dict['desc'] = row_dict['desc'].replace("'", '|').replace(';', ',')

                # insert data into table "plaza_interpro"
                xsqlite.insert_plaza_interpro_row(conn, row_dict)
                inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rProcessed records of InterPro file: {record_counter} - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = interpro_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close InterPro file
    interpro_file_id.close()
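#-------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the split of the PLAZA
# InterPro "comment" field into "source" and "domain_id" performed by
# load_table_plaza_interpro(). The comment value is a made-up example; 'N/A'
# stands in for the value returned by xlib.get_na().

def _demo_interpro_comment_split():
    '''Split a hypothetical "source=x,domainId=y" comment into its two parts.'''
    comment = 'source=HMMPfam,domainId=PF00069'
    pos1 = comment.find('source=')
    if pos1 >= 0:
        pos2 = comment.find(',domainId=')
        source = comment[pos1 + 7:pos2].strip()       # len('source=') == 7
        domain_id = comment[pos2 + 10:].strip()       # len(',domainId=') == 10
    else:
        source = domain_id = 'N/A'
    return source, domain_id    # -> ('HMMPfam', 'PF00069')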
def extract_gff_rnas(gff_file, gff_format, genome_file, rna_file, tvi_list):
    '''
    Extract RNA sequences from a GFF file and its corresponding genome FASTA file.
    '''

    # initialize the RNA sequences per seq_id dictionary
    rna_seq_id_dict = {}

    # open the input GFF file
    if gff_file.endswith('.gz'):
        try:
            gff_file_id = gzip.open(gff_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', gff_file)
    else:
        try:
            gff_file_id = open(gff_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', gff_file)

    # initialize counters
    record_counter = 0
    rna_counter = 0

    # read the first record
    record = gff_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records
        if not record.startswith('#'):

            # extract data
            # record format: seq_id\tsource\ttype\tstart\tend\tscore\tstrand\tphase\tattributes
            data_list = []
            pos_1 = 0
            for pos_2 in [i for i, chr in enumerate(record) if chr == '\t']:
                data_list.append(record[pos_1:pos_2].strip())
                pos_1 = pos_2 + 1
            data_list.append(record[pos_1:].strip('\n').strip())
            try:
                seq_id = data_list[0]
                type = data_list[2]
                start = int(data_list[3])
                end = int(data_list[4])
                attributes = data_list[8]
            except Exception as e:
                raise xlib.ProgramException(e, 'F009', os.path.basename(gff_file), record_counter)

            # only the type "mRNA" is considered
            if type == 'mRNA':

                # add 1 to RNA counter
                rna_counter += 1

                # get "gene" data from "attributes"
                gene = xlib.get_na()
                literal = 'gene='
                pos_1 = attributes.find(literal)
                if pos_1 > -1:
                    pos_2 = attributes.find(';', pos_1 + len(literal) + 1)
                    gene = attributes[pos_1 + len(literal):pos_2]

                # add the RNA sequence to the RNA sequences per seq_id dictionary
                if rna_seq_id_dict.get(seq_id, {}) == {}:
                    rna_seq_id_dict[seq_id] = {}
                key = f'{start}-{end}'
                rna_seq_id_dict[seq_id][key] = {'start': start, 'end': end, 'gene': gene}

        # print record counter
        xlib.Message.print('verbose', f'\rGFF file records... {record_counter:8d} - RNA seqs... {rna_counter:8d}')

        # read the next record
        record = gff_file_id.readline()

    xlib.Message.print('verbose', '\n')

    for x in tvi_list:
        xlib.Message.print('trace', f'RNA seq in {x}: {rna_seq_id_dict.get(x, {})}')

    # close the input GFF file
    gff_file_id.close()

    # open the genome file
    if genome_file.endswith('.gz'):
        try:
            genome_file_id = gzip.open(genome_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', genome_file)
    else:
        try:
            genome_file_id = open(genome_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', genome_file)

    # open the output FASTA file with the RNA sequences
    if rna_file.endswith('.gz'):
        try:
            rna_file_id = gzip.open(rna_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', rna_file)
    else:
        try:
            rna_file_id = open(rna_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', rna_file)

    # initialize record counters
    genomic_seq_counter = 0
    rna_seq_counter = 0

    # read the first record of the genome file
    record = genome_file_id.readline()

    # while there are records in the genome file
    while record != '':

        # process the head record
        if record.startswith('>'):

            # add 1 to the read sequence counter
            genomic_seq_counter += 1

            # extract the identification
            space_pos = record[1:].find(' ')
            if space_pos > -1:
                id = record[1:space_pos + 1]
            else:
                id = record[1:].strip('\n')

            # initialize the sequence
            seq = ''

            # read the next record
            record = genome_file_id.readline()

        else:

            # control the FASTA format
            raise xlib.ProgramException('F006', genome_file, 'FASTA')

        # while there are records and they are sequence
        while record != '' and not record.startswith('>'):

            # concatenate the record to the sequence
            seq += record.strip()

            # read the next record of the genome file
            record = genome_file_id.readline()

        # get the RNA sequences corresponding to this genomic sequence
        rna_dict = rna_seq_id_dict.get(id, {})

        # if there are RNAs corresponding to this genomic sequence
        if rna_dict != {}:

            # for each RNA
            for key in rna_dict.keys():

                # get the RNA data
                start = rna_dict[key]['start']
                end = rna_dict[key]['end']
                gene = rna_dict[key]['gene']

                # write the identification record
                rna_file_id.write(f'>seq_id: {id} - start: {start} - end: {end} - gene: {gene}\n')

                # write the sequence (start and end have a 1-based offset in the GFF file)
                rna_file_id.write(f'{seq[start - 1:end]}\n')

                # add 1 to the RNA sequence counter
                rna_seq_counter += 1

        # print the counters
        xlib.Message.print('verbose', f'\rGenome seqs... {genomic_seq_counter:8d} - RNA seqs... {rna_seq_counter:8d}')

    # close files
    genome_file_id.close()
    rna_file_id.close()

    # print OK message
    xlib.Message.print('verbose', f'\nThe file {os.path.basename(rna_file)} containing the FASTA RNA sequences in cDNA format is created.')
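#-------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): a minimal call to
# extract_gff_rnas(). All paths are hypothetical; tvi_list holds the sequence
# identifiers to trace and may be empty when no tracing is wanted.

def _demo_extract_gff_rnas_call():
    '''Call extract_gff_rnas() on hypothetical input and output paths.'''
    extract_gff_rnas(
        gff_file='./data/annotation.gff3',       # hypothetical GFF3 annotation
        gff_format='GFF3',
        genome_file='./data/genome.fasta.gz',    # hypothetical genome FASTA (may be gzipped)
        rna_file='./data/rnas.fasta',            # output FASTA with the extracted mRNA sequences
        tvi_list=[])                             # no sequence identifiers to trace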