def get_sample_type_dict(conn):
    '''
    Get a dictionary with the type corresponding to samples.

    A sample is classified as:
        - PROGENY: its mother_id is set (not 'NONE');
        - MOTHER: its sample_id appears as some sample's mother_id;
        - ADULT: no mother and not a mother itself.
    '''

    # initialize the dictionary
    sample_type_dict = xlib.NestedDefaultDict()

    # query
    # (standard comparison operators "=" / "<>" are used instead of the
    # SQLite-only "==" / "!=", consistently with the other queries of this module)
    sentence = '''
        SELECT a.sample_id, 'PROGENY' "type"
            FROM vcf_samples a
            WHERE a.mother_id <> 'NONE'
        UNION
        SELECT b.sample_id, 'MOTHER' "type"
            FROM vcf_samples b
            WHERE b.sample_id IN (SELECT DISTINCT c.mother_id FROM vcf_samples c)
        UNION
        SELECT d.sample_id, 'ADULT' "type"
            FROM vcf_samples d
            WHERE d.mother_id = 'NONE'
              AND d.sample_id NOT IN (SELECT DISTINCT e.mother_id FROM vcf_samples e)
        ORDER BY 1;
        '''
    try:
        rows = conn.execute(sentence)
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', sentence, conn)

    # add row data to the dictionary
    for row in rows:
        sample_type_dict[row[0]] = {'sample_id': row[0], 'type': row[1]}

    # return the dictionary
    return sample_type_dict
def get_vcf_alleles_dict(conn):
    '''
    Get a dictionary corresponding to rows of allele data.

    The returned dictionary is keyed by variant_id and then allele_id; each
    leaf holds the allele bases and its Structure allele identification.
    '''

    # initialize the dictionary
    allele_dict = xlib.NestedDefaultDict()

    # query
    # (plain string literal: the original carried an "f" prefix although there
    # are no placeholders; keyword casing follows the module's other queries)
    sentence = '''
        SELECT variant_id, allele_id, bases, structure_allele_id
            FROM vcf_alleles;
        '''
    try:
        rows = conn.execute(sentence)
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', sentence, conn)

    # add row data to the dictionary
    for row in rows:
        allele_dict[row[0]][row[1]] = {'bases': row[2], 'structure_allele_id': row[3]}

    # return the dictionary
    return allele_dict
def query_species_and_type_allele_frequencies(conn, md_symbol):
    '''
    Get a dictionary corresponding to rows of individual allele frequencies per
    species and type of variant per species (alleles with missing data and
    adult individuals are not considered).
    '''

    # initialize the dictionary
    species_and_type_allele_frequency_dict = xlib.NestedDefaultDict()

    # query
    # (the missing data symbol is bound as a SQL parameter instead of being
    # interpolated into the statement text, avoiding quoting problems;
    # "frecuency" matches the column name used by the database schema)
    sentence = '''
        SELECT a.variant_id, b.species_id, b.type, a.allele_id, SUM(a.frecuency)
            FROM vcf_samples_alleles a, vcf_samples b
            WHERE a.sample_id = b.sample_id
              AND a.allele_id <> ?
              AND b.type <> 'ADULT'
            GROUP BY a.variant_id, b.species_id, b.type, a.allele_id
            ORDER BY a.variant_id, b.species_id, b.type, a.allele_id;
        '''
    try:
        rows = conn.execute(sentence, (md_symbol,))
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', sentence, conn)

    # add row data to the dictionary
    for row in rows:
        species_and_type_allele_frequency_dict[row[0]][row[1]][row[2]][row[3]] = {'frecuency_sum': row[4]}

    # return the dictionary
    return species_and_type_allele_frequency_dict
def build_trapid_annotation(transcripts_with_go_file, transcripts_with_gf_file, transcripts_with_ko_file, annotation_file):
    '''
    Build functional annotation data corresponding to a TRAPID run.

    GO, GF and KO annotations are merged per transcript and written as a
    semicolon-separated CSV file (optionally gzip-compressed).
    '''

    # collect GO, GF and KO annotations into a single nested dictionary
    annotation_dict = xlib.NestedDefaultDict()
    annotation_dict = get_go_annotations(transcripts_with_go_file, annotation_dict)
    annotation_dict = get_gf_annotations(transcripts_with_gf_file, annotation_dict)
    annotation_dict = get_ko_annotations(transcripts_with_ko_file, annotation_dict)

    # open the output annotation file (gzip-compressed when its name ends in ".gz")
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', annotation_file)

    # write header record
    annotation_file_id.write('"transcript_id";"go_id";"go_desc";"gf_id";"ko_id";"ko_desc"\n')

    # write one record per transcript, in ascending transcript order
    for transcript_id in sorted(annotation_dict.keys()):
        transcript_data = annotation_dict.get(transcript_id, {})
        go_id = transcript_data.get('go_id', '')
        go_desc = transcript_data.get('go_desc', '')
        gf_id = transcript_data.get('gf_id', '')
        ko_id = transcript_data.get('ko_id', '')
        ko_desc = transcript_data.get('ko_desc', '')
        annotation_file_id.write(f'"{transcript_id}";"{go_id}";"{go_desc}";"{gf_id}";"{ko_id}";"{ko_desc}"\n')

    # close annotation file
    annotation_file_id.close()
def calculate_trapid_go_stats(annotation_file, go_ontology_dict, output_dir):
    '''
    Calculate GO term statistics of a TRAPID annotation file.

    Three CSV files are written in output_dir: GO term frequencies, GO terms
    per sequence and sequence identifications per GO term.
    '''

    # initialize the statistics dictionaries
    go_frequency_dict = xlib.NestedDefaultDict()
    go_per_seq_dict = xlib.NestedDefaultDict()
    seq_per_go_dict = xlib.NestedDefaultDict()

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', annotation_file)

    # initialize the annotation counter
    annotation_counter = 0

    # read the first record of the annotation file (header)
    annotation_file_id.readline()

    # read the next record of the annotation file (first data record)
    (record, key, data_dict) = xlib.read_trapid_annotation_record(annotation_file, annotation_file_id, annotation_counter)
    xlib.Message.print('trace', f'key: {key} - record: {record}')

    # while there are records
    while record != '':

        # add 1 to the annotation counter
        annotation_counter += 1

        # increase the GO term counter in the GO term frequency dictionary
        go_frequency_dict[data_dict['go']] = go_frequency_dict.get(data_dict['go'], 0) + 1

        # add the GO term identification in the GO terms per sequence dictionary
        seq_go_list = go_per_seq_dict.get(data_dict['transcript_id'], [])
        if data_dict['go'] not in seq_go_list:
            seq_go_list.append(data_dict['go'])
        go_per_seq_dict[data_dict['transcript_id']] = seq_go_list

        # add the sequence identification in the sequences per GO term dictionary
        go_seq_list = seq_per_go_dict.get(data_dict['go'], [])
        if data_dict['transcript_id'] not in go_seq_list:
            go_seq_list.append(data_dict['transcript_id'])
        seq_per_go_dict[data_dict['go']] = go_seq_list

        xlib.Message.print('verbose', f'\rAnnotation file: {annotation_counter} processed records')

        # read the next record of the annotation file
        (record, key, data_dict) = xlib.read_trapid_annotation_record(annotation_file, annotation_file_id, annotation_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

    xlib.Message.print('verbose', '\n')

    # print summary
    # (the redundant ".format()" chained onto the f-string was removed)
    xlib.Message.print('info', f'{annotation_counter} annotation records in annotation file.')

    # close annotation file
    annotation_file_id.close()

    # write the GO term frequency
    go_frequency_file = f'{output_dir}/trapid-go-frequency.csv'
    write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file)
    xlib.Message.print('info', f'The file {os.path.basename(go_frequency_file)} is generated.')

    # write GO terms per sequence
    go_per_seq_file = f'{output_dir}/trapid-go-per-seq.csv'
    write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file)
    xlib.Message.print('info', f'The file {os.path.basename(go_per_seq_file)} is generated.')

    # write sequence identifications per GO term
    seq_per_go_file = f'{output_dir}/trapid-seq-per-go.csv'
    write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file)
    xlib.Message.print('info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
def calculate_trinotate_go_stats(annotation_file, go_ontology_dict, output_dir): ''' Calculate GO term statistics of a Trinotate annotation file. ''' # initialize the dictionaries blastx_go_frequency_dict = xlib.NestedDefaultDict() blastx_go_per_seq_dict = xlib.NestedDefaultDict() blastx_seq_per_go_dict = xlib.NestedDefaultDict() blastp_go_frequency_dict = xlib.NestedDefaultDict() blastp_go_per_seq_dict = xlib.NestedDefaultDict() blastp_seq_per_go_dict = xlib.NestedDefaultDict() # open the annotation file if annotation_file.endswith('.gz'): try: annotation_file_id = gzip.open(annotation_file, mode='rt', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F002', annotation_file) else: try: annotation_file_id = open(annotation_file, mode='r', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F001', annotation_file) # initialize the annotation counter annotation_counter = 0 # read the first record of the annotation file (header) annotation_file_id.readline() # read the next record of the annotation file (first data record) (record, key, data_dict) = xlib.read_trinotate_annotation_record( annotation_file, annotation_file_id, annotation_counter) xlib.Message.print('trace', f'key: {key} - record: {record}') # while there are records while record != '': # add 1 to the annotation counter annotation_counter += 1 # extract blastx GO term identifications and add them into the GO identification list # gene_ontology_blastx format: GO:id1^aspect1^desc1`GO:id2^aspect2^desc2`...`GO:idn^aspectn^descn # aspect values: biological process (P), molecular function (F), cellular component (C) blastx_go_id_list = [] if data_dict['gene_ontology_blastx'] != '.': go_data_list = data_dict['gene_ontology_blastx'].split(r'`') for go_data in go_data_list: (go_id, go_aspect, go_des) = go_data.split('^') blastx_go_id_list.append(go_id) # increase the GO term counter in the blastx go term frequency dictionary for i in 
range(len(blastx_go_id_list)): go_id = blastx_go_id_list[i] frequency = blastx_go_frequency_dict.get(go_id, 0) blastx_go_frequency_dict[go_id] = frequency + 1 # add GO term identifications in the blastx go terms per sequence dictionary seq_go_list = blastx_go_per_seq_dict.get(data_dict['transcript_id'], []) for go_id in blastx_go_id_list: if go_id not in seq_go_list: seq_go_list.append(go_id) blastx_go_per_seq_dict[data_dict['transcript_id']] = seq_go_list # add sequence identication in the blastx sequences per GO term dictionary for go_id in blastx_go_id_list: go_seq_list = blastx_seq_per_go_dict.get(go_id, []) if data_dict['transcript_id'] not in go_seq_list: go_seq_list.append(data_dict['transcript_id']) blastx_seq_per_go_dict[go_id] = go_seq_list # extract blastp GO term identifications and add them into the GO identification list # gene_ontology_blastp format: GO:id1^aspect1^desc1`GO:id2^aspect2^desc2`...`GO:idn^aspectn^descn # aspect values: biological process (P), molecular function (F), cellular component (C) blastp_go_id_list = [] if data_dict['gene_ontology_blastp'] != '.': go_data_list = data_dict['gene_ontology_blastp'].split(r'`') for go_data in go_data_list: (go_id, go_aspect, go_des) = go_data.split('^') blastp_go_id_list.append(go_id) # increase the GO term counter in the blastp go term frequency dictionary for i in range(len(blastp_go_id_list)): go_id = blastp_go_id_list[i] frequency = blastp_go_frequency_dict.get(go_id, 0) blastp_go_frequency_dict[go_id] = frequency + 1 # add GO term identifications in the blastp go terms per sequence dictionary seq_go_list = blastp_go_per_seq_dict.get(data_dict['transcript_id'], []) for go_id in blastp_go_id_list: if go_id not in seq_go_list: seq_go_list.append(go_id) blastp_go_per_seq_dict[data_dict['transcript_id']] = seq_go_list # add sequence identication in the blastp sequences per GO term dictionary for go_id in blastp_go_id_list: go_seq_list = blastp_seq_per_go_dict.get(go_id, []) if 
data_dict['transcript_id'] not in go_seq_list: go_seq_list.append(data_dict['transcript_id']) blastp_seq_per_go_dict[go_id] = go_seq_list xlib.Message.print( 'verbose', f'\rAnnotation file: {annotation_counter} processed records') # read the next record of the annotation file (record, key, data_dict) = xlib.read_trinotate_annotation_record( annotation_file, annotation_file_id, annotation_counter) xlib.Message.print('trace', f'key: {key} - record: {record}') xlib.Message.print('verbose', '\n') # print summary xlib.Message.print( 'info', f'{annotation_counter} annotation records in annotation file.') # close annotation file annotation_file_id.close() # write the GO term frequency blastx_go_frequency_file = f'{output_dir}/trinotate-blastx-go-frequency.csv' write_go_frequency(blastx_go_frequency_dict, go_ontology_dict, blastx_go_frequency_file) xlib.Message.print( 'info', f'The file {os.path.basename(blastx_go_frequency_file)} is generated.') blastp_go_frequency_file = f'{output_dir}/trinotate-blastp-go-frequency.csv' write_go_frequency(blastp_go_frequency_dict, go_ontology_dict, blastp_go_frequency_file) xlib.Message.print( 'info', f'The file {os.path.basename(blastp_go_frequency_file)} is generated.') # write go terms per sequence blastx_go_per_seq_file = f'{output_dir}/trinotate-blastx-go-per-seq.csv' write_go_per_seq(blastx_go_per_seq_dict, go_ontology_dict, blastx_go_per_seq_file) xlib.Message.print( 'info', f'The file {os.path.basename(blastx_go_per_seq_file)} is generated.') blastp_go_per_seq_file = f'{output_dir}/trinotate-blastp-go-per-seq.csv' write_go_per_seq(blastp_go_per_seq_dict, go_ontology_dict, blastp_go_per_seq_file) xlib.Message.print( 'info', f'The file {os.path.basename(blastp_go_per_seq_file)} is generated.') # write sequence identification per go term blastx_seq_per_go_file = f'{output_dir}/trinotate-blastx-seq-per-go.csv' write_seq_per_go(blastx_seq_per_go_dict, go_ontology_dict, blastx_seq_per_go_file) xlib.Message.print( 'info', f'The file 
{os.path.basename(blastx_seq_per_go_file)} is generated.') blastp_seq_per_go_file = f'{output_dir}/trinotate-blastp-seq-per-go.csv' write_seq_per_go(blastp_seq_per_go_dict, go_ontology_dict, blastp_seq_per_go_file) xlib.Message.print( 'info', f'The file {os.path.basename(blastp_seq_per_go_file)} is generated.')
def calculate_toa_go_stats(annotation_file, go_ontology_dict, output_dir):
    '''
    Calculate GO term statistics of a TOA annotation file (only the hit/hsp
    with the lowest e-value of each sequence is considered).
    '''

    # initialize the statistics dictionaries
    go_frequency_dict = xlib.NestedDefaultDict()
    go_per_seq_dict = xlib.NestedDefaultDict()
    seq_per_go_dict = xlib.NestedDefaultDict()

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', annotation_file)

    # initialize the annotation counter
    annotation_counter = 0

    # read the first record of the annotation file (header)
    annotation_file_id.readline()

    # read the second record of the annotation file (first data record)
    (record, key, data_dict) = xlib.read_toa_annotation_record(annotation_file, annotation_file_id, 'MERGER', annotation_counter)
    xlib.Message.print('trace', f'key: {key} - record: {record}')

    # while there are records
    while record != '':

        # initialize the old sequence identification
        old_nt_seq_id = data_dict['nt_seq_id']

        # initialize the tracking of the hit/hsp with the lowest e-value
        # (min_evalue_go_id_list is pre-set so it is bound even if no hit has
        # an e-value below the initial threshold)
        min_evalue = 9999
        min_evalue_go_id_list = []

        # while there are records with the same sequence identification
        while record != '' and data_dict['nt_seq_id'] == old_nt_seq_id:

            # add 1 to the annotation counter
            annotation_counter += 1

            # extract the GO term identifications
            # go_id format: "GO:id1*id2*...*idn"
            if data_dict['go_id'] != '':
                go_id_list = data_dict['go_id'][3:].split('*')
            else:
                go_id_list = []

            # keep the GO identification list of the hit/hsp with the lowest e-value
            # (bug fix: min_evalue is now updated, so the true minimum wins
            # instead of the last hit below the initial threshold)
            hsp_evalue = float(data_dict['hsp_evalue'])
            if hsp_evalue < min_evalue:
                min_evalue = hsp_evalue
                min_evalue_go_id_list = go_id_list

            xlib.Message.print('verbose', f'\rAnnotation file: {annotation_counter} processed records')

            # read the next record of the annotation file
            (record, key, data_dict) = xlib.read_toa_annotation_record(annotation_file, annotation_file_id, 'MERGER', annotation_counter)
            xlib.Message.print('trace', f'key: {key} - record: {record}')

        # increase the GO term counters in the GO term frequency dictionary
        for short_go_id in min_evalue_go_id_list:
            go_id = f'GO:{short_go_id}'
            go_frequency_dict[go_id] = go_frequency_dict.get(go_id, 0) + 1

        # add GO term identifications in the GO terms per sequence dictionary
        seq_go_list = go_per_seq_dict.get(old_nt_seq_id, [])
        for short_go_id in min_evalue_go_id_list:
            go_id = f'GO:{short_go_id}'
            if go_id not in seq_go_list:
                seq_go_list.append(go_id)
        go_per_seq_dict[old_nt_seq_id] = seq_go_list

        # add the sequence identification in the sequences per GO term dictionary
        for short_go_id in min_evalue_go_id_list:
            go_id = f'GO:{short_go_id}'
            go_seq_list = seq_per_go_dict.get(go_id, [])
            if old_nt_seq_id not in go_seq_list:
                go_seq_list.append(old_nt_seq_id)
            seq_per_go_dict[go_id] = go_seq_list

    xlib.Message.print('verbose', '\n')

    # print summary
    xlib.Message.print('info', f'{annotation_counter} annotation records in annotation file.')

    # close annotation file
    annotation_file_id.close()

    # write the GO term frequency
    go_frequency_file = f'{output_dir}/toa-go-frequency.csv'
    write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file)
    xlib.Message.print('info', f'The file {os.path.basename(go_frequency_file)} is generated.')

    # write GO terms per sequence
    go_per_seq_file = f'{output_dir}/toa-go-per-seq.csv'
    write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file)
    xlib.Message.print('info', f'The file {os.path.basename(go_per_seq_file)} is generated.')

    # write sequence identifications per GO term
    seq_per_go_file = f'{output_dir}/toa-seq-per-go.csv'
    write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file)
    xlib.Message.print('info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
def calculate_entap_go_stats(annotation_file, go_ontology_dict, output_dir): ''' Calculate GO term statistics of a EnTAP annotation file. ''' # initialize the dictionaries go_frequency_dict = xlib.NestedDefaultDict() go_per_seq_dict = xlib.NestedDefaultDict() seq_per_go_dict = xlib.NestedDefaultDict() # open the annotation file if annotation_file.endswith('.gz'): try: annotation_file_id = gzip.open(annotation_file, mode='rt', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F002', annotation_file) else: try: annotation_file_id = open(annotation_file, mode='r', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F001', annotation_file) # initialize the annotation counter annotation_counter = 0 # read the first record of the annotation file (header) annotation_file_id.readline() # read the next record of the annotation file (first data record) (record, key, data_dict) = xlib.read_entap_annotation_record(annotation_file, annotation_file_id, annotation_counter) xlib.Message.print('trace', f'key: {key} - record: {record}') # while there are records while record != '': # add 1 to the annotation counter annotation_counter += 1 # extract biological GO term identifications and add them into the GO identification list # go_biological format: "GO:id1-desc1,GO:id2-desc2,...,GO:idn-descn" go_id_list_1 = [] if data_dict['go_biological'] != '': seq_go_data_list_1 = data_dict['go_biological'].split(',') for go_data in seq_go_data_list_1: if go_data.strip().startswith('GO:'): go_id_list_1.append(go_data[:10]) # extract cellular GO terms identifications and add them into the GO identification list # go_cellular format: "GO:id1-desc1,GO:id2-desc2,...,GO:idn-descn" go_id_list_2 = [] if data_dict['go_cellular'] != '': seq_go_data_list_2 = data_dict['go_cellular'].split(',') for go_data in seq_go_data_list_2: if go_data.strip().startswith('GO:'): go_id_list_2.append(go_data[:10]) # extract molecular GO term identifications and add 
them into the GO identification list # go_molecular format: "GO:id1-desc1,GO:id2-desc2,...,GO:idn-descn" go_id_list_3 = [] if data_dict['go_molecular'] != '': seq_go_data_list_3 = data_dict['go_molecular'].split(',') for go_data in seq_go_data_list_3: if go_data.strip().startswith('GO:'): go_id_list_3.append(go_data[:10]) # concat GO identification lists go_id_list = go_id_list_1 + go_id_list_2 + go_id_list_3 # increase the GO term counters in the go term frequency dictionary for i in range(len(go_id_list)): go_id = go_id_list[i] counter = go_frequency_dict.get(go_id, 0) go_frequency_dict[go_id] = counter + 1 # add GO term identifications in the go term per sequence dictionary seq_go_list = go_per_seq_dict.get(data_dict['query_sequence'], []) for go_id in go_id_list: if go_id not in seq_go_list: seq_go_list.append(go_id) go_per_seq_dict[data_dict['query_sequence']] = seq_go_list # add sequence identication in the sequences per GO term dictionary for go_id in go_id_list: go_seq_list = seq_per_go_dict.get(go_id, []) if data_dict['query_sequence'] not in go_seq_list: go_seq_list.append(data_dict['query_sequence']) seq_per_go_dict[go_id] = go_seq_list xlib.Message.print( 'verbose', f'\rAnnotation file: {annotation_counter} processed records') # read the next record of the annotation file (record, key, data_dict) = xlib.read_entap_annotation_record( annotation_file, annotation_file_id, annotation_counter) xlib.Message.print('trace', f'key: {key} - record: {record}') xlib.Message.print('verbose', '\n') # print summary xlib.Message.print( 'info', f'{annotation_counter} annotation records in annotation file.') # close annotation file annotation_file_id.close() # write the GO term frequency go_frequency_file = f'{output_dir}/entap-go-frequency.csv' write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file) xlib.Message.print( 'info', f'The file {os.path.basename(go_frequency_file)} is generated.') # write go terms per sequence go_per_seq_file = 
f'{output_dir}/entap-go-per-seq.csv' write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file) xlib.Message.print( 'info', f'The file {os.path.basename(go_per_seq_file)} is generated.') # write sequence identification per go term seq_per_go_file = f'{output_dir}/entap-seq-per-go.csv' write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file) xlib.Message.print( 'info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
def calculate_blast2go_go_stats(annotation_file, go_ontology_dict, output_dir): ''' Calculate GO term statistics of a Blast2GO annotation file. ''' # initialize the dictionaries go_frequency_dict = xlib.NestedDefaultDict() go_per_seq_dict = xlib.NestedDefaultDict() seq_per_go_dict = xlib.NestedDefaultDict() # open the annotation file if annotation_file.endswith('.gz'): try: annotation_file_id = gzip.open(annotation_file, mode='rt', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F002', annotation_file) else: try: annotation_file_id = open(annotation_file, mode='r', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F001', annotation_file) # initialize the annotation counter annotation_counter = 0 # read the first record of the annotation file (header) annotation_file_id.readline() # read the next record of the annotation file (first data record) (record, key, data_dict) = xlib.read_blast2go_annotation_record(annotation_file, annotation_file_id, annotation_counter) xlib.Message.print('trace', f'key: {key} - record: {record}') # while there are records while record != '': # add 1 to the annotation counter annotation_counter += 1 # extract GO term identifications and add them into the GO identification list # go_ids format: "aspect1:GO:id1;aspect2:GO:id2;...;aspectn:GO:idn" # aspect values values: P (biological process), F (molecular function), C (cellular component) go_id_list_1 = [] if data_dict['go_ids'] != '': seq_go_id_list = data_dict['go_ids'].split(';') for i in range(len(seq_go_id_list)): go_id_list_1.append(seq_go_id_list[i].strip()[2:]) # extract InterPro GO term identifications and add them into the GO identification list # interpro_go_ids format: "aspect1:GO:id1;aspect2:GO:id2;...;aspectn:GO:idn" # aspect values values: P (biological process), F (molecular function), C (cellular component) go_id_list_2 = [] if data_dict['interpro_go_ids'] not in [ '', 'no GO terms', 'no IPS match' ]: seq_go_id_list 
= data_dict['interpro_go_ids'].split(';') for i in range(len(seq_go_id_list)): go_id_list_2.append(seq_go_id_list[i].strip()[2:]) # concat GO identification lists go_id_list = go_id_list_1 + go_id_list_2 # increase the GO term counters in the go term frequency dictionary for i in range(len(go_id_list)): go_id = go_id_list[i] counter = go_frequency_dict.get(go_id, 0) go_frequency_dict[go_id] = counter + 1 # add GO term identifications in the go terms per sequence dictionary seq_go_list = go_per_seq_dict.get(data_dict['seq_name'], []) for go_id in go_id_list: if go_id not in seq_go_list: seq_go_list.append(go_id) go_per_seq_dict[data_dict['seq_name']] = seq_go_list # add sequence identication in the sequences per GO term dictionary for go_id in go_id_list: go_seq_list = seq_per_go_dict.get(go_id, []) if data_dict['seq_name'] not in go_seq_list: go_seq_list.append(data_dict['seq_name']) seq_per_go_dict[go_id] = go_seq_list xlib.Message.print( 'verbose', f'\rAnnotation file: {annotation_counter} processed records') # read the next record of the annotation file (record, key, data_dict) = xlib.read_blast2go_annotation_record( annotation_file, annotation_file_id, annotation_counter) xlib.Message.print('trace', f'key: {key} - record: {record}') xlib.Message.print('verbose', '\n') # print summary xlib.Message.print( 'info', f'{annotation_counter} annotation records in annotation file.') # close annotation file annotation_file_id.close() # write the GO term frequency go_frequency_file = f'{output_dir}/blast2go-go-frequency.csv' write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file) xlib.Message.print( 'info', f'The file {os.path.basename(go_frequency_file)} is generated.') # write go terms per sequence go_per_seq_file = f'{output_dir}/blast2go-go-per-seq.csv' write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file) xlib.Message.print( 'info', f'The file {os.path.basename(go_per_seq_file)} is generated.') # write sequence identification per go 
term seq_per_go_file = f'{output_dir}/blast2go-seq-per-go.csv' write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file) xlib.Message.print( 'info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
def query_data(conn, file_name, sp1_id, sp2_id, hybrid_id, imputed_md_id, max_separation, output_dir, tsi_list):
    '''
    List data of variants and alleles and variant identifications to the scenario X.

    Three files are written into output_dir:
      - {file_name}-data2scenarioX-variants.csv: one row per variant.
      - {file_name}-data2scenarioX-alleles.csv: one row per variant allele with
        relative frequencies per species and per species/type (mothers/progenies).
      - {file_name}-data2scenarioX-selected_ids.txt: variant identifications
        selected for the scenario X.

    Parameters:
        conn: SQLite connection to the database holding the VCF-derived tables.
        file_name: base name used to build the three output file names.
        sp1_id, sp2_id, hybrid_id: identifiers of both species and their hybrid.
        imputed_md_id: allele identifier that marks imputed missing data.
        max_separation: maximum separation (bp) between variants assigned to the
            same intergenic fragment.
        output_dir: directory where the output files are created.
        tsi_list: sequence identifications whose processing is traced.

    Raises xlib.ProgramException with code 'B003' when a required table is not
    loaded and 'F003' when an output file cannot be opened.
    '''

    # check if the table "gene_info" is loaded
    xlib.Message.print('verbose', 'Checking the table "gene_info" ...\n')
    if xsqlite.check_gene_info(conn) == 0:
        raise xlib.ProgramException('', 'B003', 'gene_info')
    xlib.Message.print('verbose', 'The table is loaded.\n')

    # check if the table "genomic_features" is loaded
    xlib.Message.print('verbose', 'Checking the table "genomic_features" ...\n')
    if xsqlite.check_genomic_features(conn) == 0:
        raise xlib.ProgramException('', 'B003', 'genomic_features')
    xlib.Message.print('verbose', 'The table is loaded.\n')

    # check if the table "vcf_samples" is loaded
    xlib.Message.print('verbose', 'Checking the table "vcf_samples" ...\n')
    if xsqlite.check_vcf_samples(conn) == 0:
        raise xlib.ProgramException('', 'B003', 'vcf_samples')
    xlib.Message.print('verbose', 'The table is loaded.\n')

    # check if the table "vcf_variants" is loaded
    xlib.Message.print('verbose', 'Checking the table "vcf_variants" ...\n')
    if xsqlite.check_vcf_variants(conn) == 0:
        raise xlib.ProgramException('', 'B003', 'vcf_variants')
    xlib.Message.print('verbose', 'The table is loaded.\n')

    # check if the table "vcf_alleles" is loaded
    xlib.Message.print('verbose', 'Checking the table "vcf_alleles" ...\n')
    if xsqlite.check_vcf_alleles(conn) == 0:
        raise xlib.ProgramException('', 'B003', 'vcf_alleles')
    xlib.Message.print('verbose', 'The table is loaded.\n')

    # check if the table "vcf_samples_alleles" is loaded
    xlib.Message.print('verbose', 'Checking the table "vcf_samples_alleles" ...\n')
    if xsqlite.check_vcf_samples_alleles(conn) == 0:
        raise xlib.ProgramException('', 'B003', 'vcf_samples_alleles')
    xlib.Message.print('verbose', 'The table is loaded.\n')

    # get the variant dictionary
    # NOTE(review): variant_dict is walked below with an integer index i and
    # len(), and consecutive items can share a variant_id — presumably it is an
    # int-keyed dict ordered by variant; confirm against xsqlite.query_variants
    xlib.Message.print('verbose', 'Getting variant data ...\n')
    variant_dict = xsqlite.query_variants(conn)
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the allele dictionary
    xlib.Message.print('verbose', 'Getting allele data ...\n')
    allele_dict = xsqlite.get_vcf_alleles_dict(conn)
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the imputated allele dictionary
    xlib.Message.print('verbose', 'Getting imputated allele data ...\n')
    imputed_allele_dict = xsqlite.query_imputed_alleles(conn, imputed_md_id)
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the dictionary of allele frecuency per species
    xlib.Message.print('verbose', 'Getting the dictionary of allele frecuency per species ...\n')
    species_allele_frequency_dict = xsqlite.query_species_allele_frequencies(conn, xlib.get_md_symbol())
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the dictionary of allele frecuency per species and type
    # (the query excludes missing-data alleles and ADULT individuals)
    xlib.Message.print('verbose', 'Getting the dictionary of allele frecuency per species and type ...\n')
    species_and_type_allele_frequency_dict = xsqlite.query_species_and_type_allele_frequencies(conn, xlib.get_md_symbol())
    xlib.Message.print('verbose', 'Data are got.\n')

    #-------------------------------------------------------------------------------
    # build the intergenic variant dictionary
    #-------------------------------------------------------------------------------

    xlib.Message.print('verbose', 'Building the intergenic variant dictionary ...\n')

    # initialize intergenic variant dictionary
    intergenic_variant_dict = xlib.NestedDefaultDict()

    # initialize the current item
    i = 0

    # while there are items in the variant dictionary
    while i < len(variant_dict):

        # initialize data
        # NOTE(review): found_gene and found_exon are set here but never used in
        # this loop (they matter in the file-writing loops below)
        variant_id = variant_dict[i]['variant_id']
        seq_id = ''
        position = 0
        found_gene = False
        found_exon = False

        # while there are items in the variant dictionary and the items have the same variant identification
        while i < len(variant_dict) and variant_id == variant_dict[i]['variant_id']:

            # save data
            variant_id = variant_dict[i]['variant_id']
            seq_id = variant_dict[i]['seq_id']
            position = variant_dict[i]['position']
            gene = variant_dict[i]['gene']

            # next item
            i += 1

        # add item to the intergenic variant dictionary
        # (only variants not located in any gene)
        if gene == 'N/A':
            intergenic_variant_dict[seq_id][position] = {'variant_id': variant_id}

    xlib.Message.print('verbose', 'Dictionary is built.\n')

    #-------------------------------------------------------------------------------
    # build the intergenic fragment dictionary
    #-------------------------------------------------------------------------------

    xlib.Message.print('verbose', 'Building the intergenic fragment dictionary ...\n')

    # initialize the fragment dictionary
    fragment_dict = xlib.NestedDefaultDict()

    # for each sequence identification in the intergenic variant dictionary
    for seq_id in sorted(intergenic_variant_dict.keys()):

        if seq_id in tsi_list:
            xlib.Message.print('trace', f'seq_id: {seq_id}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'intergenic_variant_dict[seq_id]: {intergenic_variant_dict[seq_id]}')

        # initialize control variable for the first variant in the sequence
        first_variant = True

        # initialize the fragment number
        fragment_num = 0

        # for each position in the sequence
        for position in sorted(intergenic_variant_dict[seq_id]):

            # first variant in the sequence
            if first_variant:
                first_variant = False
                variant_id = intergenic_variant_dict[seq_id][position]['variant_id']
                fragment_id = f'{seq_id}-F{fragment_num:03d}'
                fragment_dict[variant_id] = {'fragment_id': fragment_id}
                old_position = position

            # the following variants in the sequence
            else:

                # when the position is less or equal to the maximum separation between variants of the same intergenic fragment
                if position <= old_position + max_separation:
                    variant_id = intergenic_variant_dict[seq_id][position]['variant_id']
                    fragment_id = f'{seq_id}-F{fragment_num:03d}'
                    fragment_dict[variant_id] = {'fragment_id': fragment_id}

                # when the position is greater to the maximum separation between variants of the same intergenic fragment
                else:
                    fragment_num += 1
                    variant_id = intergenic_variant_dict[seq_id][position]['variant_id']
                    fragment_id = f'{seq_id}-F{fragment_num:03d}'
                    fragment_dict[variant_id] = {'fragment_id': fragment_id}

                # separation is measured between consecutive variants
                old_position = position

    xlib.Message.print('verbose', 'Dictionary is built.\n')

    #-------------------------------------------------------------------------------
    # Create the variant file
    #-------------------------------------------------------------------------------

    xlib.Message.print('verbose', 'Writting the variant file ...\n')

    # initialize the imputation dictionary: gene/fragment -> variant -> imputations flag
    imputation_dict = xlib.NestedDefaultDict()

    # open the output variant file
    variant_file = f'{output_dir}/{file_name}-data2scenarioX-variants.csv'
    try:
        variant_file_id = open(variant_file, mode='w', encoding='iso-8859-1', newline='\n')
    except Exception as e:
        raise xlib.ProgramException(e, 'F003', variant_file)

    # write head record of the output variant file
    variant_file_id.write('"variant_id";"seq_id";"position";"genomic_zone";"gene/fragment";"description";"chromosome_id";"imputations"\n')

    # initialize the current item
    i = 0

    # while there are items in the variant dictionary
    while i < len(variant_dict):

        # initialize data
        variant_id = variant_dict[i]['variant_id']
        found_gene = False
        found_exon = False

        # while there are items in the variant dictionary and the items have the same variant identification
        while i < len(variant_dict) and variant_id == variant_dict[i]['variant_id']:

            # save data
            # NOTE(review): 'start' is saved but never used in this loop;
            # `== None` comparisons would idiomatically be `is None`
            seq_id = variant_dict[i]['seq_id']
            position = variant_dict[i]['position']
            start = variant_dict[i]['start']
            end = variant_dict[i]['end']
            gene = variant_dict[i]['gene']
            description = variant_dict[i]['description']
            if description == None:
                description = 'N/A'
            chromosome_id = variant_dict[i]['chromosome_id']
            if chromosome_id == None:
                chromosome_id = 'N/A'
            if variant_dict[i]['gene'] != 'N/A':
                gene_or_fragment = variant_dict[i]['gene']
            else:
                gene_or_fragment = fragment_dict[variant_id]['fragment_id']
            if variant_dict[i]['type'] in ['gene', 'pseudogene']:
                found_gene = True
            elif variant_dict[i]['type'] == 'exon':
                found_exon = True

            # next item
            i += 1

        # set genomic_zone from the genomic features seen for this variant
        if end == 0:
            genomic_zone = 'N/A'
        elif not found_gene:
            genomic_zone = 'intergenic'
        elif found_exon:
            genomic_zone = 'exonic'
        else:
            genomic_zone = 'intronic'

        # set imputations ('Y' when the variant has imputed alleles)
        if imputed_allele_dict.get(variant_id, 0) == 0:
            imputations = 'N'
        else:
            imputations = 'Y'

        # add variant dictionary to the gene dictionary
        imputation_dict[gene_or_fragment][variant_id] = {'imputations': imputations}

        # write data
        variant_file_id.write(f'"{variant_id}";"{seq_id}";{position};"{genomic_zone}";"{gene_or_fragment}";"{description}";"{chromosome_id}";"{imputations}"\n')

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(variant_file)} containing variant data is created.')

    # close the output variant file
    variant_file_id.close()

    #-------------------------------------------------------------------------------
    # Create the allele file
    #-------------------------------------------------------------------------------

    xlib.Message.print('verbose', 'Writting the allele file ...\n')

    # open the output allele file
    allele_file = f'{output_dir}/{file_name}-data2scenarioX-alleles.csv'
    try:
        allele_file_id = open(allele_file, mode='w', encoding='iso-8859-1', newline='\n')
    except Exception as e:
        raise xlib.ProgramException(e, 'F003', allele_file)

    # write head record of the output allele file
    allele_file_id.write(f'"variant_id";"seq_id";"position";"genomic_zone";"gene/fragment";"description";"chromosome_id";"imputations";"allele_id";"bases";"{sp1_id}_frequency";"{sp2_id}_frequency";"{hybrid_id}_frequency";"{sp1_id}_mothers_frequency";"{sp2_id}_mothers_frequency";"{hybrid_id}_mothers_frequency";"{sp1_id}_progenies_frequency";"{sp2_id}_progenies_frequency";"{hybrid_id}_progenies_frequency"\n')

    # initialize the current item
    i = 0

    # while there are items in the variant dictionary
    while i < len(variant_dict):

        # initialize data
        variant_id = variant_dict[i]['variant_id']
        found_gene = False
        found_exon = False

        # NOTE(review): at this point seq_id still holds the value left over
        # from the previous iteration (or from the variant-file loop above), so
        # this trace check uses a stale sequence identification — confirm intent
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'variant_id: {variant_id}')

        # while there are items in the variant dictionary and the items have the same variant identification
        while i < len(variant_dict) and variant_id == variant_dict[i]['variant_id']:

            # save data
            seq_id = variant_dict[i]['seq_id']
            position = variant_dict[i]['position']
            start = variant_dict[i]['start']
            end = variant_dict[i]['end']
            gene = variant_dict[i]['gene']
            description = variant_dict[i]['description']
            if description == None:
                description = 'N/A'
            chromosome_id = variant_dict[i]['chromosome_id']
            if chromosome_id == None:
                chromosome_id = 'N/A'
            if variant_dict[i]['gene'] != 'N/A':
                gene_or_fragment = variant_dict[i]['gene']
            else:
                gene_or_fragment = fragment_dict[variant_id]['fragment_id']
            if variant_dict[i]['type'] in ['gene', 'pseudogene']:
                found_gene = True
            elif variant_dict[i]['type'] == 'exon':
                found_exon = True

            # next item
            i += 1

        # set genomic_zone
        if end == 0:
            genomic_zone = 'N/A'
        elif not found_gene:
            genomic_zone = 'intergenic'
        elif found_exon:
            genomic_zone = 'exonic'
        else:
            genomic_zone = 'intronic'

        # set imputations
        if imputed_allele_dict.get(variant_id, 0) == 0:
            imputations = 'N'
        else:
            imputations = 'Y'

        # build the frecuency summation dictionary of every species per allele
        species_frecuency_summation_dict = {}
        for species_id in species_allele_frequency_dict[variant_id].keys():
            for allele_id in species_allele_frequency_dict[variant_id][species_id].keys():
                allele_data_dict = species_frecuency_summation_dict.get(allele_id, {sp1_id: 0, sp2_id: 0, hybrid_id: 0})
                allele_data_dict[species_id] += species_allele_frequency_dict[variant_id][species_id][allele_id]['frecuency_sum']
                species_frecuency_summation_dict[allele_id] = allele_data_dict
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'species_frecuency_summation_dict: {species_frecuency_summation_dict}')

        # build the frecuency summation dictionary of every species and type per allele
        # from species_and_type_allele_frequency_dict
        # NOTE(review): 'type' shadows the builtin; the query only yields
        # 'MOTHER'/'PROGENY' types (ADULT is excluded), so the if/elif covers
        # every case and data_id is always assigned
        species_and_type_frecuency_summation_dict = {}
        for species_id in species_and_type_allele_frequency_dict[variant_id].keys():
            for type in species_and_type_allele_frequency_dict[variant_id][species_id].keys():
                for allele_id in species_and_type_allele_frequency_dict[variant_id][species_id][type].keys():
                    allele_data_dict = species_and_type_frecuency_summation_dict.get(allele_id, {f'{sp1_id}_mothers': 0, f'{sp2_id}_mothers': 0, f'{hybrid_id}_mothers': 0, f'{sp1_id}_progenies': 0, f'{sp2_id}_progenies': 0, f'{hybrid_id}_progenies': 0})
                    if type == 'MOTHER':
                        data_id = f'{species_id}_mothers'
                    elif type == 'PROGENY':
                        data_id = f'{species_id}_progenies'
                    allele_data_dict[data_id] += species_and_type_allele_frequency_dict[variant_id][species_id][type][allele_id]['frecuency_sum']
                    species_and_type_frecuency_summation_dict[allele_id] = allele_data_dict
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'species_and_type_frecuency_summation_dict: {species_and_type_frecuency_summation_dict}')

        # calculate the allelle frecuency totals per species
        allele_frecuency_total_sp1 = 0
        allele_frecuency_total_sp2 = 0
        allele_frecuency_total_hybrid = 0
        for allele_id in species_frecuency_summation_dict.keys():
            allele_frecuency_total_sp1 += species_frecuency_summation_dict.get(allele_id, {}).get(sp1_id, 0)
            allele_frecuency_total_sp2 += species_frecuency_summation_dict.get(allele_id, {}).get(sp2_id, 0)
            allele_frecuency_total_hybrid += species_frecuency_summation_dict.get(allele_id, {}).get(hybrid_id, 0)
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_sp1: {allele_frecuency_total_sp1}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_sp2: {allele_frecuency_total_sp2}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_hybrid: {allele_frecuency_total_hybrid}')

        # calculate the allelle frecuency totals per species and type
        # NOTE(review): this loop iterates the keys of the per-species dict, not
        # of species_and_type_frecuency_summation_dict — presumably both hold
        # the same allele keys (both queries exclude missing data); confirm
        allele_frecuency_total_sp1_mothers = 0
        allele_frecuency_total_sp2_mothers = 0
        allele_frecuency_total_hybrid_mothers = 0
        allele_frecuency_total_sp1_progenies = 0
        allele_frecuency_total_sp2_progenies = 0
        allele_frecuency_total_hybrid_progenies = 0
        for allele_id in species_frecuency_summation_dict.keys():
            allele_frecuency_total_sp1_mothers += species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp1_id}_mothers', 0)
            allele_frecuency_total_sp2_mothers += species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp2_id}_mothers', 0)
            allele_frecuency_total_hybrid_mothers += species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{hybrid_id}_mothers', 0)
            allele_frecuency_total_sp1_progenies += species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp1_id}_progenies', 0)
            allele_frecuency_total_sp2_progenies += species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp2_id}_progenies', 0)
            allele_frecuency_total_hybrid_progenies += species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{hybrid_id}_progenies', 0)
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_sp1_mothers: {allele_frecuency_total_sp1_mothers}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_sp2_mothers: {allele_frecuency_total_sp2_mothers}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_hybrid_mothers: {allele_frecuency_total_hybrid_mothers}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_sp1_progenies: {allele_frecuency_total_sp1_progenies}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_sp2_progenies: {allele_frecuency_total_sp2_progenies}')
        if seq_id in tsi_list:
            xlib.Message.print('trace', f'allele_frecuency_total_hybrid_progenies: {allele_frecuency_total_hybrid_progenies}')

        # for each allele of the variant
        for allele_id in species_frecuency_summation_dict.keys():

            # calculate the relative frequency of each specie per allele
            # NOTE(review): the bare excepts turn a zero total (division by
            # zero) into 'N/A', but would also hide any other error
            try:
                sp1_frequency = species_frecuency_summation_dict[allele_id][sp1_id] / allele_frecuency_total_sp1
            except:
                sp1_frequency = 'N/A'
            try:
                sp2_frequency = species_frecuency_summation_dict[allele_id][sp2_id] / allele_frecuency_total_sp2
            except:
                sp2_frequency = 'N/A'
            try:
                hybrid_frequency = species_frecuency_summation_dict[allele_id][hybrid_id] / allele_frecuency_total_hybrid
            except:
                hybrid_frequency = 'N/A'

            # calculate the relative frequency of each specie and type per allele
            try:
                sp1_mothers_frequency = species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp1_id}_mothers', 0) / allele_frecuency_total_sp1_mothers
            except:
                sp1_mothers_frequency = 'N/A'
            try:
                sp2_mothers_frequency = species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp2_id}_mothers', 0) / allele_frecuency_total_sp2_mothers
            except:
                sp2_mothers_frequency = 'N/A'
            try:
                hybrid_mothers_frequency = species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{hybrid_id}_mothers', 0) / allele_frecuency_total_hybrid_mothers
            except:
                hybrid_mothers_frequency = 'N/A'
            try:
                sp1_progenies_frequency = species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp1_id}_progenies', 0) / allele_frecuency_total_sp1_progenies
            except:
                sp1_progenies_frequency = 'N/A'
            try:
                sp2_progenies_frequency = species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{sp2_id}_progenies', 0) / allele_frecuency_total_sp2_progenies
            except:
                sp2_progenies_frequency = 'N/A'
            try:
                hybrid_progenies_frequency = species_and_type_frecuency_summation_dict.get(allele_id, {}).get(f'{hybrid_id}_progenies', 0) / allele_frecuency_total_hybrid_progenies
            except:
                hybrid_progenies_frequency = 'N/A'

            # get bases sequence
            bases = allele_dict[variant_id][allele_id]['bases']

            # write data variant identification
            allele_file_id.write(f'"{variant_id}";"{seq_id}";{position};"{genomic_zone}";"{gene_or_fragment}";"{description}";"{chromosome_id}";"{imputations}";"{allele_id}";"{bases}";"{sp1_frequency}";"{sp2_frequency}";"{hybrid_frequency}";"{sp1_mothers_frequency}";"{sp2_mothers_frequency}";"{hybrid_mothers_frequency}";"{sp1_progenies_frequency}";"{sp2_progenies_frequency}";"{hybrid_progenies_frequency}"\n')

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(allele_file)} containing allele data is created.')

    # close the output allele file
    allele_file_id.close()

    #-------------------------------------------------------------------------------
    # Create the selected variant id file corresponding to the scenario X
    #-------------------------------------------------------------------------------

    xlib.Message.print('verbose', 'Writting the file with selected variant id corresponding to the scenario X...\n')

    # open the output file with variant ids corresponding to the scenario X
    selected_id_file = f'{output_dir}/{file_name}-data2scenarioX-selected_ids.txt'
    try:
        selected_id_file_id = open(selected_id_file, mode='w', encoding='iso-8859-1', newline='\n')
    except Exception as e:
        raise xlib.ProgramException(e, 'F003', selected_id_file)

    # for every gene/fragment write the variant identifications corresponding to scenario X
    for gene_or_fragment in sorted(imputation_dict.keys()):

        # initialize control variables
        imputations_with_y = False
        imputations_with_n = False

        # check imputations of variants of this gene/fragment
        for variant_id in imputation_dict[gene_or_fragment].keys():
            if imputation_dict[gene_or_fragment][variant_id]['imputations'] == 'Y':
                imputations_with_y = True
            else:
                imputations_with_n = True

        # write variant identitications: all of them when every variant of the
        # gene/fragment is imputed, otherwise only the non-imputed ones
        # (operator precedence: (Y and not N) or variant-is-'N')
        for variant_id in (imputation_dict[gene_or_fragment].keys()):
            if imputations_with_y == True and imputations_with_n == False or imputation_dict[gene_or_fragment][variant_id]['imputations'] == 'N':
                selected_id_file_id.write(f'{variant_id}\n')

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(selected_id_file)} containing selected ids is created.')

    # close the output allele file
    selected_id_file_id.close()
def filter_ssrs(cos_file, ssr_file, output_file):
    '''
    Filter a SSR file selecting the SSRs whose coordinates overlap a COS region
    of the same contig.

    Parameters:
        cos_file: FASTA file of COS sequences (optionally gzipped); each head is
            colon-separated and holds the contig name (field 2) and the COS
            coordinates (field 3, formatted "start-end").
        ssr_file: tab-separated SSR file (optionally gzipped) with one head
            line; each data record holds the contig name (column 0, up to the
            first space) and the SSR start/end (columns 2 and 3).
        output_file: path of the filtered SSR file (gzipped when it ends in '.gz').

    Raises xlib.ProgramException on open errors ('F001'/'F002') and on format
    errors ('F006').
    '''

    # initialize the dictionary: contig name -> COS "start-end" -> coordinates
    contig_dict = xlib.NestedDefaultDict()

    # open the COS file
    if cos_file.endswith('.gz'):
        try:
            cos_file_id = gzip.open(cos_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', cos_file)
    else:
        try:
            cos_file_id = open(cos_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', cos_file)

    # initialize counters
    cos_record_counter = 0
    cos_seq_counter = 0

    # read the first record of COS file
    record = cos_file_id.readline()
    cos_record_counter += 1

    # while there are records in COS file
    while record != '':

        # process the head record
        if record.startswith('>'):

            # add 1 to the COS sequences counter
            cos_seq_counter += 1

            # split the head data by colons (record[1:] skips the '>')
            head_data = record[1:].strip('\n')
            head_data_list = []
            pos_1 = 0
            for pos_2 in [i for i, char in enumerate(head_data) if char == ':']:
                head_data_list.append(head_data[pos_1:pos_2])
                pos_1 = pos_2 + 1
            head_data_list.append(head_data[pos_1:].strip('\n'))

            # extract the contig name and the COS coordinates
            try:
                contig_name = head_data_list[2]
                cos_start_end = head_data_list[3]
                pos_sep = head_data_list[3].find('-')
                cos_start = int(head_data_list[3][:pos_sep])
                cos_end = int(head_data_list[3][pos_sep + 1:])
            except Exception as e:
                raise xlib.ProgramException(e, 'F006', os.path.basename(cos_file), cos_record_counter)

            # initialize the COS sequence
            cos_seq = ''

            # read the next record
            record = cos_file_id.readline()
            cos_record_counter += 1

        # control the FASTA format
        else:
            raise xlib.ProgramException('', 'F006', cos_file, 'FASTA')

        # while there are records and they are COS sequence
        while record != '' and not record.startswith('>'):

            # concatenate the record to the COS sequence
            cos_seq += record.strip()

            # read the next record of COS file
            record = cos_file_id.readline()
            cos_record_counter += 1

        # add the COS coordinates to the contig dictionary
        # (the sequence itself is not needed for the overlap test)
        contig_dict[contig_name][cos_start_end] = {'cos_start': cos_start, 'cos_end': cos_end}

        # print the COS sequence counter
        xlib.Message.print('verbose', f'\rProcessed COS seqs ... {cos_seq_counter:8d}')

    xlib.Message.print('verbose', '\n')

    # close files
    cos_file_id.close()

    # open the input SSR file
    if ssr_file.endswith('.gz'):
        try:
            ssr_file_id = gzip.open(ssr_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', ssr_file)
    else:
        try:
            ssr_file_id = open(ssr_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', ssr_file)

    # open the output SSR file
    if output_file.endswith('.gz'):
        try:
            output_file_id = gzip.open(output_file, mode='wt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', output_file)
    else:
        try:
            output_file_id = open(output_file, mode='w', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', output_file)

    # initialize counters
    input_record_counter = 0
    output_record_counter = 0

    # read the first record of SSR file
    record = ssr_file_id.readline()
    input_record_counter += 1

    # while there are records in input SSR file
    while record != '':

        # the head record is copied verbatim
        if input_record_counter == 1:
            output_file_id.write(record)
            output_record_counter += 1

        # when record is not the head
        else:

            # split the SSR data by tabs
            # FIX: the original sliced record[1:] (copy-pasted from the FASTA
            # head parsing above), silently dropping the first character of
            # every record, so contig names never matched the COS dictionary
            ssr_data = record.strip('\n')
            ssr_data_list = []
            pos_1 = 0
            for pos_2 in [i for i, char in enumerate(ssr_data) if char == '\t']:
                ssr_data_list.append(ssr_data[pos_1:pos_2])
                pos_1 = pos_2 + 1
            ssr_data_list.append(ssr_data[pos_1:].strip('\n'))

            # extract the contig name and the SSR coordinates
            try:
                # FIX: split keeps the whole name when there is no space (the
                # original find-based slice dropped the last character then)
                contig_name = ssr_data_list[0].split(' ')[0]
                ssr_start = int(ssr_data_list[2])
                ssr_end = int(ssr_data_list[3])
            except Exception as e:
                # FIX: report the SSR file and its record counter (the original
                # reported the COS file and counter by copy-paste)
                raise xlib.ProgramException(e, 'F006', os.path.basename(ssr_file), input_record_counter)

            # get COS data of the contig (empty when the contig has no COS)
            contig_cos_dict = contig_dict[contig_name]

            # write the SSR when it overlaps any COS of its contig
            for cos_data_dict in contig_cos_dict.values():
                cos_start = cos_data_dict['cos_start']
                cos_end = cos_data_dict['cos_end']
                if ssr_start <= cos_end and ssr_end >= cos_start:
                    output_file_id.write(record)
                    output_record_counter += 1
                    break

        # print the record counters
        xlib.Message.print('verbose', f'\rInput records ... {input_record_counter:8d} - Output records ... {output_record_counter:8d}')

        # read the next record of SSR file
        record = ssr_file_id.readline()
        input_record_counter += 1

    xlib.Message.print('verbose', '\n')

    # close files
    ssr_file_id.close()
    output_file_id.close()

    # print OK message
    print(f'\nThe file {os.path.basename(output_file)} containing the selected SSRs is created.')