Esempio n. 1
0
def get_sample_type_dict(conn):
    '''
    Get a dictionary with the type (PROGENY, MOTHER or ADULT) corresponding to every sample
    of the table "vcf_samples".

    Parameters:
        conn: database connection (only its method "execute" is used).

    Returns:
        A xlib.NestedDefaultDict keyed by sample identification; every value has the form
        {'sample_id': <sample identification>, 'type': <sample type>}.

    Raises:
        xlib.ProgramException (code B002) if the query cannot be executed.
    '''

    # classification done by the query:
    #   - PROGENY: the sample has a mother (mother_id is not 'NONE')
    #   - MOTHER:  the sample is referenced as mother by some sample
    #   - ADULT:   the sample has no mother and is not referenced as mother
    # NOTE: a sample that is both progeny and mother yields two rows; the row processed last is
    # the one kept in the dictionary and the query orders the rows only by sample identification.
    query = '''
               SELECT a.sample_id, 'PROGENY' "type"
                  FROM vcf_samples a
                  WHERE a.mother_id != 'NONE'
               UNION
               SELECT b.sample_id, 'MOTHER' "type"
                  FROM vcf_samples b
                  WHERE b.sample_id IN (SELECT DISTINCT c.mother_id FROM  vcf_samples c)
               UNION
               SELECT d.sample_id, 'ADULT' "type"
                  FROM vcf_samples d
                  WHERE d.mother_id == 'NONE'
                    AND d.sample_id NOT IN (SELECT DISTINCT e.mother_id FROM  vcf_samples e)
               ORDER BY 1;
               '''

    # run the query; any failure is reported as a program exception
    try:
        cursor = conn.execute(query)
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', query, conn)

    # load the rows into the dictionary
    type_by_sample = xlib.NestedDefaultDict()
    for record in cursor:
        sample_id, sample_type = record[0], record[1]
        type_by_sample[sample_id] = {'sample_id': sample_id, 'type': sample_type}

    return type_by_sample
Esempio n. 2
0
def get_vcf_alleles_dict(conn):
    '''
    Get a dictionary with the data of every row of the table "vcf_alleles".

    Parameters:
        conn: database connection (only its method "execute" is used).

    Returns:
        A xlib.NestedDefaultDict with two key levels (variant identification and allele
        identification); every value has the form
        {'bases': <bases>, 'structure_allele_id': <structure allele identification>}.

    Raises:
        xlib.ProgramException (code B002) if the query cannot be executed.
    '''

    # the statement has no variable parts, so a plain string literal is enough
    query = '''
                select variant_id, allele_id, bases, structure_allele_id
                    from vcf_alleles;
                '''

    # run the query; any failure is reported as a program exception
    try:
        cursor = conn.execute(query)
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', query, conn)

    # load the rows into the dictionary
    alleles = xlib.NestedDefaultDict()
    for record in cursor:
        variant_id, allele_id = record[0], record[1]
        alleles[variant_id][allele_id] = dict(bases=record[2], structure_allele_id=record[3])

    return alleles
Esempio n. 3
0
def query_species_and_type_allele_frequencies(conn, md_symbol):
    '''
    Get a dictionary corresponding to rows of individual allele frequencies per species and type of variant per species (alleles with missing data and adult individuals are not considered).

    Parameters:
        conn: database connection (only its method "execute" is used).
        md_symbol: symbol that identifies missing data; alleles whose identification is equal
            to this symbol are excluded.

    Returns:
        A xlib.NestedDefaultDict with four key levels (variant identification, species
        identification, sample type and allele identification); every value has the form
        {'frecuency_sum': <sum of the frequencies of the group>}.

    Raises:
        xlib.ProgramException (code B002) if the query cannot be executed.
    '''

    # initialize the dictionary
    species_and_type_allele_frequency_dict = xlib.NestedDefaultDict()

    # query
    # the missing data symbol is bound as a parameter ("?" placeholder of the sqlite3 module)
    # instead of being pasted into the SQL text, so a symbol containing a quote can neither
    # break nor alter the statement
    # NOTE: "frecuency" is the spelling used by the database column and by the dictionary key
    sentence = '''
                SELECT a.variant_id, b.species_id, b.type, a.allele_id, SUM(a.frecuency)
                    FROM vcf_samples_alleles a, vcf_samples b
                    WHERE a.sample_id = b.sample_id
                      AND a.allele_id <> ?
                      AND b.type <> 'ADULT'
                    GROUP BY a.variant_id, b.species_id, b.type, a.allele_id
                    ORDER BY a.variant_id, b.species_id, b.type, a.allele_id;
                '''
    try:
        # str() keeps the comparison textual, as it was when the symbol was formatted into the text
        rows = conn.execute(sentence, (str(md_symbol),))
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', sentence, conn)

    # add row data to the dictionary
    for row in rows:
        species_and_type_allele_frequency_dict[row[0]][row[1]][row[2]][
            row[3]] = {
                'frecuency_sum': row[4]
            }

    # return the dictionary
    return species_and_type_allele_frequency_dict
Esempio n. 4
0
def build_trapid_annotation(transcripts_with_go_file, transcripts_with_gf_file, transcripts_with_ko_file, annotation_file):
    '''
    Build functional annotation data corresponding to a TRAPID run.

    The GO, GF and KO annotations are gathered in a single dictionary keyed by transcript
    identification and written to the annotation file, one record per transcript, with
    semicolon-separated quoted fields.

    Parameters:
        transcripts_with_go_file: file passed to get_go_annotations.
        transcripts_with_gf_file: file passed to get_gf_annotations.
        transcripts_with_ko_file: file passed to get_ko_annotations.
        annotation_file: path of the output file; it is gzip-compressed when the path ends
            with ".gz".

    Raises:
        xlib.ProgramException (code F004 for a compressed file, F003 otherwise) if the output
        file cannot be opened.
    '''

    # initialize the annotation dictionary
    annotation_dict = xlib.NestedDefaultDict()

    # get GO annotations
    annotation_dict = get_go_annotations(transcripts_with_go_file, annotation_dict)

    # get GF annotations
    annotation_dict = get_gf_annotations(transcripts_with_gf_file, annotation_dict)

    # get KO annotations
    annotation_dict = get_ko_annotations(transcripts_with_ko_file, annotation_dict)

    # open the output annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', annotation_file)

    # the "with" block closes the annotation file even if a write operation fails
    with annotation_file_id:

        # write header record
        annotation_file_id.write('"transcript_id";"go_id";"go_desc";"gf_id";"ko_id";"ko_desc"\n')

        # write transcript records (fields without annotation are written as empty strings)
        for transcript_id in sorted(annotation_dict):
            transcript_data = annotation_dict.get(transcript_id, {})
            go_id = transcript_data.get('go_id', '')
            go_desc = transcript_data.get('go_desc', '')
            gf_id = transcript_data.get('gf_id', '')
            ko_id = transcript_data.get('ko_id', '')
            ko_desc = transcript_data.get('ko_desc', '')
            annotation_file_id.write(f'"{transcript_id}";"{go_id}";"{go_desc}";"{gf_id}";"{ko_id}";"{ko_desc}"\n')
Esempio n. 5
0
def calculate_trapid_go_stats(annotation_file, go_ontology_dict, output_dir):
    '''
    Calculate GO term statistics of a TRAPID annotation file.

    The GO term frequency, the GO terms per sequence and the sequences per GO term are
    accumulated while reading the annotation file and then passed to write_go_frequency,
    write_go_per_seq and write_seq_per_go together with the paths of the files
    "trapid-go-frequency.csv", "trapid-go-per-seq.csv" and "trapid-seq-per-go.csv" of the
    output directory.

    Parameters:
        annotation_file: path of the annotation file; it is read as a gzip-compressed file
            when the path ends with ".gz".
        go_ontology_dict: GO ontology data forwarded to the writer functions.
        output_dir: directory where the statistics files are placed.

    Raises:
        xlib.ProgramException (code F002 for a compressed file, F001 otherwise) if the
        annotation file cannot be opened.
    '''

    # initialize the dictionaries
    go_frequency_dict = xlib.NestedDefaultDict()
    go_per_seq_dict = xlib.NestedDefaultDict()
    seq_per_go_dict = xlib.NestedDefaultDict()

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file,
                                           mode='rt',
                                           encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file,
                                      mode='r',
                                      encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', annotation_file)

    # initialize the annotation counter
    annotation_counter = 0

    # the "with" block closes the annotation file even if a record cannot be processed
    with annotation_file_id:

        # read the first record of the annotation file (header)
        annotation_file_id.readline()

        # read the next record of the annotation file (first data record)
        (record, key,
         data_dict) = xlib.read_trapid_annotation_record(annotation_file,
                                                         annotation_file_id,
                                                         annotation_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

        # while there are records (an empty record means end of file)
        while record != '':

            # add 1 to the annotation counter
            annotation_counter += 1

            # get the GO term and the transcript of the record
            go_id = data_dict['go']
            transcript_id = data_dict['transcript_id']

            # increase the GO term counter in the go term frequency dictionary
            go_frequency_dict[go_id] = go_frequency_dict.get(go_id, 0) + 1

            # add GO term identification in the go term per sequence dictionary
            seq_go_list = go_per_seq_dict.get(transcript_id, [])
            if go_id not in seq_go_list:
                seq_go_list.append(go_id)
                go_per_seq_dict[transcript_id] = seq_go_list

            # add sequence identification in the sequences per GO term dictionary
            go_seq_list = seq_per_go_dict.get(go_id, [])
            if transcript_id not in go_seq_list:
                go_seq_list.append(transcript_id)
                seq_per_go_dict[go_id] = go_seq_list

            xlib.Message.print(
                'verbose',
                f'\rAnnotation file: {annotation_counter} processed records')

            # read the next record of the annotation file
            (record, key, data_dict) = xlib.read_trapid_annotation_record(
                annotation_file, annotation_file_id, annotation_counter)
            xlib.Message.print('trace', f'key: {key} - record: {record}')

    xlib.Message.print('verbose', '\n')

    # print summary
    xlib.Message.print(
        'info', f'{annotation_counter} annotation records in annotation file.')

    # write the GO term frequency
    go_frequency_file = f'{output_dir}/trapid-go-frequency.csv'
    write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(go_frequency_file)} is generated.')

    # write go terms per sequence
    go_per_seq_file = f'{output_dir}/trapid-go-per-seq.csv'
    write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(go_per_seq_file)} is generated.')

    # write sequence identification per go term
    seq_per_go_file = f'{output_dir}/trapid-seq-per-go.csv'
    write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
Esempio n. 6
0
def _add_trinotate_go_terms(go_field, transcript_id, go_frequency_dict,
                            go_per_seq_dict, seq_per_go_dict):
    '''
    Add the GO terms of one Trinotate gene ontology field to the statistics dictionaries.

    Parameters:
        go_field: value of a gene ontology column; "." means that there are no GO terms.
        transcript_id: identification of the transcript the field belongs to.
        go_frequency_dict: GO term frequency dictionary (updated in place).
        go_per_seq_dict: GO terms per sequence dictionary (updated in place).
        seq_per_go_dict: sequences per GO term dictionary (updated in place).

    Raises:
        ValueError if an item of the field does not have exactly three "^"-separated parts.
    '''

    # extract the GO term identifications
    # field format: GO:id1^aspect1^desc1`GO:id2^aspect2^desc2`...`GO:idn^aspectn^descn
    # aspect values: biological process (P), molecular function (F), cellular component (C)
    go_id_list = []
    if go_field != '.':
        for go_data in go_field.split('`'):
            (go_id, _go_aspect, _go_desc) = go_data.split('^')
            go_id_list.append(go_id)

    # increase the GO term counters in the go term frequency dictionary
    for go_id in go_id_list:
        go_frequency_dict[go_id] = go_frequency_dict.get(go_id, 0) + 1

    # add GO term identifications in the go terms per sequence dictionary
    # (the transcript gets an entry even if the field has no GO terms)
    seq_go_list = go_per_seq_dict.get(transcript_id, [])
    for go_id in go_id_list:
        if go_id not in seq_go_list:
            seq_go_list.append(go_id)
    go_per_seq_dict[transcript_id] = seq_go_list

    # add sequence identification in the sequences per GO term dictionary
    for go_id in go_id_list:
        go_seq_list = seq_per_go_dict.get(go_id, [])
        if transcript_id not in go_seq_list:
            go_seq_list.append(transcript_id)
            seq_per_go_dict[go_id] = go_seq_list


def calculate_trinotate_go_stats(annotation_file, go_ontology_dict,
                                 output_dir):
    '''
    Calculate GO term statistics of a Trinotate annotation file.

    The statistics (GO term frequency, GO terms per sequence and sequences per GO term) are
    accumulated separately for the columns "gene_ontology_blastx" and "gene_ontology_blastp"
    and then passed to write_go_frequency, write_go_per_seq and write_seq_per_go together
    with the paths of the "trinotate-blastx-*.csv" and "trinotate-blastp-*.csv" files of the
    output directory.

    Parameters:
        annotation_file: path of the annotation file; it is read as a gzip-compressed file
            when the path ends with ".gz".
        go_ontology_dict: GO ontology data forwarded to the writer functions.
        output_dir: directory where the statistics files are placed.

    Raises:
        xlib.ProgramException (code F002 for a compressed file, F001 otherwise) if the
        annotation file cannot be opened.
    '''

    # initialize the dictionaries
    blastx_go_frequency_dict = xlib.NestedDefaultDict()
    blastx_go_per_seq_dict = xlib.NestedDefaultDict()
    blastx_seq_per_go_dict = xlib.NestedDefaultDict()
    blastp_go_frequency_dict = xlib.NestedDefaultDict()
    blastp_go_per_seq_dict = xlib.NestedDefaultDict()
    blastp_seq_per_go_dict = xlib.NestedDefaultDict()

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file,
                                           mode='rt',
                                           encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file,
                                      mode='r',
                                      encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', annotation_file)

    # initialize the annotation counter
    annotation_counter = 0

    # the "with" block closes the annotation file even if a record cannot be processed
    with annotation_file_id:

        # read the first record of the annotation file (header)
        annotation_file_id.readline()

        # read the next record of the annotation file (first data record)
        (record, key, data_dict) = xlib.read_trinotate_annotation_record(
            annotation_file, annotation_file_id, annotation_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

        # while there are records (an empty record means end of file)
        while record != '':

            # add 1 to the annotation counter
            annotation_counter += 1

            # accumulate the blastx GO terms of the record
            _add_trinotate_go_terms(data_dict['gene_ontology_blastx'],
                                    data_dict['transcript_id'],
                                    blastx_go_frequency_dict,
                                    blastx_go_per_seq_dict,
                                    blastx_seq_per_go_dict)

            # accumulate the blastp GO terms of the record
            _add_trinotate_go_terms(data_dict['gene_ontology_blastp'],
                                    data_dict['transcript_id'],
                                    blastp_go_frequency_dict,
                                    blastp_go_per_seq_dict,
                                    blastp_seq_per_go_dict)

            xlib.Message.print(
                'verbose',
                f'\rAnnotation file: {annotation_counter} processed records')

            # read the next record of the annotation file
            (record, key, data_dict) = xlib.read_trinotate_annotation_record(
                annotation_file, annotation_file_id, annotation_counter)
            xlib.Message.print('trace', f'key: {key} - record: {record}')

    xlib.Message.print('verbose', '\n')

    # print summary
    xlib.Message.print(
        'info', f'{annotation_counter} annotation records in annotation file.')

    # write the GO term frequency
    blastx_go_frequency_file = f'{output_dir}/trinotate-blastx-go-frequency.csv'
    write_go_frequency(blastx_go_frequency_dict, go_ontology_dict,
                       blastx_go_frequency_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(blastx_go_frequency_file)} is generated.')
    blastp_go_frequency_file = f'{output_dir}/trinotate-blastp-go-frequency.csv'
    write_go_frequency(blastp_go_frequency_dict, go_ontology_dict,
                       blastp_go_frequency_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(blastp_go_frequency_file)} is generated.')

    # write go terms per sequence
    blastx_go_per_seq_file = f'{output_dir}/trinotate-blastx-go-per-seq.csv'
    write_go_per_seq(blastx_go_per_seq_dict, go_ontology_dict,
                     blastx_go_per_seq_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(blastx_go_per_seq_file)} is generated.')
    blastp_go_per_seq_file = f'{output_dir}/trinotate-blastp-go-per-seq.csv'
    write_go_per_seq(blastp_go_per_seq_dict, go_ontology_dict,
                     blastp_go_per_seq_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(blastp_go_per_seq_file)} is generated.')

    # write sequence identification per go term
    blastx_seq_per_go_file = f'{output_dir}/trinotate-blastx-seq-per-go.csv'
    write_seq_per_go(blastx_seq_per_go_dict, go_ontology_dict,
                     blastx_seq_per_go_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(blastx_seq_per_go_file)} is generated.')
    blastp_seq_per_go_file = f'{output_dir}/trinotate-blastp-seq-per-go.csv'
    write_seq_per_go(blastp_seq_per_go_dict, go_ontology_dict,
                     blastp_seq_per_go_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(blastp_seq_per_go_file)} is generated.')
Esempio n. 7
0
def calculate_toa_go_stats(annotation_file, go_ontology_dict, output_dir):
    '''
    Calculate GO term statistics of a TOA annotation file (only the sequence with less e-Value is considered).

    Consecutive records with the same sequence identification ("nt_seq_id") are handled as a
    group; only the GO terms of the record of the group with the lowest "hsp_evalue" (the
    first one in case of tie) are added to the statistics. The statistics are then passed to
    write_go_frequency, write_go_per_seq and write_seq_per_go together with the paths of the
    files "toa-go-frequency.csv", "toa-go-per-seq.csv" and "toa-seq-per-go.csv" of the
    output directory.

    Parameters:
        annotation_file: path of the annotation file; it is read as a gzip-compressed file
            when the path ends with ".gz".
        go_ontology_dict: GO ontology data forwarded to the writer functions.
        output_dir: directory where the statistics files are placed.

    Raises:
        xlib.ProgramException (code F002 for a compressed file, F001 otherwise) if the
        annotation file cannot be opened.
        ValueError if the "hsp_evalue" of a record cannot be converted to a number.
    '''

    # initialize the statistics dictionaries
    go_frequency_dict = xlib.NestedDefaultDict()
    go_per_seq_dict = xlib.NestedDefaultDict()
    seq_per_go_dict = xlib.NestedDefaultDict()

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file,
                                           mode='rt',
                                           encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file,
                                      mode='r',
                                      encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', annotation_file)

    # initialize the annotation counter
    annotation_counter = 0

    # the "with" block closes the annotation file even if a record cannot be processed
    with annotation_file_id:

        # read the first record of the annotation file (header)
        annotation_file_id.readline()

        # read the second record of the annotation file (first data record)
        (record, key,
         data_dict) = xlib.read_toa_annotation_record(annotation_file,
                                                      annotation_file_id,
                                                      'MERGER',
                                                      annotation_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

        # while there are records (an empty record means end of file)
        while record != '':

            # initialize the old sequence identification
            old_nt_seq_id = data_dict['nt_seq_id']

            # initialize the minimum e-value and its GO identification list for every sequence,
            # so that nothing is carried over from the previous sequence
            min_evalue = float('inf')
            min_evalue_go_id_list = []

            # while there are records and the same sequence identification
            while record != '' and data_dict['nt_seq_id'] == old_nt_seq_id:

                # add 1 to the annotation counter
                annotation_counter += 1

                # extract the GO term identifications and add them into the GO identification list
                # go_id format: "GO:id1*id2*...*idn"
                if data_dict['go_id'] != '':
                    go_id_list = data_dict['go_id'][3:].split('*')
                else:
                    go_id_list = []

                # save the go identification list of the sequence hit/hsp with less e-value;
                # the minimum has to be updated too, otherwise every later hit would replace
                # the list whatever its e-value is
                evalue = float(data_dict['hsp_evalue'])
                if evalue < min_evalue:
                    min_evalue = evalue
                    min_evalue_go_id_list = go_id_list

                xlib.Message.print(
                    'verbose',
                    f'\rAnnotation file: {annotation_counter} processed records')

                # read the next record of the annotation file
                (record, key, data_dict) = xlib.read_toa_annotation_record(
                    annotation_file, annotation_file_id, 'MERGER',
                    annotation_counter)
                xlib.Message.print('trace', f'key: {key} - record: {record}')

            # rebuild the complete GO term identifications of the selected hit/hsp
            selected_go_id_list = [
                f'GO:{go_number}' for go_number in min_evalue_go_id_list
            ]

            # increase the GO term counter in the go term frequency dictionary
            for go_id in selected_go_id_list:
                go_frequency_dict[go_id] = go_frequency_dict.get(go_id, 0) + 1

            # add GO term identifications in the go terms per sequence dictionary
            # (the sequence gets an entry even if the selected hit/hsp has no GO terms)
            seq_go_list = go_per_seq_dict.get(old_nt_seq_id, [])
            for go_id in selected_go_id_list:
                if go_id not in seq_go_list:
                    seq_go_list.append(go_id)
            go_per_seq_dict[old_nt_seq_id] = seq_go_list

            # add sequence identification in the sequences per GO term dictionary
            for go_id in selected_go_id_list:
                go_seq_list = seq_per_go_dict.get(go_id, [])
                if old_nt_seq_id not in go_seq_list:
                    go_seq_list.append(old_nt_seq_id)
                    seq_per_go_dict[go_id] = go_seq_list

    xlib.Message.print('verbose', '\n')

    # print summary
    xlib.Message.print(
        'info', f'{annotation_counter} annotation records in annotation file.')

    # write the GO term frequency
    go_frequency_file = f'{output_dir}/toa-go-frequency.csv'
    write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(go_frequency_file)} is generated.')

    # write go terms per sequence
    go_per_seq_file = f'{output_dir}/toa-go-per-seq.csv'
    write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(go_per_seq_file)} is generated.')

    # write sequence identification per go term
    seq_per_go_file = f'{output_dir}/toa-seq-per-go.csv'
    write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
Esempio n. 8
0
def _extract_entap_go_ids(go_field):
    '''
    Get the list of GO term identifications contained in one EnTAP GO field.

    Parameters:
        go_field: text with the format "GO:id1-desc1,GO:id2-desc2,...,GO:idn-descn"; an empty
            string means that there are no GO terms.

    Returns:
        The list of the first 10 characters ("GO:" plus seven characters) of every
        comma-separated item that starts with "GO:" once the surrounding blanks are removed.
    '''

    go_id_list = []
    if go_field != '':
        for go_data in go_field.split(','):
            # the blanks have to be removed before slicing; otherwise an item preceded by a
            # blank would yield a shifted and truncated identification
            go_data = go_data.strip()
            if go_data.startswith('GO:'):
                go_id_list.append(go_data[:10])
    return go_id_list


def calculate_entap_go_stats(annotation_file, go_ontology_dict, output_dir):
    '''
    Calculate GO term statistics of a EnTAP annotation file.

    The GO terms of the columns "go_biological", "go_cellular" and "go_molecular" of every
    record are accumulated in the GO term frequency, the GO terms per sequence and the
    sequences per GO term, which are then passed to write_go_frequency, write_go_per_seq and
    write_seq_per_go together with the paths of the files "entap-go-frequency.csv",
    "entap-go-per-seq.csv" and "entap-seq-per-go.csv" of the output directory.

    Parameters:
        annotation_file: path of the annotation file; it is read as a gzip-compressed file
            when the path ends with ".gz".
        go_ontology_dict: GO ontology data forwarded to the writer functions.
        output_dir: directory where the statistics files are placed.

    Raises:
        xlib.ProgramException (code F002 for a compressed file, F001 otherwise) if the
        annotation file cannot be opened.
    '''

    # initialize the dictionaries
    go_frequency_dict = xlib.NestedDefaultDict()
    go_per_seq_dict = xlib.NestedDefaultDict()
    seq_per_go_dict = xlib.NestedDefaultDict()

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file,
                                           mode='rt',
                                           encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file,
                                      mode='r',
                                      encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', annotation_file)

    # initialize the annotation counter
    annotation_counter = 0

    # the "with" block closes the annotation file even if a record cannot be processed
    with annotation_file_id:

        # read the first record of the annotation file (header)
        annotation_file_id.readline()

        # read the next record of the annotation file (first data record)
        (record, key,
         data_dict) = xlib.read_entap_annotation_record(annotation_file,
                                                        annotation_file_id,
                                                        annotation_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

        # while there are records (an empty record means end of file)
        while record != '':

            # add 1 to the annotation counter
            annotation_counter += 1

            # extract the biological, cellular and molecular GO term identifications
            # and concat them (in this order) into the GO identification list
            go_id_list = (_extract_entap_go_ids(data_dict['go_biological']) +
                          _extract_entap_go_ids(data_dict['go_cellular']) +
                          _extract_entap_go_ids(data_dict['go_molecular']))

            # get the sequence identification of the record
            query_sequence = data_dict['query_sequence']

            # increase the GO term counters in the go term frequency dictionary
            for go_id in go_id_list:
                go_frequency_dict[go_id] = go_frequency_dict.get(go_id, 0) + 1

            # add GO term identifications in the go term per sequence dictionary
            # (the sequence gets an entry even if the record has no GO terms)
            seq_go_list = go_per_seq_dict.get(query_sequence, [])
            for go_id in go_id_list:
                if go_id not in seq_go_list:
                    seq_go_list.append(go_id)
            go_per_seq_dict[query_sequence] = seq_go_list

            # add sequence identification in the sequences per GO term dictionary
            for go_id in go_id_list:
                go_seq_list = seq_per_go_dict.get(go_id, [])
                if query_sequence not in go_seq_list:
                    go_seq_list.append(query_sequence)
                    seq_per_go_dict[go_id] = go_seq_list

            xlib.Message.print(
                'verbose',
                f'\rAnnotation file: {annotation_counter} processed records')

            # read the next record of the annotation file
            (record, key, data_dict) = xlib.read_entap_annotation_record(
                annotation_file, annotation_file_id, annotation_counter)
            xlib.Message.print('trace', f'key: {key} - record: {record}')

    xlib.Message.print('verbose', '\n')

    # print summary
    xlib.Message.print(
        'info', f'{annotation_counter} annotation records in annotation file.')

    # write the GO term frequency
    go_frequency_file = f'{output_dir}/entap-go-frequency.csv'
    write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(go_frequency_file)} is generated.')

    # write go terms per sequence
    go_per_seq_file = f'{output_dir}/entap-go-per-seq.csv'
    write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(go_per_seq_file)} is generated.')

    # write sequence identification per go term
    seq_per_go_file = f'{output_dir}/entap-seq-per-go.csv'
    write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
Esempio n. 9
0
def calculate_blast2go_go_stats(annotation_file, go_ontology_dict, output_dir):
    '''
    Calculate GO term statistics of a Blast2GO annotation file.

    The GO terms of the columns "go_ids" and "interpro_go_ids" of every record are
    accumulated in the GO term frequency, the GO terms per sequence and the sequences per GO
    term, which are then passed to write_go_frequency, write_go_per_seq and write_seq_per_go
    together with the paths of the files "blast2go-go-frequency.csv",
    "blast2go-go-per-seq.csv" and "blast2go-seq-per-go.csv" of the output directory.

    Parameters:
        annotation_file: path of the annotation file; it is read as a gzip-compressed file
            when the path ends with ".gz".
        go_ontology_dict: GO ontology data forwarded to the writer functions.
        output_dir: directory where the statistics files are placed.

    Raises:
        xlib.ProgramException (code F002 for a compressed file, F001 otherwise) if the
        annotation file cannot be opened.
    '''

    # initialize the dictionaries
    go_frequency_dict = xlib.NestedDefaultDict()
    go_per_seq_dict = xlib.NestedDefaultDict()
    seq_per_go_dict = xlib.NestedDefaultDict()

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file,
                                           mode='rt',
                                           encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file,
                                      mode='r',
                                      encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', annotation_file)

    # initialize the annotation counter
    annotation_counter = 0

    # the "with" block closes the annotation file even if a record cannot be processed
    with annotation_file_id:

        # read the first record of the annotation file (header)
        annotation_file_id.readline()

        # read the next record of the annotation file (first data record)
        (record, key,
         data_dict) = xlib.read_blast2go_annotation_record(annotation_file,
                                                           annotation_file_id,
                                                           annotation_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

        # while there are records (an empty record means end of file)
        while record != '':

            # add 1 to the annotation counter
            annotation_counter += 1

            # extract GO term identifications and add them into the GO identification list
            # go_ids format: "aspect1:GO:id1;aspect2:GO:id2;...;aspectn:GO:idn"
            # aspect values: P (biological process), F (molecular function), C (cellular component)
            # (the first two characters, i.e. the aspect and its colon, are discarded)
            go_id_list_1 = []
            if data_dict['go_ids'] != '':
                go_id_list_1 = [
                    go_data.strip()[2:]
                    for go_data in data_dict['go_ids'].split(';')
                ]

            # extract InterPro GO term identifications and add them into the GO identification list
            # interpro_go_ids format: "aspect1:GO:id1;aspect2:GO:id2;...;aspectn:GO:idn"
            # aspect values: P (biological process), F (molecular function), C (cellular component)
            go_id_list_2 = []
            if data_dict['interpro_go_ids'] not in [
                    '', 'no GO terms', 'no IPS match'
            ]:
                go_id_list_2 = [
                    go_data.strip()[2:]
                    for go_data in data_dict['interpro_go_ids'].split(';')
                ]

            # concat GO identification lists
            go_id_list = go_id_list_1 + go_id_list_2

            # get the sequence name of the record
            seq_name = data_dict['seq_name']

            # increase the GO term counters in the go term frequency dictionary
            for go_id in go_id_list:
                go_frequency_dict[go_id] = go_frequency_dict.get(go_id, 0) + 1

            # add GO term identifications in the go terms per sequence dictionary
            # (the sequence gets an entry even if the record has no GO terms)
            seq_go_list = go_per_seq_dict.get(seq_name, [])
            for go_id in go_id_list:
                if go_id not in seq_go_list:
                    seq_go_list.append(go_id)
            go_per_seq_dict[seq_name] = seq_go_list

            # add sequence identification in the sequences per GO term dictionary
            for go_id in go_id_list:
                go_seq_list = seq_per_go_dict.get(go_id, [])
                if seq_name not in go_seq_list:
                    go_seq_list.append(seq_name)
                    seq_per_go_dict[go_id] = go_seq_list

            xlib.Message.print(
                'verbose',
                f'\rAnnotation file: {annotation_counter} processed records')

            # read the next record of the annotation file
            (record, key, data_dict) = xlib.read_blast2go_annotation_record(
                annotation_file, annotation_file_id, annotation_counter)
            xlib.Message.print('trace', f'key: {key} - record: {record}')

    xlib.Message.print('verbose', '\n')

    # print summary
    xlib.Message.print(
        'info', f'{annotation_counter} annotation records in annotation file.')

    # write the GO term frequency
    go_frequency_file = f'{output_dir}/blast2go-go-frequency.csv'
    write_go_frequency(go_frequency_dict, go_ontology_dict, go_frequency_file)
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(go_frequency_file)} is generated.')

    # write go terms per sequence
    go_per_seq_file = f'{output_dir}/blast2go-go-per-seq.csv'
    write_go_per_seq(go_per_seq_dict, go_ontology_dict, go_per_seq_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(go_per_seq_file)} is generated.')

    # write sequence identification per go term
    seq_per_go_file = f'{output_dir}/blast2go-seq-per-go.csv'
    write_seq_per_go(seq_per_go_dict, go_ontology_dict, seq_per_go_file)
    xlib.Message.print(
        'info', f'The file {os.path.basename(seq_per_go_file)} is generated.')
Esempio n. 10
0
def _scenario_x_check_tables(conn):
    '''
    Check that every table read by the scenario X queries is loaded.

    The program exception "B003" naming the table is raised for the first
    table whose check function returns 0.
    '''

    table_checks = (
        ('gene_info', xsqlite.check_gene_info),
        ('genomic_features', xsqlite.check_genomic_features),
        ('vcf_samples', xsqlite.check_vcf_samples),
        ('vcf_variants', xsqlite.check_vcf_variants),
        ('vcf_alleles', xsqlite.check_vcf_alleles),
        ('vcf_samples_alleles', xsqlite.check_vcf_samples_alleles),
    )
    for table_name, check_function in table_checks:
        xlib.Message.print('verbose',
                           f'Checking the table "{table_name}" ...\n')
        if check_function(conn) == 0:
            raise xlib.ProgramException('', 'B003', table_name)
        xlib.Message.print('verbose', 'The table is loaded.\n')


def _scenario_x_group_variants(variant_dict):
    '''
    Collapse the consecutive records that share a variant identification into
    one dictionary per variant.

    The scalar data (sequence, position, end, gene, description, chromosome)
    are taken from the last record of every group; "found_gene" and
    "found_exon" tell whether any record of the group has the type
    gene/pseudogene or exon.
    '''

    variant_list = []
    variant = None

    # the variant dictionary is indexed by consecutive integers from 0, so it
    # has to be walked by index instead of by key iteration
    for i in range(len(variant_dict)):
        record = variant_dict[i]

        # a change of variant identification starts a new group
        if variant is None or record['variant_id'] != variant['variant_id']:
            variant = {
                'variant_id': record['variant_id'],
                'found_gene': False,
                'found_exon': False
            }
            variant_list.append(variant)

        # the last record of the group wins
        variant['seq_id'] = record['seq_id']
        variant['position'] = record['position']
        variant['end'] = record['end']
        variant['gene'] = record['gene']
        description = record['description']
        variant['description'] = 'N/A' if description is None else description
        chromosome_id = record['chromosome_id']
        variant['chromosome_id'] = 'N/A' if chromosome_id is None else chromosome_id

        if record['type'] in ['gene', 'pseudogene']:
            variant['found_gene'] = True
        elif record['type'] == 'exon':
            variant['found_exon'] = True

    return variant_list


def _scenario_x_build_fragments(variant_list, max_separation, tsi_list):
    '''
    Assign an intergenic fragment identification to every variant without gene.

    The variants of a sequence are walked by ascending position; a new
    fragment is started when a position is farther than "max_separation" from
    the first position of the current fragment. The returned dictionary maps
    the variant identification to {'fragment_id': '<seq_id>-F<nnn>'}.
    '''

    xlib.Message.print('verbose',
                       'Building the intergenic variant dictionary ...\n')

    # variants whose gene is 'N/A', indexed by sequence and position
    intergenic_variant_dict = xlib.NestedDefaultDict()
    for variant in variant_list:
        if variant['gene'] == 'N/A':
            intergenic_variant_dict[variant['seq_id']][variant['position']] = {
                'variant_id': variant['variant_id']
            }

    xlib.Message.print('verbose', 'Dictionary is built.\n')

    xlib.Message.print('verbose',
                       'Building the intergenic fragment dictionary ...\n')

    fragment_dict = xlib.NestedDefaultDict()

    for seq_id in sorted(intergenic_variant_dict.keys()):

        if seq_id in tsi_list:
            xlib.Message.print('trace', f'seq_id: {seq_id}')
            xlib.Message.print(
                'trace',
                f'intergenic_variant_dict[seq_id]: {intergenic_variant_dict[seq_id]}'
            )

        fragment_num = 0
        fragment_start = None

        for position in sorted(intergenic_variant_dict[seq_id]):

            if fragment_start is None:
                # first variant in the sequence
                fragment_start = position
            elif position > fragment_start + max_separation:
                # too far from the start of the current fragment
                fragment_num += 1
                fragment_start = position

            variant_id = intergenic_variant_dict[seq_id][position][
                'variant_id']
            fragment_dict[variant_id] = {
                'fragment_id': f'{seq_id}-F{fragment_num:03d}'
            }

    xlib.Message.print('verbose', 'Dictionary is built.\n')

    return fragment_dict


def _scenario_x_open_output(path):
    '''
    Open an output text file, raising the program exception "F003" on failure.
    '''

    try:
        return open(path, mode='w', encoding='iso-8859-1', newline='\n')
    except Exception as e:
        raise xlib.ProgramException(e, 'F003', path)


def _scenario_x_relative_frequency(frequency_sum, total):
    '''
    Return frequency_sum / total, or 'N/A' when the total is zero.
    '''

    try:
        return frequency_sum / total
    except ZeroDivisionError:
        return 'N/A'


def query_data(conn, file_name, sp1_id, sp2_id, hybrid_id, imputed_md_id,
               max_separation, output_dir, tsi_list):
    '''
    List data of variants and alleles and variant identifications to the scenario X.

    Three files named "<file_name>-data2scenarioX-..." are written in
    "output_dir":
      - "variants.csv": one record per variant;
      - "alleles.csv": one record per variant allele with the relative
        frequencies per species and per species and sample type;
      - "selected_ids.txt": the selected variant identifications; for every
        gene/fragment they are all its variants when all of them have
        imputations, otherwise only the variants without imputations.

    Parameters:
      conn: database connection passed to the "xsqlite" functions.
      file_name: prefix of the output file names.
      sp1_id, sp2_id, hybrid_id: species identifications used as keys of the
        frequency dictionaries and in the head record of the allele file.
      imputed_md_id: value passed to "xsqlite.query_imputed_alleles".
      max_separation: maximum distance from the first variant of an intergenic
        fragment to the other variants of the same fragment.
      output_dir: directory where the files are written.
      tsi_list: sequence identifications whose processing is traced.

    Raises:
      xlib.ProgramException: "B003" when a table is not loaded; "F003" when an
        output file can not be opened.
    '''

    # check that the required tables are loaded
    _scenario_x_check_tables(conn)

    # get the variant dictionary
    xlib.Message.print('verbose', 'Getting variant data ...\n')
    variant_dict = xsqlite.query_variants(conn)
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the allele dictionary
    xlib.Message.print('verbose', 'Getting allele data ...\n')
    allele_dict = xsqlite.get_vcf_alleles_dict(conn)
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the imputated allele dictionary
    xlib.Message.print('verbose', 'Getting imputated allele data ...\n')
    imputed_allele_dict = xsqlite.query_imputed_alleles(conn, imputed_md_id)
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the dictionary of allele frequency per species
    xlib.Message.print(
        'verbose',
        'Getting the dictionary of allele frecuency per species ...\n')
    species_allele_frequency_dict = xsqlite.query_species_allele_frequencies(
        conn, xlib.get_md_symbol())
    xlib.Message.print('verbose', 'Data are got.\n')

    # get the dictionary of allele frequency per species and type
    xlib.Message.print(
        'verbose',
        'Getting the dictionary of allele frecuency per species and type ...\n'
    )
    species_and_type_allele_frequency_dict = xsqlite.query_species_and_type_allele_frequencies(
        conn, xlib.get_md_symbol())
    xlib.Message.print('verbose', 'Data are got.\n')

    # collapse the records of every variant once; the result feeds the
    # fragment building and both CSV files
    variant_list = _scenario_x_group_variants(variant_dict)

    # build the intergenic fragment dictionary
    fragment_dict = _scenario_x_build_fragments(variant_list, max_separation,
                                                tsi_list)

    # complete the data of every variant shared by the variant and allele files
    for variant in variant_list:
        variant_id = variant['variant_id']

        if variant['gene'] != 'N/A':
            gene_or_fragment = variant['gene']
        else:
            gene_or_fragment = fragment_dict[variant_id]['fragment_id']

        if variant['end'] == 0:
            genomic_zone = 'N/A'
        elif not variant['found_gene']:
            genomic_zone = 'intergenic'
        elif variant['found_exon']:
            genomic_zone = 'exonic'
        else:
            genomic_zone = 'intronic'

        if imputed_allele_dict.get(variant_id, 0) == 0:
            imputations = 'N'
        else:
            imputations = 'Y'

        variant['gene_or_fragment'] = gene_or_fragment
        variant['imputations'] = imputations
        variant['fields'] = (
            f'"{variant_id}";"{variant["seq_id"]}";{variant["position"]};'
            f'"{genomic_zone}";"{gene_or_fragment}";"{variant["description"]}";'
            f'"{variant["chromosome_id"]}";"{imputations}"')

    #-------------------------------------------------------------------------------
    # Create the variant file
    #-------------------------------------------------------------------------------

    xlib.Message.print('verbose', 'Writting the variant file ...\n')

    # imputations ('Y'/'N') of the variants of every gene/fragment
    imputation_dict = {}

    variant_file = f'{output_dir}/{file_name}-data2scenarioX-variants.csv'
    with _scenario_x_open_output(variant_file) as variant_file_id:

        # write head record of the output variant file
        variant_file_id.write(
            '"variant_id";"seq_id";"position";"genomic_zone";"gene/fragment";"description";"chromosome_id";"imputations"\n'
        )

        for variant in variant_list:
            imputation_dict.setdefault(
                variant['gene_or_fragment'],
                {})[variant['variant_id']] = variant['imputations']
            variant_file_id.write(f'{variant["fields"]}\n')

    xlib.Message.print(
        'info',
        f'The file {os.path.basename(variant_file)} containing variant data is created.'
    )

    #-------------------------------------------------------------------------------
    # Create the allele file
    #-------------------------------------------------------------------------------

    xlib.Message.print('verbose', 'Writting the allele file ...\n')

    # (label used in trace messages, key used in the summation dictionaries)
    # in the order of the frequency columns of the allele file
    species_columns = (('sp1', sp1_id), ('sp2', sp2_id),
                       ('hybrid', hybrid_id))
    type_columns = tuple((f'{label}_{suffix}', f'{species_id}_{suffix}')
                         for suffix in ('mothers', 'progenies')
                         for label, species_id in species_columns)
    species_keys = tuple(key for _, key in species_columns)
    type_keys = tuple(key for _, key in type_columns)

    allele_file = f'{output_dir}/{file_name}-data2scenarioX-alleles.csv'
    with _scenario_x_open_output(allele_file) as allele_file_id:

        # write head record of the output allele file
        allele_file_id.write(
            f'"variant_id";"seq_id";"position";"genomic_zone";"gene/fragment";"description";"chromosome_id";"imputations";"allele_id";"bases";"{sp1_id}_frequency";"{sp2_id}_frequency";"{hybrid_id}_frequency";"{sp1_id}_mothers_frequency";"{sp2_id}_mothers_frequency";"{hybrid_id}_mothers_frequency";"{sp1_id}_progenies_frequency";"{sp2_id}_progenies_frequency";"{hybrid_id}_progenies_frequency"\n'
        )

        for variant in variant_list:

            variant_id = variant['variant_id']

            # trace according to the sequence of the current variant
            traced = variant['seq_id'] in tsi_list
            if traced:
                xlib.Message.print('trace', f'variant_id: {variant_id}')

            # build the frequency summation dictionary of every species per allele
            species_frecuency_summation_dict = {}
            species_data = species_allele_frequency_dict[variant_id]
            for species_id in species_data.keys():
                for allele_id in species_data[species_id].keys():
                    allele_data_dict = species_frecuency_summation_dict.setdefault(
                        allele_id, dict.fromkeys(species_keys, 0))
                    allele_data_dict[species_id] += species_data[species_id][
                        allele_id]['frecuency_sum']

            if traced:
                xlib.Message.print(
                    'trace',
                    f'species_frecuency_summation_dict: {species_frecuency_summation_dict}'
                )

            # build the frequency summation dictionary of every species and type per allele
            species_and_type_frecuency_summation_dict = {}
            type_data = species_and_type_allele_frequency_dict[variant_id]
            for species_id in type_data.keys():
                for sample_type in type_data[species_id].keys():
                    if sample_type == 'MOTHER':
                        data_id = f'{species_id}_mothers'
                    elif sample_type == 'PROGENY':
                        data_id = f'{species_id}_progenies'
                    else:
                        # only mothers and progenies have a column in the
                        # allele file; any other type must not be added to
                        # the bucket left by a previous iteration
                        continue
                    for allele_id in type_data[species_id][sample_type].keys():
                        allele_data_dict = species_and_type_frecuency_summation_dict.setdefault(
                            allele_id, dict.fromkeys(type_keys, 0))
                        allele_data_dict[data_id] += type_data[species_id][
                            sample_type][allele_id]['frecuency_sum']

            if traced:
                xlib.Message.print(
                    'trace',
                    f'species_and_type_frecuency_summation_dict: {species_and_type_frecuency_summation_dict}'
                )

            # calculate the allele frequency totals per species and per
            # species and type over the alleles of the species dictionary
            species_totals = dict.fromkeys(
                (label for label, _ in species_columns), 0)
            type_totals = dict.fromkeys((label for label, _ in type_columns),
                                        0)
            for allele_id in species_frecuency_summation_dict:
                for label, key in species_columns:
                    species_totals[
                        label] += species_frecuency_summation_dict[
                            allele_id].get(key, 0)
                for label, key in type_columns:
                    type_totals[
                        label] += species_and_type_frecuency_summation_dict.get(
                            allele_id, {}).get(key, 0)

            if traced:
                for label, total in species_totals.items():
                    xlib.Message.print(
                        'trace', f'allele_frecuency_total_{label}: {total}')
                for label, total in type_totals.items():
                    xlib.Message.print(
                        'trace', f'allele_frecuency_total_{label}: {total}')

            # for each allele of the variant
            for allele_id in species_frecuency_summation_dict:

                # relative frequency of each species per allele
                frequency_list = [
                    _scenario_x_relative_frequency(
                        species_frecuency_summation_dict[allele_id][key],
                        species_totals[label])
                    for label, key in species_columns
                ]

                # relative frequency of each species and type per allele
                frequency_list += [
                    _scenario_x_relative_frequency(
                        species_and_type_frecuency_summation_dict.get(
                            allele_id, {}).get(key, 0), type_totals[label])
                    for label, key in type_columns
                ]

                # get bases sequence
                bases = allele_dict[variant_id][allele_id]['bases']

                # write the allele record
                frequency_fields = ';'.join(f'"{frequency}"'
                                            for frequency in frequency_list)
                allele_file_id.write(
                    f'{variant["fields"]};"{allele_id}";"{bases}";{frequency_fields}\n'
                )

    xlib.Message.print(
        'info',
        f'The file {os.path.basename(allele_file)} containing allele data is created.'
    )

    #-------------------------------------------------------------------------------
    # Create the selected variant id file corresponding to the scenario X
    #-------------------------------------------------------------------------------

    xlib.Message.print(
        'verbose',
        'Writting the file with selected variant id corresponding to the scenario X...\n'
    )

    selected_id_file = f'{output_dir}/{file_name}-data2scenarioX-selected_ids.txt'
    with _scenario_x_open_output(selected_id_file) as selected_id_file_id:

        # for every gene/fragment write the variant identifications corresponding to scenario X
        for gene_or_fragment in sorted(imputation_dict.keys()):

            variant_imputations = imputation_dict[gene_or_fragment]

            # when every variant has imputations all of them are selected;
            # otherwise only the variants without imputations are
            all_imputed = all(imputations == 'Y'
                              for imputations in variant_imputations.values())
            for variant_id, imputations in variant_imputations.items():
                if all_imputed or imputations == 'N':
                    selected_id_file_id.write(f'{variant_id}\n')

    xlib.Message.print(
        'info',
        f'The file {os.path.basename(selected_id_file)} containing selected ids is created.'
    )
Esempio n. 11
0
def _filter_ssrs_open_file(path, mode):
    '''
    Open a text file in mode "r" or "w"; files ending in ".gz" are opened
    with gzip.

    The program exception "F002" is raised when a gzip file can not be opened
    and "F001" when a plain file can not be opened.
    '''

    if path.endswith('.gz'):
        try:
            return gzip.open(path, mode=f'{mode}t', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', path)
    try:
        return open(path, mode=mode, encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', path)


def filter_ssrs(cos_file, ssr_file, output_file):
    '''
    Filter a SSR file transcripts selecting SSRs included in a COS.

    The COS file is a FASTA file whose head records are split by ":"; the
    third field is the contig name and the fourth one is "<start>-<end>".
    The first record of the SSR file is copied as head; every other record is
    split by tabs and copied when its interval (third and fourth fields)
    overlaps any COS interval of its contig.

    Parameters:
      cos_file: path of the COS FASTA file (plain or ".gz").
      ssr_file: path of the input SSR file (plain or ".gz").
      output_file: path of the filtered SSR file (plain or ".gz").

    Raises:
      xlib.ProgramException: "F001"/"F002" when a file can not be opened;
        "F006" when a COS or SSR record can not be parsed.
    '''

    # COS intervals per contig: contig_dict[contig_name][start_end] = (start, end)
    contig_dict = {}

    # initialize the COS sequences counter
    cos_seq_counter = 0

    with _filter_ssrs_open_file(cos_file, 'r') as cos_file_id:

        for cos_record_counter, record in enumerate(cos_file_id, start=1):

            if not record.startswith('>'):

                # control the FASTA format: the file has to begin with a head
                if cos_seq_counter == 0:
                    raise xlib.ProgramException('', 'F006', cos_file, 'FASTA')

                # the sequence records are not needed to filter the SSRs
                continue

            # add 1 to the COS sequences counter
            cos_seq_counter += 1

            # extract head data
            head_data_list = record[1:].strip('\n').split(':')
            try:
                contig_name = head_data_list[2]
                cos_star_end = head_data_list[3]
                # a field without "-" leaves end_text empty and int() fails
                start_text, _, end_text = cos_star_end.partition('-')
                cos_start = int(start_text)
                cos_end = int(end_text)
            except Exception as e:
                raise xlib.ProgramException(e, 'F006',
                                            os.path.basename(cos_file),
                                            cos_record_counter)

            # add item in the COS dictionary
            contig_dict.setdefault(contig_name,
                                   {})[cos_star_end] = (cos_start, cos_end)

            # print the COS sequence counter
            xlib.Message.print(
                'verbose', f'\rProcessed COS seqs ... {cos_seq_counter:8d}')

    xlib.Message.print('verbose', '\n')

    # initialize the output record counter
    output_record_counter = 0

    # both files are closed even if a record can not be processed
    with _filter_ssrs_open_file(ssr_file, 'r') as ssr_file_id, \
            _filter_ssrs_open_file(output_file, 'w') as output_file_id:

        for input_record_counter, record in enumerate(ssr_file_id, start=1):

            # the head record is always copied
            if input_record_counter == 1:
                selected = True

            else:

                # extract SSR data
                # NOTE(review): the first character of the record is dropped
                # and the contig name is cut at the first space (its last
                # character is dropped when there is no space); kept as in the
                # original, confirm against the SSR file format
                ssr_data_list = record[1:].strip('\n').split('\t')
                try:
                    contig_name = ssr_data_list[0][:ssr_data_list[0].find(' ')]
                    ssr_start = int(ssr_data_list[2])
                    ssr_end = int(ssr_data_list[3])
                except Exception as e:
                    # report the SSR file and the number of the wrong record
                    raise xlib.ProgramException(e, 'F006',
                                                os.path.basename(ssr_file),
                                                input_record_counter)

                # the SSR is selected when it overlaps a COS of its contig
                selected = any(
                    ssr_start <= cos_end and ssr_end >= cos_start
                    for cos_start, cos_end in contig_dict.get(
                        contig_name, {}).values())

            if selected:
                output_file_id.write(record)
                output_record_counter += 1

            # print the record counters
            xlib.Message.print(
                'verbose',
                f'\rInput records ... {input_record_counter:8d} - Output records ... {output_record_counter:8d}'
            )

    xlib.Message.print('verbose', '\n')

    # print OK message
    print(
        f'\nThe file {os.path.basename(output_file)} containing the selected SSRs is created.'
    )