コード例 #1
0
ファイル: load-basic-data.py プロジェクト: GGFHF/TOA
def load_table_species(conn, species_file):
    '''
    Rebuild the table "species" from the records of a species file.

    The table is dropped and created again, one row is inserted for every data
    record of the file (blank records and records starting with "#" are skipped),
    the index of the table is created and the changes are committed.

    Parameters:
        conn: connection passed to the xsqlite functions; its method "commit" is
            called when all the records are loaded.
        species_file: path of the species file; it is read with gzip when the
            path ends with ".gz".

    Raises:
        xlib.ProgramException: with code "F002" (gzip file) or "F001" (plain file)
            when the file cannot be opened, and with code "F006" when a data
            record does not match the expected format.
    '''

    # drop table "species" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "species" ...\n')
    xsqlite.drop_species(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "species"
    xlib.Message.print('verbose', 'Creating the table "species" ...\n')
    xsqlite.create_species(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # set the pattern of the data records
    # format: "species_name";"plaza_id"
    record_pattern = re.compile(r'^"(.*)";"(.*)"$')

    # initialize the record counter and the inserted row counter
    record_counter = 0
    inserted_row_counter = 0

    # open the file of species data
    if species_file.endswith('.gz'):
        try:
            species_file_id = gzip.open(species_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', species_file) from e
    else:
        try:
            species_file_id = open(species_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', species_file) from e

    # the "with" statement closes the file even if the processing of a record raises an exception
    with species_file_id:

        # for each record of the file
        for record in species_file_id:

            # add 1 to record counter
            record_counter += 1

            # process data records (neither blank records nor comments)
            if record.strip() != '' and not record.lstrip().startswith('#'):

                # extract data
                mo = record_pattern.match(record)
                if mo is None:
                    raise xlib.ProgramException('F006', os.path.basename(species_file), record_counter)
                row_dict = {
                    'species_name': mo.group(1).strip().capitalize(),
                    'plaza_species_id': mo.group(2).strip().lower(),
                }

                # get the taxonomy dictionary of the species name from taxonomy server
                taxonomy_dict = xlib.get_taxonomy_dict('name', row_dict['species_name'])
                if taxonomy_dict == {}:
                    # without taxonomy data, every taxonomy column gets the value returned by xlib.get_na()
                    for column in ('family_name', 'phylum_name', 'kingdom_name', 'superkingdom_name', 'tax_id'):
                        row_dict[column] = xlib.get_na()
                else:
                    for rank in ('family', 'phylum', 'kingdom', 'superkingdom'):
                        row_dict[f'{rank}_name'] = taxonomy_dict[rank]['name']
                    row_dict['tax_id'] = taxonomy_dict['tax_id']

                # insert data into table species
                xsqlite.insert_species_row(conn, row_dict)
                inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of species file: {record_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # create the index on the table "species"
    xlib.Message.print('verbose', 'Creating the index on the table "species" ...\n')
    xsqlite.create_species_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #2
0
ファイル: load-blast-data.py プロジェクト: GGFHF/TOA
def _print_xml_item(dash_count, item, end='\n'):
    '''
    Print a verbose trace message with the tag, attributes and text of an XML item.

    The message starts with an arrow made of "dash_count" dashes and finishes with "end".
    '''

    arrow = '-' * dash_count + '>'
    xlib.Message.print('verbose', f'{arrow} tag: {item.tag} - attrib: {item.attrib} - text: {item.text}{end}')


def load_table_blast_5(conn, dataset_id, blast_file):
    '''
    Load the data of a BLAST XML file into the table "blast".

    Nothing is done when the file is empty. Otherwise the table and its index are
    created (if they do not exist), the previous rows of the dataset are deleted,
    one row is inserted for every item "Hsp" found under each iteration and hit,
    and the changes are committed.

    Parameters:
        conn: connection passed to the xsqlite functions; its method "commit" is
            called when all the rows are inserted.
        dataset_id: dataset identification saved in every inserted row and used to
            delete the previous rows.
        blast_file: path of the BLAST XML file.

    Raises:
        xlib.ProgramException: with code "F001" when the file cannot be opened.
    '''

    # check if BLAST file is not empty; the file is closed as soon as its first record is read
    try:
        blast_file_id = open(blast_file, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException('F001', blast_file) from e
    with blast_file_id:
        record = blast_file_id.readline()
    if record == '':
        return

    # initialize the iteration counter
    iteration_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # data of every item "Hsp": (row key, tag, default value, conversion of the item text)
    hsp_fields = (
        ('hsp_num', 'Hsp_num', 0, int),
        ('hsp_evalue', 'Hsp_evalue', 0., float),
        ('hsp_identity', 'Hsp_identity', 0, int),
        ('hsp_positive', 'Hsp_positive', 0, int),
        ('hsp_gaps', 'Hsp_gaps', 0, int),
        ('hsp_align_len', 'Hsp_align-len', 0, int),
        ('hsp_qseq', 'Hsp_qseq', '', None),
    )

    # create table "blast"
    xlib.Message.print('verbose', 'Creating the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create the index on the table "blast"
    xlib.Message.print('verbose', 'Creating the index on the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete the rows of the table "blast" corresponding to the dataset identification
    xlib.Message.print('verbose', 'Deleting previous rows from the table "blast" ...\n')
    xsqlite.delete_blast_rows(conn, dataset_id)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # build the complete item tree from BLAST XML file
    tree = xml.etree.ElementTree.parse(blast_file)
    root = tree.getroot()

    # walk the tree and insert data into table "blast" for each iteration-hit-hsp
    for item_blastoutput_iterations in root.iter(tag='BlastOutput_iterations'):
        _print_xml_item(1, item_blastoutput_iterations)

        # get items "Iteration"
        for item_iteration in item_blastoutput_iterations.iter(tag='Iteration'):
            _print_xml_item(3, item_iteration)

            # add 1 to iteration counter
            iteration_counter += 1

            # initialize the row data dictionary with the iteration data, so these keys
            # exist in the row even if the corresponding items are not in the iteration
            row_dict = {}
            row_dict['dataset_id'] = dataset_id
            row_dict['iteration_iter_num'] = 0
            row_dict['iteration_query_def'] = ''

            # get data of item "Iteration_iter-num"
            for item in item_iteration.iter(tag='Iteration_iter-num'):
                _print_xml_item(5, item)
                row_dict['iteration_iter_num'] = int(item.text)

            # get data of item "Iteration_query-def"
            for item in item_iteration.iter(tag='Iteration_query-def'):
                _print_xml_item(5, item)
                row_dict['iteration_query_def'] = item.text

            # get items "Iteration_hits"
            for item_iteration_hits in item_iteration.iter(tag='Iteration_hits'):
                _print_xml_item(5, item_iteration_hits)

                # get items "Hit"
                for item_hit in item_iteration_hits.iter(tag='Hit'):
                    # this trace message is the only one without a final line feed
                    _print_xml_item(7, item_hit, end='')

                    # initialize hit data
                    row_dict['hit_num'] = 0
                    row_dict['hit_id'] = xlib.get_na()
                    row_dict['hit_def'] = xlib.get_na()
                    row_dict['hit_accession'] = xlib.get_na()

                    # get data of item "Hit_num"
                    for item in item_hit.iter(tag='Hit_num'):
                        _print_xml_item(9, item)
                        row_dict['hit_num'] = int(item.text)

                    # get data of item "Hit_id"
                    for item in item_hit.iter(tag='Hit_id'):
                        _print_xml_item(9, item)
                        row_dict['hit_id'] = item.text

                    # get data of item "Hit_def" (quotation marks and semicolons are changed;
                    # an item without text is saved as None)
                    for item in item_hit.iter(tag='Hit_def'):
                        _print_xml_item(9, item)
                        hit_def = item.text
                        if hit_def is not None:
                            hit_def = hit_def.replace("'", '|').replace(';', ',')
                        row_dict['hit_def'] = hit_def

                    # get data of item "Hit_accession"
                    for item in item_hit.iter(tag='Hit_accession'):
                        _print_xml_item(9, item)
                        row_dict['hit_accession'] = item.text

                    # get items "Hit_hsps"
                    for item_hit_hsps in item_hit.iter(tag='Hit_hsps'):
                        _print_xml_item(9, item_hit_hsps)

                        # get items "Hsp" of the current "Hit_hsps"; iterating the hit here
                        # would insert every Hsp again for each item "Hit_hsps" of the hit
                        for item_hsp in item_hit_hsps.iter(tag='Hsp'):
                            _print_xml_item(11, item_hsp)

                            # initialize hsp data
                            for key, _, default, _ in hsp_fields:
                                row_dict[key] = default

                            # get data of the items "Hsp_num", "Hsp_evalue", "Hsp_identity",
                            # "Hsp_positive", "Hsp_gaps", "Hsp_align-len" and "Hsp_qseq"
                            for key, tag, _, convert in hsp_fields:
                                for item in item_hsp.iter(tag=tag):
                                    _print_xml_item(13, item)
                                    row_dict[key] = item.text if convert is None else convert(item.text)

                            # insert data into table "blast"
                            xsqlite.insert_blast_row(conn, row_dict)
                            inserted_row_counter += 1

            # print iteration counter
            xlib.Message.print('verbose', f'\rIterations: {iteration_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #3
0
ファイル: load-basic-data.py プロジェクト: GGFHF/TOA
def load_table_datasets(conn, dataset_file):
    '''
    Rebuild the table "datasets" from the records of a dataset file.

    The table is dropped and created again, one row is inserted for every data
    record of the file (blank records and records starting with "#" are skipped),
    the index of the table is created and the changes are committed.

    Parameters:
        conn: connection passed to the xsqlite functions; its method "commit" is
            called when all the records are loaded.
        dataset_file: path of the dataset file; it is read with gzip when the
            path ends with ".gz".

    Raises:
        xlib.ProgramException: with code "F002" (gzip file) or "F001" (plain file)
            when the file cannot be opened, and with code "F006" when a data
            record does not match the expected format.
    '''

    # initialize the record counter and the inserted row counter
    record_counter = 0
    inserted_row_counter = 0

    # set the pattern of the data records: four fields between quotation marks separated by semicolons
    # NOTE(review): the fields are read below as dataset_id, dataset_name, repository_id and ftp_adress,
    # while an earlier comment listed repository_id as the first field -- confirm the order against the dataset file
    record_pattern = re.compile(r'^"(.*)";"(.*)";"(.*)";"(.*)"$')

    # drop table "datasets"
    xlib.Message.print('verbose', 'Droping the table "datasets" ...\n')
    xsqlite.drop_datasets(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "datasets"
    xlib.Message.print('verbose', 'Creating the table "datasets" ...\n')
    xsqlite.create_datasets(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of datasets
    if dataset_file.endswith('.gz'):
        try:
            dataset_file_id = gzip.open(dataset_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', dataset_file) from e
    else:
        try:
            dataset_file_id = open(dataset_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', dataset_file) from e

    # the "with" statement closes the file even if the processing of a record raises an exception
    with dataset_file_id:

        # for each record of the file
        for record in dataset_file_id:

            # add 1 to record counter
            record_counter += 1

            # process data records (neither blank records nor comments)
            if record.strip() != '' and not record.lstrip().startswith('#'):

                # extract data
                mo = record_pattern.match(record)
                if mo is None:
                    raise xlib.ProgramException('F006', os.path.basename(dataset_file), record_counter)
                row_dict = {
                    'dataset_id': mo.group(1).strip().lower(),
                    'dataset_name': mo.group(2).strip(),
                    'repository_id': mo.group(3).strip().lower(),
                    'ftp_adress': mo.group(4).strip(),
                }

                # review null values of "ftp_adress"
                if row_dict['ftp_adress'] == '':
                    row_dict['ftp_adress'] = xlib.get_na()

                # insert data into table "datasets"
                xsqlite.insert_datasets_row(conn, row_dict)
                inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of dataset file: {record_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # create the index on the table "datasets"
    xlib.Message.print('verbose', 'Creating the index on the table "datasets" ...\n')
    xsqlite.create_datasets_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #4
0
def _get_gff_attribute_value(attributes, literal, terminators):
    '''
    Return the text that follows "literal" in "attributes", or None when "literal" is not found.

    The value finishes at the first occurrence of any character of "terminators";
    when none of them is found, the value runs to the end of "attributes".
    '''

    # look for the literal that precedes the value
    literal_pos = attributes.find(literal)
    if literal_pos == -1:
        return None
    value_start = literal_pos + len(literal)

    # look for the nearest terminator; the search starts one character into the value,
    # so a value is taken to have at least one character
    found_positions = [
        pos
        for pos in (attributes.find(terminator, value_start + 1) for terminator in terminators)
        if pos > -1
    ]
    value_end = min(found_positions) if found_positions else len(attributes)

    return attributes[value_start:value_end]


def load_genomic_features(conn, species_name, gff_file, gff_format):
    '''
    Load the records of types "gene", "CDS" and "mRNA" of a GFF file into the table "genomic_features".

    The table and its index are created (if they do not exist), the previous rows
    of the species are deleted, one row is inserted for every data record whose
    type is "gene", "CDS" or "mRNA", and the changes are committed.

    Parameters:
        conn: connection passed to the xsqlite functions; its method "commit" is
            called when all the records are loaded.
        species_name: species name saved in every inserted row and used to delete
            the previous rows.
        gff_file: path of the GFF file; it is read with gzip when the path ends
            with ".gz".
        gff_format: format of the GFF file; when it is "GFF3", the first header
            record has to start with "##gff-version 3".

    Raises:
        xlib.ProgramException: with code "F002" (gzip file) or "F001" (plain file)
            when the file cannot be opened, "F005" when the first header record
            of a GFF3 file is not the expected one, "F006" when a data record has
            less than nine fields, and "D001" when "start" or "end" is not an
            integer number.
    '''

    # create table "genomic_features" (if not exists)
    xlib.Message.print('verbose', 'Creating the table "genomic_features" (if it does not exist) ...\n')
    xsqlite.create_genomic_features(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create the index on the table "genomic_features" (if not exists)
    xlib.Message.print('verbose', 'Creating the index on the table "genomic_features" (if it does not exist) ...\n')
    xsqlite.create_genomic_features_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete the rows of the table "genomic_features" corresponding to the species
    xlib.Message.print('verbose', 'Deleting previous rows from the table "genomic_features" ...\n')
    xsqlite.delete_genomic_features_rows(conn, species_name)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # data extracted from "attributes": (row key, literal that precedes the value, characters that finish the value)
    attribute_specs = (
        ('gene_id', 'GeneID:', ',;'),
        ('genbank_id', 'Genbank:', ',;'),
        ('gene', 'gene=', ';'),
        ('protein_id', 'protein_id=', ';'),
        ('transcript_id', 'transcript_id=', ';'),
        ('product', 'product=', ';'),
    )

    # initialize the record counter and the inserted row counter
    record_counter = 0
    inserted_row_counter = 0

    # initialize the first header record control
    first_header_record = True

    # open the GFF file
    if gff_file.endswith('.gz'):
        try:
            gff_file_id = gzip.open(gff_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', gff_file) from e
    else:
        try:
            gff_file_id = open(gff_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', gff_file) from e

    # the "with" statement closes the file even if the processing of a record raises an exception
    with gff_file_id:

        # for each record of the file
        for record in gff_file_id:

            # add 1 to record counter
            record_counter += 1

            # process the header records
            if record.startswith('#'):
                if first_header_record and gff_format == 'GFF3' and not record.startswith('##gff-version 3'):
                    raise xlib.ProgramException('F005', os.path.basename(gff_file), 'GFF3')
                first_header_record = False

            # process data records
            else:

                # extract data
                # record format: seqid\tsource\ttype\tstart\tend\tscore\tstrand\tphase\tattributes
                data_list = [field.strip() for field in record.split('\t')]
                try:
                    row_dict = {
                        'species_name': species_name,
                        'seq_id': data_list[0],
                        'type': data_list[2],
                        'start': data_list[3],
                        'end': data_list[4],
                    }
                    attributes = data_list[8]
                except IndexError as e:
                    raise xlib.ProgramException('F006', os.path.basename(gff_file), record_counter) from e

                # only the types "gene", "CDS" and "mRNA" have to be inserted in the table "genomic_features"
                if row_dict['type'] in ('gene', 'CDS', 'mRNA'):

                    # check "start"
                    try:
                        row_dict['start'] = int(row_dict['start'])
                    except ValueError as e:
                        raise xlib.ProgramException('D001', 'start', os.path.basename(gff_file), record_counter) from e

                    # check "end"
                    try:
                        row_dict['end'] = int(row_dict['end'])
                    except ValueError as e:
                        raise xlib.ProgramException('D001', 'stop', os.path.basename(gff_file), record_counter) from e

                    # get "gene_id", "genbank_id", "gene", "protein_id", "transcript_id" and "product"
                    # data from "attributes"; a value that is not found is set to the value returned by xlib.get_na()
                    for key, literal, terminators in attribute_specs:
                        row_dict[key] = xlib.get_na()
                        value = _get_gff_attribute_value(attributes, literal, terminators)
                        if value is not None:
                            row_dict[key] = value

                    # change quotation marks, semicolons and %2C in "product"
                    row_dict['product'] = row_dict['product'].replace("'", '|').replace(';', ',').replace('%2C', ',')

                    # insert data into table "genomic_features"
                    xsqlite.insert_genomic_features_row(conn, row_dict)
                    inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of GFF file: {record_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #5
0
ファイル: load-go-data.py プロジェクト: GGFHF/TOA
def _load_go_mapping_file(conn, mapping_file, external_db, id_prefix,
                          has_description):
    '''
    Insert the data records of one "<external_db>2go" mapping file into the
    table "go_cross_references".

    Records starting with "!" are counted but not loaded. Every other record
    has to follow the format "external_id > go_term ; go_id" or, when
    has_description is True, "external_id external_desc > go_term ; go_id".

    Parameters:
        conn: connection passed through to the xsqlite insert function.
        mapping_file: path of the mapping file; a ".gz" suffix selects gzip.
        external_db: value stored in "external_db"; it also labels the
            progress message.
        id_prefix: database prefix (e.g. "EC:") removed from "external_id".
        has_description: if True, the text between the first space and ">" is
            stored in "external_desc"; otherwise xlib.get_na() is stored.

    Raises:
        xlib.ProgramException: "F002"/"F001" if the file cannot be opened and
            "F006" if a data record does not have the expected separators.
    '''

    # initialize the row data dictionary (it is reused for every record)
    row_dict = {}
    row_dict['external_db'] = external_db
    if not has_description:
        row_dict['external_desc'] = xlib.get_na()

    # open the mapping file
    if mapping_file.endswith('.gz'):
        try:
            mapping_file_id = gzip.open(mapping_file,
                                        mode='rt',
                                        encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', mapping_file)
    else:
        try:
            mapping_file_id = open(mapping_file,
                                   mode='r',
                                   encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', mapping_file)

    # initialize the record counter and the inserted row counter
    record_counter = 0
    inserted_row_counter = 0

    # the "with" statement closes the file even if a record is rejected or an
    # insertion fails
    with mapping_file_id:

        for record in mapping_file_id:

            # add 1 to record counter
            record_counter += 1

            # comment records are not loaded
            if record.startswith('!'):
                continue

            # locate the separators and check that they appear in order
            gt_position = record.find('>')
            semicolon_position = record.find(';')
            malformed = (gt_position == -1 or semicolon_position == -1
                         or gt_position > semicolon_position)
            if has_description:
                first_space_position = record.find(' ')
                malformed = (malformed or first_space_position == -1
                             or first_space_position > gt_position)
            if malformed:
                raise xlib.ProgramException('F006',
                                            os.path.basename(mapping_file),
                                            record_counter)

            # extract data
            # NOTE(review): only the first ";" is searched, so a ";" inside
            # the description is rejected as "F006" before the replacement
            # below can act on it -- confirm that this is intended.
            if has_description:
                row_dict['external_id'] = record[:first_space_position].strip()
                row_dict['external_desc'] = record[first_space_position +
                                                   1:gt_position].strip()
            else:
                row_dict['external_id'] = record[:gt_position].strip()
            row_dict['go_term'] = record[gt_position +
                                         1:semicolon_position].strip()
            row_dict['go_id'] = record[semicolon_position + 1:].strip()

            # remove database name from text
            row_dict['go_id'] = row_dict['go_id'].replace('GO:', '')
            row_dict['go_term'] = row_dict['go_term'].replace('GO:', '')
            row_dict['external_id'] = row_dict['external_id'].replace(
                id_prefix, '')

            # change quotation marks and semicolons in the free-text columns
            row_dict['go_term'] = row_dict['go_term'].replace(
                "'", '|').replace(';', ',')
            if has_description:
                row_dict['external_desc'] = row_dict['external_desc'].replace(
                    "'", '|').replace(';', ',')

            # insert data into table "go_cross_references"
            xsqlite.insert_go_cross_references_row(conn, row_dict)
            inserted_row_counter += 1

            # print record counter
            xlib.Message.print(
                'verbose',
                f'\r{external_db}2go file: {record_counter} processed records - Inserted rows: {inserted_row_counter}'
            )

        xlib.Message.print('verbose', '\n')


def load_table_go_cross_references(conn, ec2go_file, kegg2go_file,
                                   metacyc2go_file, interpro2go_file):
    '''
    Rebuild the table "go_cross_references" from the ec2go, kegg2go,
    metacyc2go and interpro2go mapping files.

    The table is dropped and created again, the four files are loaded in that
    order, the index on the table is created and the changes are committed.

    Parameters:
        conn: connection to the TOA database; it is passed to the xsqlite
            functions and committed at the end.
        ec2go_file, kegg2go_file, metacyc2go_file, interpro2go_file: paths of
            the mapping files (plain text or ".gz").

    Raises:
        xlib.ProgramException: if a file cannot be opened or a data record
            does not have the expected format.
    '''

    # drop table "go_cross_references" (if it exists)
    xlib.Message.print('verbose',
                       'Droping the table "go_cross_references" ...\n')
    xsqlite.drop_go_cross_references(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "go_cross_references"
    xlib.Message.print('verbose',
                       'Creating the table "go_cross_references" ...\n')
    xsqlite.create_go_cross_references(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # load the mapping files: (file, external database, identifier prefix,
    # whether the records carry a description of the external identifier)
    mapping_files = (
        (ec2go_file, 'ec', 'EC:', False),
        (kegg2go_file, 'kegg', 'KEGG:', False),
        (metacyc2go_file, 'metacyc', 'MetaCyc:', False),
        (interpro2go_file, 'interpro', 'InterPro:', True),
    )
    for mapping_file, external_db, id_prefix, has_description in mapping_files:
        _load_go_mapping_file(conn, mapping_file, external_db, id_prefix,
                              has_description)

    # create the index on the table "go_cross_references"
    xlib.Message.print(
        'verbose',
        'Creating the index on the table "go_cross_references" ...\n')
    xsqlite.create_go_cross_references_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #6
0
ファイル: load-go-data.py プロジェクト: GGFHF/TOA
def load_table_go_ontology(conn, ontology_file):
    '''
    Rebuild the table "go_ontology" from the "[Term]" stanzas of an ontology
    file.

    The table is dropped and created again. The records before the first
    "[Term]" line are skipped as header. For every "[Term]" stanza one row is
    inserted with the values of its "id:", "name:" and "namespace:" records,
    followed by one row per "alt_id:" record that carries the same name and
    namespace. Reading stops at the end of the file or when a "[Typedef]" line
    follows a term detail record. Finally the index on the table is created
    and the changes are committed.

    Parameters:
        conn: connection to the TOA database; it is passed to the xsqlite
            functions and committed at the end.
        ontology_file: path of the ontology file (plain text or ".gz").

    Raises:
        xlib.ProgramException: "F002"/"F001" if the file cannot be opened.
    '''

    # drop table "go_ontology" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "go_ontology" ...\n')
    xsqlite.drop_go_ontology(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "go_ontology"
    xlib.Message.print('verbose', 'Creating the table "go_ontology" ...\n')
    xsqlite.create_go_ontology(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the ontology file
    if ontology_file.endswith('.gz'):
        try:
            ontology_file_id = gzip.open(ontology_file,
                                         mode='rt',
                                         encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', ontology_file)
    else:
        try:
            ontology_file_id = open(ontology_file,
                                    mode='r',
                                    encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', ontology_file)

    # initialize the record counter and the inserted row counter
    record_counter = 0
    inserted_row_counter = 0

    def print_progress():
        # print the current values of both counters on the same console line
        xlib.Message.print(
            'verbose',
            f'\rOntology file: {record_counter} processed records - Inserted rows: {inserted_row_counter}'
        )

    # the "with" statement closes the file even if an insertion fails
    with ontology_file_id:

        # read the first record
        record = ontology_file_id.readline()

        # skip the header, i.e. every record before the first "[Term]" line
        while record != '' and not record.startswith('[Term]'):
            record_counter += 1
            print_progress()
            record = ontology_file_id.readline()

        # process the term stanzas; at the top of each iteration "record" is a
        # "[Term]" line, and the loop ends on end of file ('') or "[Typedef]"
        while record.startswith('[Term]'):

            # count the "[Term]" line itself
            record_counter += 1
            print_progress()

            # read the first detail record of the stanza
            record = ontology_file_id.readline()

            # initialize the row dictionary
            row_dict = {}
            row_dict['go_id'] = ''
            row_dict['go_name'] = ''
            row_dict['namespace'] = ''
            alt_id_list = []

            # while there are records and they are term details
            while record != '' and not record.startswith('[Term]'):

                # add 1 to record counter
                record_counter += 1

                # get the GO identification (the "GO:" prefix is dropped)
                if record.startswith('id:'):
                    row_dict['go_id'] = record[len('id: GO:'):].strip()

                # get the GO name, changing quotation marks and semicolons
                elif record.startswith('name:'):
                    go_name = record[len('name:'):].strip()
                    row_dict['go_name'] = go_name.replace("'", '|').replace(
                        ';', ',')

                # get the namespace, changing quotation marks, semicolons and
                # underscores
                elif record.startswith('namespace:'):
                    namespace = record[len('namespace:'):].strip()
                    row_dict['namespace'] = namespace.replace(
                        "'", '|').replace(';', ',').replace('_', ' ')

                # get an alternative identification (the "GO:" prefix is dropped)
                elif record.startswith('alt_id:'):
                    alt_id_list.append(record[len('alt_id: GO:'):].strip())

                print_progress()

                # read the next record
                record = ontology_file_id.readline()

                # stop reading details when typedef sections start
                if record.startswith('[Typedef]'):
                    break

            # insert the term and one extra row per alternative identification
            xsqlite.insert_go_ontology_row(conn, row_dict)
            inserted_row_counter += 1
            for alt_id in alt_id_list:
                row_dict['go_id'] = alt_id
                xsqlite.insert_go_ontology_row(conn, row_dict)
                inserted_row_counter += 1

            print_progress()

        xlib.Message.print('verbose', '\n')

    # create the index on the table "go_ontology"
    xlib.Message.print('verbose',
                       'Creating the index on the table "go_ontology" ...\n')
    xsqlite.create_go_ontology_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #7
0
def load_table_plaza_interpro(conn, dataset_id, species_id, interpro_file,
                              plaza_species_id_list):
    '''
    Load the InterPro data of a PLAZA dataset into the table "plaza_interpro".

    The table and its index are created if they do not exist and the rows
    previously loaded for the dataset and species are deleted. Then every
    record of the file is parsed according to the dataset:

        - "gymno_01": the first record is a header and is skipped; data
          records are
          "id";"motif_id";"species";"gene_id";"start";"stop";"score";"comment";"desc"
        - "dicots_04" and "monocots_04": records starting with "#" are
          skipped; data records are
          gene_id\\tspecies\\tmotif_id\\tdescription\\tstart\\tstop\\tscore\\tcomment

    A row is inserted for every non-comment record whose species field is not
    empty. Finally the changes are committed.

    Parameters:
        conn: connection to the TOA database; it is passed to the xsqlite
            functions and committed at the end.
        dataset_id: identification of the PLAZA dataset.
        species_id: species identification used to delete previous rows.
        interpro_file: path of the InterPro file (plain text or ".gz").
        plaza_species_id_list: accepted values of the species field.

    Raises:
        xlib.ProgramException: "F002"/"F001" if the file cannot be opened,
            "F006" if a record has too few fields, "L002" if the species is
            not in plaza_species_id_list, "D001"/"D002" if "start", "stop" or
            "score" are not numeric.
        KeyError: if a non-comment record is read for a dataset_id other than
            the three listed above (no field is extracted for it).
    '''

    # create table "plaza_interpro" (if not exists)
    xlib.Message.print(
        'verbose',
        'Creating the table "plaza_interpro" (if it does not exist) ...\n')
    xsqlite.create_plaza_interpro(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create index "plaza_interpro_index" with columns "dataset_id" and "gene_id"  (if not exists)
    xlib.Message.print(
        'verbose',
        'Creating the index on the table "plaza_interpro" (if it does not exist) ...\n'
    )
    xsqlite.create_plaza_interpro_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete rows from table "plaza_interpro" corresponding to the dataset and species identifications
    xlib.Message.print(
        'verbose',
        'Deleting previous rows from the table "plaza_interpro" ...\n')
    xsqlite.delete_plaza_interpro_rows(conn, dataset_id, species_id)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # open the InterPro file
    if interpro_file.endswith('.gz'):
        try:
            interpro_file_id = gzip.open(interpro_file,
                                         mode='rt',
                                         encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', interpro_file)
    else:
        try:
            interpro_file_id = open(interpro_file,
                                    mode='r',
                                    encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', interpro_file)

    # initialize the record counter and the inserted row counter
    record_counter = 0
    inserted_row_counter = 0

    # initialize the header record control
    header_record = True

    # the "with" statement closes the file even if a record is rejected
    with interpro_file_id:

        for record in interpro_file_id:

            # add 1 to record counter
            record_counter += 1

            # skip the header record of Gymno PLAZA 1.0
            if dataset_id in ['gymno_01'] and header_record:
                header_record = False
                continue

            is_comment = record.startswith('#')

            # initialize the row data dictionary
            row_dict = {}
            row_dict['dataset_id'] = dataset_id

            # extract data Gymno PLAZA 1.0
            if dataset_id in ['gymno_01']:
                # record format: "id";"motif_id";"species";"gene_id";"start";"stop";"score";"comment";"desc"
                data_list = [
                    field.strip('"')
                    for field in record.strip('\n').split(';')
                ]
                if len(data_list) < 9:
                    raise xlib.ProgramException(
                        'F006', os.path.basename(interpro_file),
                        record_counter)
                row_dict['id'] = data_list[0]
                row_dict['motif_id'] = data_list[1]
                row_dict['plaza_species_id'] = data_list[2]
                row_dict['gene_id'] = data_list[3]
                row_dict['start'] = data_list[4]
                row_dict['stop'] = data_list[5]
                row_dict['score'] = data_list[6]
                comment = data_list[7]
                row_dict['desc'] = data_list[8]

            # extract data Dicots PLAZA 4.0 and Monocots PLAZA 4.0 (for non-comment records)
            elif not is_comment and dataset_id in ['dicots_04', 'monocots_04']:
                # record format: gene_id\tspecies\tmotif_id\tdescription\tstart\tstop\tscore\tcomment
                data_list = [field.strip() for field in record.split('\t')]
                if len(data_list) < 8:
                    raise xlib.ProgramException(
                        'F006', os.path.basename(interpro_file),
                        record_counter)
                row_dict['gene_id'] = data_list[0]
                row_dict['plaza_species_id'] = data_list[1]
                row_dict['motif_id'] = data_list[2]
                row_dict['desc'] = data_list[3]
                row_dict['start'] = data_list[4]
                row_dict['stop'] = data_list[5]
                row_dict['score'] = data_list[6]
                comment = data_list[7]
                row_dict['id'] = 0

            # if PLAZA species identification has value not null (for non-comment records)
            if not is_comment and row_dict['plaza_species_id'] != '':

                # check plaza_species_id
                if row_dict['plaza_species_id'] not in plaza_species_id_list:
                    raise xlib.ProgramException(
                        'L002', 'species', os.path.basename(interpro_file),
                        record_counter)

                # check "start"
                try:
                    row_dict['start'] = int(row_dict['start'])
                except ValueError:
                    raise xlib.ProgramException(
                        'D001', 'start', os.path.basename(interpro_file),
                        record_counter)

                # check "stop"
                try:
                    row_dict['stop'] = int(row_dict['stop'])
                except ValueError:
                    raise xlib.ProgramException(
                        'D001', 'stop', os.path.basename(interpro_file),
                        record_counter)

                # check "score"
                try:
                    row_dict['score'] = float(row_dict['score'])
                except ValueError:
                    raise xlib.ProgramException(
                        'D002', 'score', os.path.basename(interpro_file),
                        record_counter)

                # split "comment" in "source" and "domain_id"
                # "comment" format: source=x,domainId=x
                source_literal = 'source='
                domain_literal = ',domainId='
                source_pos = comment.find(source_literal)
                if source_pos >= 0:
                    source_begin = source_pos + len(source_literal)
                    domain_pos = comment.find(domain_literal)
                    if domain_pos >= 0:
                        row_dict['source'] = comment[
                            source_begin:domain_pos].strip()
                        row_dict['domain_id'] = comment[
                            domain_pos + len(domain_literal):].strip()
                    else:
                        # without ",domainId=" the position is -1 and must
                        # not be used as a slice index: the source extends to
                        # the end of the comment and there is no domain
                        row_dict['source'] = comment[source_begin:].strip()
                        row_dict['domain_id'] = xlib.get_na()
                else:
                    row_dict['source'] = xlib.get_na()
                    row_dict['domain_id'] = xlib.get_na()

                # change quotation marks and semicolons in "desc"
                row_dict['desc'] = row_dict['desc'].replace("'", '|').replace(
                    ';', ',')

                # insert data into table "plaza_interpro"
                xsqlite.insert_plaza_interpro_row(conn, row_dict)
                inserted_row_counter += 1

            # print record counter
            xlib.Message.print(
                'verbose',
                f'\rProcessed records of InterPro file: {record_counter} - Inserted rows: {inserted_row_counter}'
            )

        xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #8
0
def _open_sequence_text_file(file_path, mode, plain_error_code, gz_error_code):
    '''
    Open a text file encoded in ISO-8859-1 for reading ("r") or writing ("w").

    Files whose name ends with ".gz" are opened through gzip in text mode.
    Output files are written with "\\n" line endings.  Any failure while opening
    is reported as a xlib.ProgramException built with "gz_error_code" (gzip
    files) or "plain_error_code" (other files) and the file path.
    '''

    # the check is done outside the "try" so that a wrong argument type is not
    # reported as a file opening error
    is_gz = file_path.endswith('.gz')

    # only the output files force the line ending
    newline = '\n' if mode == 'w' else None

    try:
        if is_gz:
            return gzip.open(file_path,
                             mode=f'{mode}t',
                             encoding='iso-8859-1',
                             newline=newline)
        return open(file_path,
                    mode=mode,
                    encoding='iso-8859-1',
                    newline=newline)
    except Exception as e:
        error_code = gz_error_code if is_gz else plain_error_code
        raise xlib.ProgramException(e, error_code, file_path) from e


def _get_gene_from_gff_attributes(attributes, default):
    '''
    Get the value of the "gene=" item of a GFF attributes column.

    The value runs up to the next ";" or, when "gene=" is the last item, up to
    the end of the column.  "default" is returned when there is no "gene="
    item.
    '''

    literal = 'gene='
    literal_pos = attributes.find(literal)
    if literal_pos == -1:
        return default

    value_start = literal_pos + len(literal)
    value_end = attributes.find(';', value_start)

    # without this check the slice would end at -1 and lose the last character
    if value_end == -1:
        return attributes[value_start:]
    return attributes[value_start:value_end]


def extract_gff_rnas(gff_file, gff_format, genome_file, rna_file, tvi_list):
    '''
    Extract RNA sequences from a GFF file and its corresponding genome FASTA file.

    The GFF records of type "mRNA" are collected per seq_id (keyed by
    "start-end", so a later record with the same coordinates replaces the
    earlier one).  Then the genome FASTA file is read and, for every genomic
    sequence having collected records, the subsequences are written to the
    output FASTA file.

    Parameters:
        gff_file: path of the input GFF file (gzip is used if it ends with ".gz").
        gff_format: not used by this function.
        genome_file: path of the genome FASTA file (gzip is used if it ends with ".gz").
        rna_file: path of the output FASTA file (gzip is used if it ends with ".gz").
        tvi_list: seq_ids whose collected RNA data are printed with the "trace" level.

    Raises:
        xlib.ProgramException: with code F001/F002 (input files cannot be
            opened), F003/F004 (output file cannot be opened), F009 (a GFF data
            record cannot be parsed) or F006 (the genome file is not FASTA).
    '''

    # initialize RNA sequences per seq_id dictionary
    rna_seq_id_dict = {}

    # initialize counters
    record_counter = 0
    rna_counter = 0

    # read the input GFF file (it is closed even if a record is wrong)
    with _open_sequence_text_file(gff_file, 'r', 'F001', 'F002') as gff_file_id:

        for record in gff_file_id:

            # add 1 to record counter
            record_counter += 1

            # process data records (comments and blank lines are ignored)
            if not record.startswith('#') and record.strip() != '':

                # extract data
                # record format: seq_id\tsource\ttype\tstart\tend\tscore\tstrand\tphase\tattributes
                data_list = [field.strip() for field in record.split('\t')]
                try:
                    seq_id = data_list[0]
                    feature_type = data_list[2]
                    start = int(data_list[3])
                    end = int(data_list[4])
                    attributes = data_list[8]
                except (IndexError, ValueError) as e:
                    raise xlib.ProgramException(e, 'F009',
                                                os.path.basename(gff_file),
                                                record_counter) from e

                # only the type "mRNA" is considered
                if feature_type == 'mRNA':

                    # add 1 to RNA counter
                    rna_counter += 1

                    # get "gene" data from "attributes"
                    gene = _get_gene_from_gff_attributes(attributes,
                                                         xlib.get_na())

                    # add RNA sequence to RNA sequences per seq_id dictionary
                    rna_seq_id_dict.setdefault(seq_id, {})[f'{start}-{end}'] = {
                        'start': start,
                        'end': end,
                        'gene': gene
                    }

            # print record counter
            xlib.Message.print(
                'verbose',
                f'\rGFF file records... {record_counter:8d} - RNA seqs... {rna_counter:8d}'
            )

    xlib.Message.print('verbose', '\n')

    for x in tvi_list:
        xlib.Message.print('trace',
                           f'RNA seq in {x}: {rna_seq_id_dict.get(x, {})}')

    # initialize record counters
    genomic_seq_counter = 0
    rna_seq_counter = 0

    # open the genome file and the output FASTA file with the RNA sequences
    # (the genome file is closed if the output file cannot be opened)
    with _open_sequence_text_file(genome_file, 'r', 'F001', 'F002') as genome_file_id, \
            _open_sequence_text_file(rna_file, 'w', 'F003', 'F004') as rna_file_id:

        # read the first record of genome file
        record = genome_file_id.readline()

        # while there are records in genome file
        while record != '':

            # control the FASTA format: a head record is expected here
            if not record.startswith('>'):
                # NOTE(review): the other calls in this function pass the
                # originating exception as first argument; there is none here,
                # so an empty placeholder is assumed -- confirm it against
                # xlib.ProgramException
                raise xlib.ProgramException('', 'F006', genome_file, 'FASTA')

            # add 1 to the read sequence counter
            genomic_seq_counter += 1

            # extract the identification (text between ">" and the first space)
            head = record[1:]
            space_pos = head.find(' ')
            if space_pos > -1:
                genomic_seq_id = head[:space_pos]
            else:
                genomic_seq_id = head.strip('\n')

            # read the sequence records up to the next head record
            seq_chunk_list = []
            record = genome_file_id.readline()
            while record != '' and not record.startswith('>'):
                seq_chunk_list.append(record.strip())
                record = genome_file_id.readline()

            # get RNA sequences corresponding to this genomic sequence
            rna_dict = rna_seq_id_dict.get(genomic_seq_id, {})

            # if there are RNAs corresponding to this genomic sequence
            if rna_dict:

                # build the genomic sequence only when it is needed
                seq = ''.join(seq_chunk_list)

                # for each RNA
                for rna_data in rna_dict.values():

                    # get the RNA data
                    start = rna_data['start']
                    end = rna_data['end']
                    gene = rna_data['gene']

                    # write the identification record
                    rna_file_id.write(
                        f'>seq_id: {genomic_seq_id} - start: {start} - end: {end} - gene: {gene}\n'
                    )

                    # write the sequence (start and end have 1-base offset in GFF file)
                    rna_file_id.write(f'{seq[start - 1:end]}\n')

                    # add 1 to the RNA sequence counter
                    rna_seq_counter += 1

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rGenome seqs... {genomic_seq_counter:8d} - RNA seqs... {rna_seq_counter:8d}'
            )

    # print OK message
    xlib.Message.print(
        'verbose',
        f'\nThe file {os.path.basename(rna_file)} containing FASTA RNA sequences in cDNA format cDNA is created.'
    )