コード例 #1
0
ファイル: g2g.py プロジェクト: juanfmacias3/g2gtools
def gtf2chain(chain_file, input_file, output_file, chain_genes=False):
    """
    Build a chain file from a GTF file and an existing chain file.

    The GTF file is first loaded into a temporary database with ``gtf2db``;
    ``db2chain`` then combines that database with ``chain_file`` to produce
    ``output_file``.  The temporary database file is named by
    ``g2g_fu.gen_file_name`` (which is given the output file's directory)
    and is deleted again when the conversion finishes or fails.

    :param chain_file: the existing chain file to read
    :param input_file: the GTF file to convert
    :param output_file: the chain file to write
    :param chain_genes: passed through unchanged to ``db2chain``
    :return: None
    """
    start = time.time()

    chain_file = g2g_fu.check_file(chain_file)
    input_file = g2g_fu.check_file(input_file)
    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("FROM CHAIN FILE: {0}".format(chain_file))
    LOG.info("TO CHAIN FILE: {0}".format(output_file))

    temp_db = g2g_fu.gen_file_name("_g2gtempfile", output_file_dir, ".db3")

    try:
        gtf2db(input_file, temp_db)
        db2chain(chain_file, temp_db, output_file, chain_genes)
    finally:
        # Remove the intermediate database even when a step above raised,
        # so failed runs do not leave stray temp files behind.
        g2g_fu.delete_file(temp_db)

    # Report the elapsed time only once the work has actually been done.
    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
コード例 #2
0
ファイル: g2g.py プロジェクト: churchill-lab/g2gtools
def gtf2chain(chain_file, input_file, output_file, chain_genes=False):
    """
    Convert a GTF file plus an existing chain file into a new chain file.

    Works in two steps: ``gtf2db`` loads the GTF file into a temporary
    database, then ``db2chain`` reads that database together with
    ``chain_file`` and writes ``output_file``.  The temporary database gets
    its name from ``g2g_fu.gen_file_name`` (called with the output file's
    directory) and is always deleted afterwards.

    :param chain_file: the existing chain file to read
    :param input_file: the GTF file to convert
    :param output_file: the chain file to write
    :param chain_genes: forwarded as-is to ``db2chain``
    :return: None
    """
    start = time.time()

    chain_file = g2g_fu.check_file(chain_file)
    input_file = g2g_fu.check_file(input_file)
    output_file = g2g_fu.check_file(output_file, 'w')

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("FROM CHAIN FILE: {0}".format(chain_file))
    LOG.info("TO CHAIN FILE: {0}".format(output_file))

    temp_db = g2g_fu.gen_file_name("_g2gtempfile",
                                   os.path.dirname(output_file),
                                   ".db3")

    try:
        gtf2db(input_file, temp_db)
        db2chain(chain_file, temp_db, output_file, chain_genes)
    finally:
        # Clean up the intermediate database on success and on failure.
        g2g_fu.delete_file(temp_db)

    # The completion message belongs here, after the conversion, not at the
    # top of the function where no work has happened yet.
    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
コード例 #3
0
ファイル: g2g.py プロジェクト: churchill-lab/g2gtools
def offset2chain(from_file, to_file, output_file):
    """
    Convert Seqnature offset files to new chain file.

    Every chromosome returned by ``offset_parse_chromosomes`` that has a
    ``file_path`` is handed to ``offset_chromosome_to_chain`` together with
    the output path; chromosomes without one are skipped.

    :param from_file: from Chromosome File (see docs)
    :param to_file: to Chromosome File (see docs)
    :param output_file: the output chain file
    :raises G2GChainFileError: if any error occurs while generating the
        chain file (the underlying error is logged at debug level)
    """
    start = time.time()

    from_file = g2g_fu.check_file(from_file)
    to_file = g2g_fu.check_file(to_file)

    output_file_name = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file_name)

    LOG.info("FROM FILE: {0}".format(from_file))
    LOG.info("TO FILE: {0}".format(to_file))
    LOG.info("CHAIN FILE: {0}".format(output_file_name))

    LOG.info("Generating chain file...")

    try:
        chromosomes = offset_parse_chromosomes(from_file, to_file)

        # Only the values are needed; the dictionary keys are not used.
        for chromosome in chromosomes.values():
            LOG.debug('Examining chromosome: {0}'.format(chromosome))
            if chromosome['file_path']:
                # Write to the checked path, i.e. the same file that was
                # deleted and logged above.
                offset_chromosome_to_chain(chromosome, output_file_name)
            else:
                LOG.debug("No file for {0}, so skipping".format(chromosome))

        LOG.info("Chain file created")

    except Exception as e:
        # Keep a record of the real cause before replacing it with the
        # generic error that callers see.
        LOG.debug("Chain file generation failed: {0}".format(e))
        raise G2GChainFileError("Unable to generate chain file")

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
コード例 #4
0
ファイル: g2g.py プロジェクト: juanfmacias3/g2gtools
def offset2chain(from_file, to_file, output_file):
    """
    Convert Seqnature offset files to new chain file.

    ``offset_parse_chromosomes`` is called with the two input files; each
    chromosome it returns that carries a ``file_path`` is passed to
    ``offset_chromosome_to_chain`` along with the output path.  Chromosomes
    with no ``file_path`` are skipped.

    :param from_file: from Chromosome File (see docs)
    :param to_file: to Chromosome File (see docs)
    :param output_file: the output chain file
    :raises G2GChainFileError: if any error occurs while generating the
        chain file (the underlying error is logged at debug level)
    """
    start = time.time()

    from_file = g2g_fu.check_file(from_file)
    to_file = g2g_fu.check_file(to_file)

    output_file_name = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file_name)

    LOG.info("FROM FILE: {0}".format(from_file))
    LOG.info("TO FILE: {0}".format(to_file))
    LOG.info("CHAIN FILE: {0}".format(output_file_name))

    LOG.info("Generating chain file...")

    try:
        chromosomes = offset_parse_chromosomes(from_file, to_file)

        # The keys of the mapping are not needed, only the chromosomes.
        for chromosome in chromosomes.values():
            LOG.debug('Examining chromosome: {0}'.format(chromosome))

            if not chromosome['file_path']:
                LOG.debug("No file for {0}, so skipping".format(chromosome))
                continue

            # Use the checked path so the file written is the one that was
            # deleted and logged above.
            offset_chromosome_to_chain(chromosome, output_file_name)

        LOG.info("Chain file created")

    except Exception as e:
        # Log the underlying cause; callers only see the generic error.
        LOG.debug("Chain file generation failed: {0}".format(e))
        raise G2GChainFileError("Unable to generate chain file")

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
コード例 #5
0
    for seqid in tb.contigs:
        processed_seqids[seqid] = False

    left = VCFtoChainInfo()
    right = VCFtoChainInfo()

    chain_info = {}

    if diploid:
        left.output_file = g2g_fu.prepend_before_extension(output_file, 'left')
        right.output_file = g2g_fu.prepend_before_extension(
            output_file, 'right')
        chain_info['left'] = left
        chain_info['right'] = right

        g2g_fu.delete_file(left.output_file)
        g2g_fu.delete_file(right.output_file)
    else:
        left.output_file = output_file
        chain_info['left'] = left

        g2g_fu.delete_file(left.output_file)

    try:
        all_chrom = [c for c in fasta_file.references]
        all_chrom_length = [n for n in fasta_file.lengths]
        all_vcffiles = [input_file] * len(all_chrom)
        all_sample_index = [sample_index] * len(all_chrom)
        all_chain_info = [chain_info] * len(all_chrom)
        all_diploid = [diploid] * len(all_chrom)
        all_passed = [passed] * len(all_chrom)
コード例 #6
0
ファイル: g2g.py プロジェクト: churchill-lab/g2gtools
def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """
    Apply the indels described by a chain file to FASTA sequences and write
    the resulting sequences to a new FASTA file.

    The chain file is read once and grouped per chromosome (header values
    plus the data lines that follow).  For every requested location the
    chromosome's chain lines are replayed: each fragment is fetched from the
    source FASTA, inserted bases are appended and deleted bases are skipped.
    The sequence is written when the final (single-value) chain line is
    reached.

    :param fasta_file: a ``FastaFile`` instance, or a file name that is
        checked with ``g2g_fu.check_file`` and opened as a ``FastaFile``
    :param chain_file: a ``ChainIter`` instance, or a file name that is
        checked and wrapped in ``ChainIter(chain_file, reverse=reverse)``
    :param locations: ``Location`` objects and/or strings accepted by
        ``parse_location``; when empty, one location spanning each reference
        in the FASTA file is generated
    :param output_file: the FASTA file to write; with ``bgzip`` the
        compressed file carries a ``gz`` extension and the uncompressed
        text is written to the same name without the trailing ``.gz``
    :param bgzip: when true, run ``g2g_fu.bgzip_index`` on the written file
    :param reverse: swap the dt/dq columns of every chain line (and passed
        to ``ChainIter`` when it is constructed here)
    :return: None
    :raises G2GLocationError: re-raised unchanged after being logged
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    if locations:
        LOG.debug("Have locations")
        locations = [loc if isinstance(loc, Location) else parse_location(loc) for loc in locations]
    else:
        LOG.debug("Calculating locations")
        locations = [parse_location("{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1) for a in fasta.references]

    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            # output already ends in ".gz": write the plain text next to it
            temp_output_file = temp_output_file[:-3]

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # The with-block guarantees the FASTA text is flushed and closed
        # before it is handed to bgzip_index below (and on any error).
        with open(temp_output_file, "w") as fasta_out:
            # will need a better way, but for now...
            LOG.info("Parsing chain file...")
            for line in chain_file:
                chrom = chain_file.current_chain_header[1]
                if len(line) > 7:
                    # chain header line: start a new per-chromosome record
                    LOG.debug("Adding chromosome {0}".format(chrom))
                    chr_info[chrom] = {'from_size': line[2], 'from_start': line[4], 'from_end': line[5],
                                       'to_size': line[7], 'to_start': line[9], 'to_end': line[10],
                                       'header_chain': chain_file.current_chain_header, 'lines': []}
                else:
                    chr_info[chrom]['lines'].append(line)

            LOG.info("Chain file parsed")

            # running totals across all locations; used for debug output only
            insertion_bases = 0
            deletion_bases = 0

            for location in locations:
                LOG.info("Processing chromosome={0}".format(location.seqid))
                LOG.debug("Location: {0}".format(location))

                info = chr_info[location.seqid]
                chrom_size_to = info['to_size']

                last_pos = info['from_start']
                new_sequence = StringIO()
                chain_file.reset()

                for chain_line in info['lines']:
                    LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no, chain_line))

                    fragment = chain_line[0]

                    if len(chain_line) == 1:
                        # last line: append the final fragment and emit the record
                        partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                        new_sequence.write(str(partial_seq))

                        sequence = new_sequence.getvalue()

                        if len(sequence) < chrom_size_to:
                            LOG.warn("Length's do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(sequence)))

                        fasta_out.write(">{0} {1}:{2}-{3}\n".format(location.seqid, location.seqid, info['from_start'] + 1, chrom_size_to))

                        for wrapped in wrap_sequence(sequence):
                            fasta_out.write(wrapped.strip())
                            fasta_out.write('\n')

                        break

                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(location.seqid, last_pos, last_pos + fragment, len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(same, partial_seq[-(len(same)):]))

                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))

                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))

        # bgzip and index -- only after the output file has been closed;
        # bgzip_index is given the file path, so it presumably reads the
        # file back from disk
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        # bare raise keeps the original traceback
        raise

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
コード例 #7
0
ファイル: g2g.py プロジェクト: juanfmacias3/g2gtools
def fasta_transform(fasta_file,
                    chain_file,
                    locations,
                    output_file,
                    bgzip=False,
                    reverse=False):
    """
    Apply the indels described by a chain file to FASTA sequences and write
    the resulting sequences to a new FASTA file.

    The chain file is read once and grouped per chromosome (header values
    plus the data lines that follow).  For every requested location the
    chromosome's chain lines are replayed: each fragment is fetched from the
    source FASTA, inserted bases are appended and deleted bases are skipped.
    The sequence is written when the final (single-value) chain line is
    reached.

    :param fasta_file: a ``FastaFile`` instance, or a file name that is
        checked with ``g2g_fu.check_file`` and opened as a ``FastaFile``
    :param chain_file: a ``ChainIter`` instance, or a file name that is
        checked and wrapped in ``ChainIter(chain_file, reverse=reverse)``
    :param locations: ``Location`` objects and/or strings accepted by
        ``parse_location``; when empty, one location spanning each reference
        in the FASTA file is generated
    :param output_file: the FASTA file to write; with ``bgzip`` the
        compressed file carries a ``gz`` extension and the uncompressed
        text is written to the same name without the trailing ``.gz``
    :param bgzip: when true, run ``g2g_fu.bgzip_index`` on the written file
    :param reverse: swap the dt/dq columns of every chain line (and passed
        to ``ChainIter`` when it is constructed here)
    :return: None
    :raises G2GLocationError: re-raised unchanged after being logged
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    if locations:
        LOG.debug("Have locations")
        locations = [
            loc if isinstance(loc, Location) else parse_location(loc)
            for loc in locations
        ]
    else:
        LOG.debug("Calculating locations")
        locations = [
            parse_location(
                "{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1)
            for a in fasta.references
        ]

    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            # output already ends in ".gz": write the plain text next to it
            temp_output_file = temp_output_file[:-3]

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # The with-block guarantees the FASTA text is flushed and closed
        # before it is handed to bgzip_index below (and on any error).
        with open(temp_output_file, "w") as fasta_out:
            # will need a better way, but for now...
            LOG.info("Parsing chain file...")
            for line in chain_file:
                chrom = chain_file.current_chain_header[1]
                if len(line) > 7:
                    # chain header line: start a new per-chromosome record
                    LOG.debug("Adding chromosome {0}".format(chrom))
                    chr_info[chrom] = {
                        'from_size': line[2],
                        'from_start': line[4],
                        'from_end': line[5],
                        'to_size': line[7],
                        'to_start': line[9],
                        'to_end': line[10],
                        'header_chain': chain_file.current_chain_header,
                        'lines': []
                    }
                else:
                    chr_info[chrom]['lines'].append(line)

            LOG.info("Chain file parsed")

            # running totals across all locations; used for debug output only
            insertion_bases = 0
            deletion_bases = 0

            for location in locations:
                LOG.info("Processing chromosome={0}".format(location.seqid))
                LOG.debug("Location: {0}".format(location))

                info = chr_info[location.seqid]
                chrom_size_to = info['to_size']

                last_pos = info['from_start']
                new_sequence = StringIO()
                chain_file.reset()

                for chain_line in info['lines']:
                    LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no,
                                                         chain_line))

                    fragment = chain_line[0]

                    if len(chain_line) == 1:
                        # last line: append the final fragment and emit
                        # the record
                        partial_seq = fasta.fetch(location.seqid, last_pos,
                                                  last_pos + fragment)
                        new_sequence.write(str(partial_seq))

                        sequence = new_sequence.getvalue()

                        if len(sequence) < chrom_size_to:
                            LOG.warn(
                                "Length's do not match, chromosome length in chain: {0}, sequence length: {1}"
                                .format(chrom_size_to, len(sequence)))

                        fasta_out.write(">{0} {1}:{2}-{3}\n".format(
                            location.seqid, location.seqid,
                            info['from_start'] + 1, chrom_size_to))

                        for wrapped in wrap_sequence(sequence):
                            fasta_out.write(wrapped.strip())
                            fasta_out.write('\n')

                        break

                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos,
                                              last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(
                            location.seqid, last_pos, last_pos + fragment,
                            len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(
                                partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(
                            same, partial_seq[-(len(same)):]))

                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))

                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug(
                        "LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}"
                        .format(last_pos, insertion_bases, deletion_bases,
                                (insertion_bases - deletion_bases)))

        # bgzip and index -- only after the output file has been closed;
        # bgzip_index is given the file path, so it presumably reads the
        # file back from disk
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        # bare raise keeps the original traceback
        raise

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
コード例 #8
0
ファイル: gtf_db.py プロジェクト: churchill-lab/g2gtools
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into SQLite

    Every GTF record becomes one row in the main GTF table.  Record types,
    sources and attribute names are replaced by integer keys, handed out in
    order of first appearance, and the key-to-name mappings are written to
    their own tables once all records are loaded.  Attributes other than
    ``gene_id``, ``transcript_id`` and ``exon_id`` are stored in the lookup
    table.  Indices are created last.

    Any existing ``output_file`` is deleted first.  The database connection
    is closed even when the conversion fails.

    :param input_file: the GTF file to convert
    :param output_file: The generated database file
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')

    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB File: {0}".format(output_file))

    conn = sqlite3.connect(output_file)

    try:
        c = conn.cursor()

        LOG.debug("Generating tables")
        c.execute(SQL_CREATE_GTF_TABLE)
        c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
        c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
        c.execute(SQL_CREATE_GTF_TYPES_TABLE)
        c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

        # name -> integer key, assigned in order of first appearance
        gtf_types = {}
        gtf_sources = {}
        gtf_attributes = {}

        LOG.info("Parsing GTF file...")

        gtf_file = GTF(input_file)

        counter = 0

        for record in gtf_file:
            if counter and counter % 100000 == 0:
                LOG.info("Processed {0:,} records".format(counter))

            # setdefault evaluates len() before inserting, so a new name
            # receives the next unused key
            _type_key = gtf_types.setdefault(record.type, len(gtf_types))
            _source_key = gtf_sources.setdefault(record.source, len(gtf_sources))

            # '+' -> 1, '-' -> -1, anything else -> 0
            strand = 0
            if record.strand in ['+', '-']:
                strand = 1 if record.strand == '+' else -1

            attributes = record.attributes

            gene_id = attributes['gene_id']
            transcript_id = attributes.get('transcript_id')

            if record.type == 'gene':
                ensembl_id = attributes['gene_id']
            elif record.type == 'transcript':
                ensembl_id = attributes['transcript_id']
            elif record.type == 'exon':
                ensembl_id = attributes['exon_id']
            else:
                ensembl_id = attributes.get('protein_id')

            c.execute(SQL_INSERT_GTF_TABLE, (gene_id, transcript_id, ensembl_id, record.seqid, record.start, record.end, strand, record.score, _source_key, _type_key, record.frame))
            gtf_key = c.lastrowid

            for attribute, value in attributes.items():
                if attribute in ('gene_id', 'transcript_id', 'exon_id'):
                    continue

                _attribute_key = gtf_attributes.setdefault(attribute, len(gtf_attributes))
                c.execute(SQL_INSERT_GTF_LOOKUP_TABLE, (gtf_key, _attribute_key, value))

            counter += 1

        # save (commit) the changes
        conn.commit()

        for source, _key in gtf_sources.items():
            c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))

        for type_name, _key in gtf_types.items():
            c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, type_name))

        for attribute, _key in gtf_attributes.items():
            c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))

        # one commit covers all three mapping tables
        conn.commit()

        LOG.info("GTF File parsed")

        LOG.info("Finalizing database...")

        for index_statements in (SQL_INDICES_GTF, SQL_INDICES_GTF_LOOKUP, SQL_INDICES_GTF_TYPES, SQL_INDICES_GTF_SOURCES, SQL_INDICES_GTF_ATTRIBUTES):
            for sql in index_statements:
                LOG.debug(sql)
                c.execute(sql)

        # closing a connection does not commit, so make sure nothing is pending
        conn.commit()

        LOG.info("Database created")

    finally:
        # close connection, also when an error occurred above
        conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
コード例 #9
0
ファイル: vcf2chain.py プロジェクト: churchill-lab/g2gtools
    for seqid in tb.contigs:
        processed_seqids[seqid] = False

    left = VCFtoChainInfo()
    right = VCFtoChainInfo()

    chain_info = {}

    if diploid:
        left.output_file = g2g_fu.prepend_before_extension(output_file, 'left')
        right.output_file = g2g_fu.prepend_before_extension(output_file, 'right')
        chain_info['left'] = left
        chain_info['right'] = right

        g2g_fu.delete_file(left.output_file)
        g2g_fu.delete_file(right.output_file)
    else:
        left.output_file = output_file
        chain_info['left'] = left

        g2g_fu.delete_file(left.output_file)

    try:
        all_chrom = [c for c in fasta_file.references]
        all_chrom_length = [n for n in fasta_file.lengths]
        all_vcffiles = [input_file] * len(all_chrom)
        all_sample_index = [sample_index] * len(all_chrom)
        all_chain_info = [chain_info] * len(all_chrom)
        all_diploid = [diploid] * len(all_chrom)
        all_passed = [passed] * len(all_chrom)
コード例 #10
0
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into SQLite

    Every GTF record becomes one row in the main GTF table.  Record types,
    sources and attribute names are replaced by integer keys, handed out in
    order of first appearance, and the key-to-name mappings are written to
    their own tables once all records are loaded.  Attributes other than
    ``gene_id``, ``transcript_id`` and ``exon_id`` are stored in the lookup
    table.  Indices are created last.

    Any existing ``output_file`` is deleted first.  The database connection
    is closed even when the conversion fails.

    :param input_file: the GTF file to convert
    :param output_file: The generated database file
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')

    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB File: {0}".format(output_file))

    conn = sqlite3.connect(output_file)

    try:
        c = conn.cursor()

        LOG.debug("Generating tables")
        c.execute(SQL_CREATE_GTF_TABLE)
        c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
        c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
        c.execute(SQL_CREATE_GTF_TYPES_TABLE)
        c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

        # name -> integer key, assigned in order of first appearance
        gtf_types = {}
        gtf_sources = {}
        gtf_attributes = {}

        LOG.info("Parsing GTF file...")

        gtf_file = GTF(input_file)

        counter = 0

        for record in gtf_file:
            if counter and counter % 100000 == 0:
                LOG.info("Processed {0:,} records".format(counter))

            # setdefault evaluates len() before inserting, so a new name
            # receives the next unused key
            _type_key = gtf_types.setdefault(record.type, len(gtf_types))
            _source_key = gtf_sources.setdefault(record.source,
                                                 len(gtf_sources))

            # '+' -> 1, '-' -> -1, anything else -> 0
            strand = 0
            if record.strand in ['+', '-']:
                strand = 1 if record.strand == '+' else -1

            attributes = record.attributes

            gene_id = attributes['gene_id']
            transcript_id = attributes.get('transcript_id')

            if record.type == 'gene':
                ensembl_id = attributes['gene_id']
            elif record.type == 'transcript':
                ensembl_id = attributes['transcript_id']
            elif record.type == 'exon':
                ensembl_id = attributes['exon_id']
            else:
                ensembl_id = attributes.get('protein_id')

            c.execute(SQL_INSERT_GTF_TABLE,
                      (gene_id, transcript_id, ensembl_id, record.seqid,
                       record.start, record.end, strand, record.score,
                       _source_key, _type_key, record.frame))
            gtf_key = c.lastrowid

            for attribute, value in attributes.items():
                if attribute in ('gene_id', 'transcript_id', 'exon_id'):
                    continue

                _attribute_key = gtf_attributes.setdefault(
                    attribute, len(gtf_attributes))
                c.execute(SQL_INSERT_GTF_LOOKUP_TABLE,
                          (gtf_key, _attribute_key, value))

            counter += 1

        # save (commit) the changes
        conn.commit()

        for source, _key in gtf_sources.items():
            c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))

        for type_name, _key in gtf_types.items():
            c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, type_name))

        for attribute, _key in gtf_attributes.items():
            c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))

        # one commit covers all three mapping tables
        conn.commit()

        LOG.info("GTF File parsed")

        LOG.info("Finalizing database...")

        all_index_statements = (
            SQL_INDICES_GTF,
            SQL_INDICES_GTF_LOOKUP,
            SQL_INDICES_GTF_TYPES,
            SQL_INDICES_GTF_SOURCES,
            SQL_INDICES_GTF_ATTRIBUTES,
        )

        for index_statements in all_index_statements:
            for sql in index_statements:
                LOG.debug(sql)
                c.execute(sql)

        # closing a connection does not commit, so make sure nothing is
        # pending
        conn.commit()

        LOG.info("Database created")

    finally:
        # close connection, also when an error occurred above
        conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))