def gtf2chain(chain_file, input_file, output_file, chain_genes=False):
    """
    Generate a chain file from a GTF file and an existing chain file.

    The GTF file is loaded into a temporary SQLite database (see gtf2db)
    created in the directory of the output file.  db2chain is then called
    with the source chain file and that database to write the output chain
    file.  The temporary database is removed afterwards, even when one of
    the two steps fails.

    :param chain_file: the chain file to read (logged as FROM CHAIN FILE)
    :param input_file: the GTF file
    :param output_file: the chain file to write (logged as TO CHAIN FILE)
    :param chain_genes: passed through unchanged to db2chain
    :return: None
    """
    start = time.time()

    chain_file = g2g_fu.check_file(chain_file)
    input_file = g2g_fu.check_file(input_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("FROM CHAIN FILE: {0}".format(chain_file))
    LOG.info("TO CHAIN FILE: {0}".format(output_file))

    temp_db = g2g_fu.gen_file_name("_g2gtempfile", output_file_dir, ".db3")

    try:
        gtf2db(input_file, temp_db)
        db2chain(chain_file, temp_db, output_file, chain_genes)
    finally:
        # Always remove the scratch database so a failed run does not leave
        # a stray .db3 file next to the output.
        g2g_fu.delete_file(temp_db)

    # Only report completion once the work has actually been done.
    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
def offset2chain(from_file, to_file, output_file):
    """
    Convert Seqnature offset files to new chain file.

    :param from_file: from Chromosome File (see docs)
    :param to_file: to Chromosome File (see docs)
    :param output_file: the output chain file; any existing file at that
                        location is deleted before generation starts
    :raises G2GChainFileError: if any error occurs while parsing the offset
                               files or writing the chain file
    :return: None
    """
    from_file = g2g_fu.check_file(from_file)
    to_file = g2g_fu.check_file(to_file)

    output_file_name = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file_name)

    LOG.info("FROM FILE: {0}".format(from_file))
    LOG.info("TO FILE: {0}".format(to_file))
    LOG.info("CHAIN FILE: {0}".format(output_file_name))

    LOG.info("Generating chain file...")

    try:
        chromosomes = offset_parse_chromosomes(from_file, to_file)

        # Only the chromosome records are needed, not their keys.
        for chromosome in chromosomes.values():
            LOG.debug('Examining chromosome: {0}'.format(chromosome))
            if chromosome['file_path']:
                # Write to the validated path, i.e. the same file that was
                # deleted and logged above.
                offset_chromosome_to_chain(chromosome, output_file_name)
            else:
                LOG.debug("No file for {0}, so skipping".format(chromosome))

        LOG.info("Chain file created")
    except Exception as e:
        # Record the underlying cause before translating it, otherwise the
        # real failure is lost.
        LOG.debug("Chain file generation failed: {0}".format(e))
        raise G2GChainFileError("Unable to generate chain file")
for seqid in tb.contigs: processed_seqids[seqid] = False left = VCFtoChainInfo() right = VCFtoChainInfo() chain_info = {} if diploid: left.output_file = g2g_fu.prepend_before_extension(output_file, 'left') right.output_file = g2g_fu.prepend_before_extension( output_file, 'right') chain_info['left'] = left chain_info['right'] = right g2g_fu.delete_file(left.output_file) g2g_fu.delete_file(right.output_file) else: left.output_file = output_file chain_info['left'] = left g2g_fu.delete_file(left.output_file) try: all_chrom = [c for c in fasta_file.references] all_chrom_length = [n for n in fasta_file.lengths] all_vcffiles = [input_file] * len(all_chrom) all_sample_index = [sample_index] * len(all_chrom) all_chain_info = [chain_info] * len(all_chrom) all_diploid = [diploid] * len(all_chrom) all_passed = [passed] * len(all_chrom)
def fasta_transform(fasta_file, chain_file, locations, output_file,
                    bgzip=False, reverse=False):
    """
    Apply the insertions and deletions of a chain file to FASTA sequences
    and write the transformed sequences to a new FASTA file.

    :param fasta_file: a FastaFile instance, or the name of a FASTA file
    :param chain_file: a ChainIter instance, or the name of a chain file
    :param locations: Location instances and/or values accepted by
                      parse_location; when empty, one location covering each
                      reference of the FASTA file is generated
    :param output_file: the FASTA file to write; an existing file and its
                        index files are deleted first
    :param bgzip: when True the output is compressed and indexed with
                  g2g_fu.bgzip_index ('.gz' is appended to the output name
                  if it is missing)
    :param reverse: passed to ChainIter when the chain file is opened here;
                    also swaps the dt/dq columns read from each chain line
    :raises G2GLocationError: logged at debug level and re-raised
    :return: None
    """
    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [
            parse_location(
                "{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1)
            for a in fasta.references
        ]

    # The plain FASTA is always written without the '.gz' suffix; the
    # compressed file (if requested) always carries it.
    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    chr_info = {}

    try:
        # The context manager guarantees the FASTA output is flushed and
        # closed before it is compressed, and on every error path.
        with open(temp_output_file, "w") as fasta_out:
            LOG.info("Transforming...")

            # will need a better way, but for now...
            LOG.info("Parsing chain file...")
            for line in chain_file:
                chrom = chain_file.current_chain_header[1]
                if len(line) > 7:
                    # chain header line
                    LOG.debug("Adding chromosome {0}".format(chrom))
                    chr_info[chrom] = {
                        'from_size': line[2],
                        'from_start': line[4],
                        'from_end': line[5],
                        'to_size': line[7],
                        'to_start': line[9],
                        'to_end': line[10],
                        'header_chain': chain_file.current_chain_header,
                        'lines': []
                    }
                else:
                    # alignment line belonging to the current chain
                    chr_info[chrom]['lines'].append(line)

            LOG.info("Chain file parsed")

            insertion_bases = 0
            deletion_bases = 0

            for location in locations:
                LOG.info("Processing chromosome={0}".format(location.seqid))
                LOG.debug("Location: {0}".format(location))

                chrom_info = chr_info[location.seqid]
                chrom_size_to = chrom_info['to_size']
                last_pos = chrom_info['from_start']
                new_sequence = StringIO()
                chain_file.reset()

                for chain_line in chrom_info['lines']:
                    LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no,
                                                         chain_line))

                    if len(chain_line) == 1:
                        # last line
                        fragment = chain_line[0]
                        partial_seq = fasta.fetch(location.seqid, last_pos,
                                                  last_pos + fragment)
                        new_sequence.write(str(partial_seq))
                        sequence = new_sequence.getvalue()

                        if len(sequence) < chrom_size_to:
                            LOG.warn("Length's do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(sequence)))

                        fasta_out.write(">{0} {1}:{2}-{3}\n".format(
                            location.seqid, location.seqid,
                            chrom_info['from_start'] + 1, chrom_size_to))

                        for l in wrap_sequence(sequence):
                            fasta_out.write(l.strip())
                            fasta_out.write('\n')

                        break

                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos,
                                              last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(
                            location.seqid, last_pos, last_pos + fragment,
                            len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10],
                                                         partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(
                            same, partial_seq[-(len(same)):]))
                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))
                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        # bare raise keeps the original traceback
        raise
def fasta_transform(fasta_file, chain_file, locations, output_file,
                    bgzip=False, reverse=False):
    """
    Rewrite FASTA sequences according to the alignment lines of a chain
    file and store the result in a new FASTA file.

    :param fasta_file: a FastaFile instance, or the name of a FASTA file
    :param chain_file: a ChainIter instance, or the name of a chain file
    :param locations: Location instances and/or values accepted by
                      parse_location; when empty, one location covering each
                      reference of the FASTA file is generated
    :param output_file: the FASTA file to write; an existing file and its
                        index files are deleted first
    :param bgzip: when True the output is compressed and indexed with
                  g2g_fu.bgzip_index ('.gz' is appended to the output name
                  if it is missing)
    :param reverse: passed to ChainIter when the chain file is opened here;
                    also swaps the dt/dq columns read from each chain line
    :raises G2GLocationError: logged at debug level and re-raised
    :return: None
    """
    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [
            parse_location(
                "{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1)
            for a in fasta.references
        ]

    # Uncompressed data goes to the name without '.gz'; the bgzip'd result
    # goes to the name with it.
    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        try:
            # will need a better way, but for now...
            LOG.info("Parsing chain file...")
            for line in chain_file:
                chrom = chain_file.current_chain_header[1]
                if len(line) > 7:
                    # chain header line
                    LOG.debug("Adding chromosome {0}".format(chrom))
                    chr_info[chrom] = {
                        'from_size': line[2],
                        'from_start': line[4],
                        'from_end': line[5],
                        'to_size': line[7],
                        'to_start': line[9],
                        'to_end': line[10],
                        'header_chain': chain_file.current_chain_header,
                        'lines': []
                    }
                else:
                    # alignment line belonging to the current chain
                    chr_info[chrom]['lines'].append(line)

            LOG.info("Chain file parsed")

            insertion_bases = 0
            deletion_bases = 0

            for location in locations:
                LOG.info("Processing chromosome={0}".format(location.seqid))
                LOG.debug("Location: {0}".format(location))

                chrom_info = chr_info[location.seqid]
                chrom_size_to = chrom_info['to_size']
                last_pos = chrom_info['from_start']
                new_sequence = StringIO()
                chain_file.reset()

                for chain_line in chrom_info['lines']:
                    LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no,
                                                         chain_line))

                    if len(chain_line) == 1:
                        # last line
                        fragment = chain_line[0]
                        partial_seq = fasta.fetch(location.seqid, last_pos,
                                                  last_pos + fragment)
                        new_sequence.write(str(partial_seq))
                        sequence = new_sequence.getvalue()

                        if len(sequence) < chrom_size_to:
                            LOG.warn("Length's do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(sequence)))

                        fasta_out.write(">{0} {1}:{2}-{3}\n".format(
                            location.seqid, location.seqid,
                            chrom_info['from_start'] + 1, chrom_size_to))

                        for l in wrap_sequence(sequence):
                            fasta_out.write(l.strip())
                            fasta_out.write('\n')

                        break

                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos,
                                              last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(
                            location.seqid, last_pos, last_pos + fragment,
                            len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10],
                                                         partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(
                            same, partial_seq[-(len(same)):]))
                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))
                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))
        finally:
            # Flush and release the handle on every path; in particular the
            # file must be complete on disk before it is compressed below.
            fasta_out.close()

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        # bare raise keeps the original traceback
        raise
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into SQLite

    Every GTF record becomes one row of the main table.  Record types,
    sources and attribute names are replaced by integer keys that are
    assigned in order of first appearance and stored in their own tables.
    Attributes other than gene_id, transcript_id and exon_id are stored in
    the lookup table.  Indices are created once all rows are inserted.

    :param input_file: the GTF file to convert
    :param output_file: The generated database file; an existing file is
                        deleted first
    :raises KeyError: if a record lacks 'gene_id', or the id attribute
                      required by its type ('transcript_id' for transcript
                      records, 'exon_id' for exon records)
    :return: None
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')

    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB File: {0}".format(output_file))

    conn = sqlite3.connect(output_file)

    try:
        c = conn.cursor()

        LOG.debug("Generating tables")
        c.execute(SQL_CREATE_GTF_TABLE)
        c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
        c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
        c.execute(SQL_CREATE_GTF_TYPES_TABLE)
        c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

        # name -> integer key, numbered in order of first appearance
        gtf_types = {}
        gtf_sources = {}
        gtf_attributes = {}

        LOG.info("Parsing GTF file...")

        gtf_file = GTF(input_file)

        counter = 0

        for record in gtf_file:
            if counter and counter % 100000 == 0:
                LOG.info("Processed {0:,} records".format(counter))

            # setdefault evaluates len() before inserting, so a new name
            # receives the next unused key.
            _type_key = gtf_types.setdefault(record.type, len(gtf_types))
            _source_key = gtf_sources.setdefault(record.source,
                                                 len(gtf_sources))

            if record.strand == '+':
                strand = 1
            elif record.strand == '-':
                strand = -1
            else:
                strand = 0

            gene_id = record.attributes['gene_id']
            transcript_id = record.attributes.get('transcript_id')

            if record.type == 'gene':
                ensembl_id = record.attributes['gene_id']
            elif record.type == 'transcript':
                ensembl_id = record.attributes['transcript_id']
            elif record.type == 'exon':
                ensembl_id = record.attributes['exon_id']
            else:
                ensembl_id = record.attributes.get('protein_id')

            c.execute(SQL_INSERT_GTF_TABLE,
                      (gene_id, transcript_id, ensembl_id, record.seqid,
                       record.start, record.end, strand, record.score,
                       _source_key, _type_key, record.frame))

            gtf_key = c.lastrowid

            for attribute, value in record.attributes.items():
                if attribute not in ('gene_id', 'transcript_id', 'exon_id'):
                    _attribute_key = gtf_attributes.setdefault(
                        attribute, len(gtf_attributes))
                    c.execute(SQL_INSERT_GTF_LOOKUP_TABLE,
                              (gtf_key, _attribute_key, value))

            counter += 1

        # save (commit) the changes
        conn.commit()

        for source, _key in gtf_sources.items():
            c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))

        conn.commit()

        for type_name, _key in gtf_types.items():
            c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, type_name))

        conn.commit()

        for attribute, _key in gtf_attributes.items():
            c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))

        conn.commit()

        LOG.info("GTF File parsed")

        LOG.info("Finalizing database...")

        for sql in SQL_INDICES_GTF:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_LOOKUP:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_TYPES:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_SOURCES:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_ATTRIBUTES:
            LOG.debug(sql)
            c.execute(sql)

        LOG.info("Database created")
    finally:
        # close connection, also when parsing or an insert fails
        conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
for seqid in tb.contigs: processed_seqids[seqid] = False left = VCFtoChainInfo() right = VCFtoChainInfo() chain_info = {} if diploid: left.output_file = g2g_fu.prepend_before_extension(output_file, 'left') right.output_file = g2g_fu.prepend_before_extension(output_file, 'right') chain_info['left'] = left chain_info['right'] = right g2g_fu.delete_file(left.output_file) g2g_fu.delete_file(right.output_file) else: left.output_file = output_file chain_info['left'] = left g2g_fu.delete_file(left.output_file) try: all_chrom = [c for c in fasta_file.references] all_chrom_length = [n for n in fasta_file.lengths] all_vcffiles = [input_file] * len(all_chrom) all_sample_index = [sample_index] * len(all_chrom) all_chain_info = [chain_info] * len(all_chrom) all_diploid = [diploid] * len(all_chrom) all_passed = [passed] * len(all_chrom)
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into SQLite

    One row is inserted into the main table per GTF record.  Types, sources
    and attribute names are mapped to integer keys (numbered by first
    appearance) that are written to their own tables after parsing.  All
    attributes except gene_id, transcript_id and exon_id go to the lookup
    table.  Indices are created last.

    :param input_file: the GTF file to convert
    :param output_file: The generated database file; an existing file is
                        deleted first
    :raises KeyError: if a record lacks 'gene_id', or the id attribute
                      required by its type ('transcript_id' for transcript
                      records, 'exon_id' for exon records)
    :return: None
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')

    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB File: {0}".format(output_file))

    conn = sqlite3.connect(output_file)

    try:
        c = conn.cursor()

        LOG.debug("Generating tables")
        c.execute(SQL_CREATE_GTF_TABLE)
        c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
        c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
        c.execute(SQL_CREATE_GTF_TYPES_TABLE)
        c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

        # name -> integer key, numbered in order of first appearance
        gtf_types = {}
        gtf_sources = {}
        gtf_attributes = {}

        # attributes stored as columns of the main table, not in the lookup
        column_attributes = ('gene_id', 'transcript_id', 'exon_id')

        LOG.info("Parsing GTF file...")

        gtf_file = GTF(input_file)

        counter = 0

        for record in gtf_file:
            if counter and counter % 100000 == 0:
                LOG.info("Processed {0:,} records".format(counter))

            if record.type not in gtf_types:
                gtf_types[record.type] = len(gtf_types)
            _type_key = gtf_types[record.type]

            if record.source not in gtf_sources:
                gtf_sources[record.source] = len(gtf_sources)
            _source_key = gtf_sources[record.source]

            strand = 0
            if record.strand == '+':
                strand = 1
            elif record.strand == '-':
                strand = -1

            gene_id = record.attributes['gene_id']
            transcript_id = record.attributes.get('transcript_id')

            if record.type == 'gene':
                ensembl_id = record.attributes['gene_id']
            elif record.type == 'transcript':
                ensembl_id = record.attributes['transcript_id']
            elif record.type == 'exon':
                ensembl_id = record.attributes['exon_id']
            else:
                ensembl_id = record.attributes.get('protein_id')

            c.execute(SQL_INSERT_GTF_TABLE,
                      (gene_id, transcript_id, ensembl_id, record.seqid,
                       record.start, record.end, strand, record.score,
                       _source_key, _type_key, record.frame))

            gtf_key = c.lastrowid

            for attribute, value in record.attributes.items():
                if attribute in column_attributes:
                    continue

                if attribute not in gtf_attributes:
                    gtf_attributes[attribute] = len(gtf_attributes)
                _attribute_key = gtf_attributes[attribute]

                c.execute(SQL_INSERT_GTF_LOOKUP_TABLE,
                          (gtf_key, _attribute_key, value))

            counter += 1

        # save (commit) the changes
        conn.commit()

        for source, _key in gtf_sources.items():
            c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))

        conn.commit()

        for type_name, _key in gtf_types.items():
            c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, type_name))

        conn.commit()

        for attribute, _key in gtf_attributes.items():
            c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))

        conn.commit()

        LOG.info("GTF File parsed")

        LOG.info("Finalizing database...")

        for sql in SQL_INDICES_GTF:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_LOOKUP:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_TYPES:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_SOURCES:
            LOG.debug(sql)
            c.execute(sql)

        for sql in SQL_INDICES_GTF_ATTRIBUTES:
            LOG.debug(sql)
            c.execute(sql)

        LOG.info("Database created")
    finally:
        # close connection, also when parsing or an insert fails
        conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))