def convert_gff_to_bed(input_stream, output_stream, attribute_to_use=None):
    """
    Convert GFF lines from input_stream to BED lines on output_stream.

    Lines starting with "#" and blank lines are skipped. Every other line
    is split on tabs. The start coordinate is made 0-based (start - 1),
    start and end are swapped first when start > end, and the BED score
    is always "1".

    The BED name defaults to the record type (third GFF column). When the
    last column is not ".", it is parsed with utils.parse_attributes and
    the name is taken from, in order of preference:
      1. the attribute_to_use attribute (if given and present),
      2. the "ID" attribute (if present),
      3. the record type.

    Args:
        input_stream: iterable yielding GFF text lines.
        output_stream: object with a write() method; receives one
            newline-terminated BED line per converted record.
        attribute_to_use: optional attribute key whose value should be
            used as the BED name. A warning is printed for each record
            that has attributes but lacks this key.
    """
    for line in input_stream:
        if line.startswith("#"):
            # Skip GFF comments
            continue
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no record; indexing their fields would fail
            continue
        gff_fields = stripped.split("\t")
        chrom = gff_fields[0]
        rec_type = gff_fields[2]
        start = int(gff_fields[3])
        end = int(gff_fields[4])
        strand = gff_fields[6]
        if start > end:
            # Flip coordinates if start > end
            start, end = end, start
        # Convert start to be 0-based
        start = start - 1
        # Convert coordinates back to strings
        start, end = str(start), str(end)
        rec_attributes = gff_fields[-1]
        name = rec_type
        score = "1"
        if rec_attributes != ".":
            attributes = utils.parse_attributes(rec_attributes)
            if attribute_to_use is not None and attribute_to_use in attributes:
                name = attributes[attribute_to_use]
            else:
                if attribute_to_use is not None:
                    # Single-argument print(...) behaves the same as a
                    # Python 2 statement and as a Python 3 call.
                    print("WARNING: %s attribute not found"
                          % (attribute_to_use))
                # Fall back to ID only when it exists; otherwise keep the
                # record type instead of raising KeyError.
                if "ID" in attributes:
                    name = attributes["ID"]
        bed_line = make_bed_line(chrom, start, end, name, score, strand)
        output_stream.write("%s\n" % (bed_line))
def convert_gff_to_bed(input_stream, output_stream, attribute_to_use=None):
    """
    Convert GFF lines from input_stream to BED lines on output_stream.

    Lines starting with "#" and blank lines are skipped. Every other line
    is split on tabs. The start coordinate is made 0-based (start - 1),
    start and end are swapped first when start > end, and the BED score
    is always "1".

    The BED name defaults to the record type (third GFF column). When the
    last column is not ".", it is parsed with utils.parse_attributes and
    the name is taken from, in order of preference:
      1. the attribute_to_use attribute (if given and present),
      2. the "ID" attribute (if present),
      3. the record type.

    Args:
        input_stream: iterable yielding GFF text lines.
        output_stream: object with a write() method; receives one
            newline-terminated BED line per converted record.
        attribute_to_use: optional attribute key whose value should be
            used as the BED name. A warning is printed for each record
            that has attributes but lacks this key.
    """
    for line in input_stream:
        if line.startswith("#"):
            # Skip GFF comments
            continue
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no record; indexing their fields would fail
            continue
        gff_fields = stripped.split("\t")
        chrom = gff_fields[0]
        rec_type = gff_fields[2]
        start = int(gff_fields[3])
        end = int(gff_fields[4])
        strand = gff_fields[6]
        if start > end:
            # Flip coordinates if start > end
            start, end = end, start
        # Convert start to be 0-based
        start = start - 1
        # Convert coordinates back to strings
        start, end = str(start), str(end)
        rec_attributes = gff_fields[-1]
        name = rec_type
        score = "1"
        if rec_attributes != ".":
            attributes = utils.parse_attributes(rec_attributes)
            if attribute_to_use is not None and attribute_to_use in attributes:
                name = attributes[attribute_to_use]
            else:
                if attribute_to_use is not None:
                    # Single-argument print(...) behaves the same as a
                    # Python 2 statement and as a Python 3 call.
                    print("WARNING: %s attribute not found"
                          % (attribute_to_use))
                # Fall back to ID only when it exists; otherwise keep the
                # record type instead of raising KeyError.
                if "ID" in attributes:
                    name = attributes["ID"]
        bed_line = make_bed_line(chrom, start, end, name, score, strand)
        output_stream.write("%s\n" % (bed_line))
def annotate_gff_with_genes(args):
    """
    Annotate the "gene" records of a GFF file with IDs from a genes table.

    The GFF's gene entries are intersected with the table (converted by
    get_table_as_bedtool). For every gene record the attributes
    "ensg_id", "refseq_id" and "gsymbol" are then set to:
      - the comma-joined unique non-null values found by the
        intersection, if any;
      - otherwise the record's existing value for that attribute, if
        present;
      - otherwise "NA".
    The annotated records are written back to the same GFF path through
    gffwriter.GFFWriter(..., in_place=True).

    Args:
        args: object providing gff_filename and table_filename
            attributes (both are passed through utils.pathify).

    Raises:
        Exception: if the GFF file or the table file does not exist.
    """
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        raise Exception("Cannot find %s" % (gff_fname))
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception("Cannot find %s" % (table_fname))
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    # Single-argument print(...) behaves the same as a Python 2 statement
    # and as a Python 3 call.
    print("Determining overlap between events and genes...")
    # Intersect event genes with gene txStart/txEnd
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Attribute keys shared by the table entries and the annotated GFF
    annotation_keys = ("ensg_id", "refseq_id", "gsymbol")
    # Map event genes to their IDs
    #
    #   event_gene1 -> refseq_id -> [values]
    #               -> ensg_id   -> [values]
    #   event_gene2 -> refseq_id -> [values]
    #   ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip trailing semicolon of the table entry's attributes
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        # Read all keys first so a missing key fails before any update
        found_values = [(key, gene_info[key]) for key in annotation_keys]
        for key, value in found_values:
            # Skip null entries; the event key is only created in the
            # mapping when at least one non-null value exists.
            if not is_null_id(value):
                event_genes_to_info[event_gene_str][key].append(value)
    # Incorporate the gene information into the GFF and output
    # it using gffutils
    print("Loading events into GFF database...")
    events_db = gffutils.create_db(gff_fname, ":memory:",
                                   verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname,
                                     in_place=True)
    print(" - Outputting annotated GFF to: %s" % (output_fname))

    def new_recs():
        # Yield every record of every gene, with the gene record (first
        # in each group) carrying the updated annotation attributes.
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            # Membership test first: indexing the defaultdict directly
            # would insert an empty entry for unmatched genes.
            if event_id in event_genes_to_info:
                event_info = event_genes_to_info[event_id]
            else:
                event_info = None
            for key in annotation_keys:
                # Use existing IDs if present. The key tested here must
                # be the same key that is read and written.
                if key in gene_rec.attributes:
                    value = gene_rec.attributes[key][0]
                else:
                    value = "NA"
                if event_info is not None:
                    unique_values = utils.unique_list(event_info[key])
                    if len(unique_values) > 0 and unique_values[0] != "NA":
                        value = ",".join(unique_values)
                gene_rec.attributes[key] = [value]
            # Yield all the gene's records
            for g in gene_recs:
                yield g

    t1 = time.time()
    print("Creating annotated GFF database...")
    annotated_db = gffutils.create_db(new_recs(), ":memory:",
                                      verbose=False)
    t2 = time.time()
    print("Creation took %.2f secs" % (t2 - t1))
    # Write to file
    print("Writing annotated GFF to file...")
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()
def annotate_gff_with_genes(args):
    """
    Annotate the "gene" records of a GFF file with IDs from a genes table.

    The GFF's gene entries are intersected with the table (converted by
    get_table_as_bedtool). For every gene record the attributes
    "ensg_id", "refseq_id" and "gsymbol" are then set to:
      - the comma-joined unique non-null values found by the
        intersection, if any;
      - otherwise the record's existing value for that attribute, if
        present;
      - otherwise "NA".
    The annotated records are written back to the same GFF path through
    gffwriter.GFFWriter(..., in_place=True).

    Args:
        args: object providing gff_filename and table_filename
            attributes (both are passed through utils.pathify).

    Raises:
        Exception: if the GFF file or the table file does not exist.
    """
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        raise Exception("Cannot find %s" % (gff_fname))
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception("Cannot find %s" % (table_fname))
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    # Single-argument print(...) behaves the same as a Python 2 statement
    # and as a Python 3 call.
    print("Determining overlap between events and genes...")
    # Intersect event genes with gene txStart/txEnd
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Attribute keys shared by the table entries and the annotated GFF
    annotation_keys = ("ensg_id", "refseq_id", "gsymbol")
    # Map event genes to their IDs
    #
    #   event_gene1 -> refseq_id -> [values]
    #               -> ensg_id   -> [values]
    #   event_gene2 -> refseq_id -> [values]
    #   ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip trailing semicolon of the table entry's attributes
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        # Read all keys first so a missing key fails before any update
        found_values = [(key, gene_info[key]) for key in annotation_keys]
        for key, value in found_values:
            # Skip null entries; the event key is only created in the
            # mapping when at least one non-null value exists.
            if not is_null_id(value):
                event_genes_to_info[event_gene_str][key].append(value)
    # Incorporate the gene information into the GFF and output
    # it using gffutils
    print("Loading events into GFF database...")
    events_db = gffutils.create_db(gff_fname, ":memory:",
                                   verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname,
                                     in_place=True)
    print(" - Outputting annotated GFF to: %s" % (output_fname))

    def new_recs():
        # Yield every record of every gene, with the gene record (first
        # in each group) carrying the updated annotation attributes.
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            # Membership test first: indexing the defaultdict directly
            # would insert an empty entry for unmatched genes.
            if event_id in event_genes_to_info:
                event_info = event_genes_to_info[event_id]
            else:
                event_info = None
            for key in annotation_keys:
                # Use existing IDs if present. The key tested here must
                # be the same key that is read and written.
                if key in gene_rec.attributes:
                    value = gene_rec.attributes[key][0]
                else:
                    value = "NA"
                if event_info is not None:
                    unique_values = utils.unique_list(event_info[key])
                    if len(unique_values) > 0 and unique_values[0] != "NA":
                        value = ",".join(unique_values)
                gene_rec.attributes[key] = [value]
            # Yield all the gene's records
            for g in gene_recs:
                yield g

    t1 = time.time()
    print("Creating annotated GFF database...")
    annotated_db = gffutils.create_db(new_recs(), ":memory:",
                                      verbose=False)
    t2 = time.time()
    print("Creation took %.2f secs" % (t2 - t1))
    # Write to file
    print("Writing annotated GFF to file...")
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()