Example #1
0
def convert_gff_to_bed(input_stream, output_stream, attribute_to_use=None):
    """
    Convert GFF lines from input_stream to
    BED output_stream.

    Every non-comment, non-blank GFF line produces one BED line built by
    make_bed_line(chrom, start, end, name, score, strand), where:

      - start and end are swapped if start > end, and start is then
        decremented by one (GFF 1-based start -> BED 0-based start)
      - score is always "1"
      - name is chosen in this order of preference: the value of
        attribute_to_use (when given and present in the record), the
        value of the ID attribute (when present), and finally the GFF
        feature type (third column)

    Args:
      input_stream: iterable yielding tab-delimited GFF lines
      output_stream: object with a write() method; receives one
        newline-terminated BED line per converted record
      attribute_to_use: optional attribute key whose value should be
        used as the BED name. A warning is printed for each record that
        has attributes but lacks this key.
    """
    for line in input_stream:
        if line.startswith("#"):
            # Skip GFF comments
            continue
        stripped = line.strip()
        if not stripped:
            # Skip blank lines instead of failing on missing columns
            continue
        gff_fields = stripped.split("\t")
        chrom = gff_fields[0]
        rec_type = gff_fields[2]
        start = int(gff_fields[3])
        end = int(gff_fields[4])
        strand = gff_fields[6]
        if start > end:
            # Flip coordinates if start > end
            start, end = end, start
        # Convert start to be 0-based
        start = start - 1
        # Convert coordinates back to strings
        start, end = str(start), str(end)
        rec_attributes = gff_fields[-1]
        # Default name is the feature type; overridden below when a
        # suitable attribute is available
        name = rec_type
        score = "1"
        if rec_attributes != ".":
            attributes = utils.parse_attributes(rec_attributes)
            if attribute_to_use is not None \
               and attribute_to_use in attributes:
                # Use the requested attribute's value as the name
                name = attributes[attribute_to_use]
            else:
                if attribute_to_use is not None:
                    print("WARNING: %s attribute not found"
                          % (attribute_to_use,))
                # Fall back to ID= only if the record actually has one;
                # otherwise keep the feature type rather than raising
                # a KeyError
                if "ID" in attributes:
                    name = attributes["ID"]
        bed_line = make_bed_line(chrom, start, end, name, score, strand)
        output_stream.write("%s\n" % (bed_line))
Example #2
0
def convert_gff_to_bed(input_stream, output_stream, attribute_to_use=None):
    """
    Convert GFF lines from input_stream to
    BED output_stream.

    Every non-comment, non-blank GFF line produces one BED line built by
    make_bed_line(chrom, start, end, name, score, strand), where:

      - start and end are swapped if start > end, and start is then
        decremented by one (GFF 1-based start -> BED 0-based start)
      - score is always "1"
      - name is chosen in this order of preference: the value of
        attribute_to_use (when given and present in the record), the
        value of the ID attribute (when present), and finally the GFF
        feature type (third column)

    Args:
      input_stream: iterable yielding tab-delimited GFF lines
      output_stream: object with a write() method; receives one
        newline-terminated BED line per converted record
      attribute_to_use: optional attribute key whose value should be
        used as the BED name. A warning is printed for each record that
        has attributes but lacks this key.
    """
    for line in input_stream:
        if line.startswith("#"):
            # Skip GFF comments
            continue
        stripped = line.strip()
        if not stripped:
            # Skip blank lines instead of failing on missing columns
            continue
        gff_fields = stripped.split("\t")
        chrom = gff_fields[0]
        rec_type = gff_fields[2]
        start = int(gff_fields[3])
        end = int(gff_fields[4])
        strand = gff_fields[6]
        if start > end:
            # Flip coordinates if start > end
            start, end = end, start
        # Convert start to be 0-based
        start = start - 1
        # Convert coordinates back to strings
        start, end = str(start), str(end)
        rec_attributes = gff_fields[-1]
        # Default name is the feature type; overridden below when a
        # suitable attribute is available
        name = rec_type
        score = "1"
        if rec_attributes != ".":
            attributes = utils.parse_attributes(rec_attributes)
            if attribute_to_use is not None \
               and attribute_to_use in attributes:
                # Use the requested attribute's value as the name
                name = attributes[attribute_to_use]
            else:
                if attribute_to_use is not None:
                    print("WARNING: %s attribute not found"
                          % (attribute_to_use,))
                # Fall back to ID= only if the record actually has one;
                # otherwise keep the feature type rather than raising
                # a KeyError
                if "ID" in attributes:
                    name = attributes["ID"]
        bed_line = make_bed_line(chrom, start, end, name, score, strand)
        output_stream.write("%s\n" % (bed_line))
def annotate_gff_with_genes(args):
    """
    Annotate GFF with genes table.

    Reads the GFF named by args.gff_filename and the genes table named
    by args.table_filename, intersects the GFF's "gene" entries with the
    table (pybedtools intersect with wb=True, s=True, f=1), and sets the
    ensg_id, refseq_id and gsymbol attributes on each gene record of the
    GFF. The annotated GFF is written through gffwriter.GFFWriter to the
    same path as the input GFF (in_place=True).

    For each of the three attributes the value is, in order of
    preference: the comma-joined unique non-null IDs found by the
    intersection, the value already present on the gene record, or "NA".

    Args:
      args: object with gff_filename and table_filename attributes

    Raises:
      Exception: if either input file does not exist
    """
    # Attribute keys shared by the genes table entries and the GFF output
    id_keys = ("ensg_id", "refseq_id", "gsymbol")
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        raise Exception("Cannot find %s" % (gff_fname))
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception("Cannot find %s" % (table_fname))
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    print("Determining overlap between events and genes...")
    # Intersect event genes with gene txStart/txEnd
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Map event genes to their IDs
    #
    #  event_gene1 -> refseq_id -> [values]
    #              -> ensg_id   -> [values]
    #              -> gsymbol   -> [values]
    #  event_gene2 -> ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip trailing semicolon of the table's attributes field
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[0:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        for key in id_keys:
            value = gene_info[key]
            # Skip null entries
            if not is_null_id(value):
                event_genes_to_info[event_gene_str][key].append(value)
    # Incorporate the gene information into the GFF and output
    # it using gffutils
    print("Loading events into GFF database...")
    events_db = gffutils.create_db(gff_fname, ":memory:", verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname, in_place=True)
    print(" - Outputting annotated GFF to: %s" % (output_fname))

    def new_recs():
        # Yield every record of every gene, with the gene record's
        # ID attributes updated from the intersection results
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            for key in id_keys:
                # Use existing IDs if present. The membership test must
                # use the same key that is read and written; testing a
                # differently-named key silently discards existing values.
                if key in gene_rec.attributes:
                    value = gene_rec.attributes[key][0]
                else:
                    value = "NA"
                if event_id in event_genes_to_info:
                    found_ids = \
                        utils.unique_list(event_genes_to_info[event_id][key])
                    if len(found_ids) > 0 and found_ids[0] != "NA":
                        value = ",".join(found_ids)
                gene_rec.attributes[key] = [value]
            # Yield all the gene's records
            for g in gene_recs:
                yield g

    t1 = time.time()
    print("Creating annotated GFF database...")
    annotated_db = gffutils.create_db(new_recs(), ":memory:", verbose=False)
    t2 = time.time()
    print("Creation took %.2f secs" % (t2 - t1))
    # Write to file
    print("Writing annotated GFF to file...")
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()
Example #4
0
def annotate_gff_with_genes(args):
    """
    Annotate GFF with genes table.

    Reads the GFF named by args.gff_filename and the genes table named
    by args.table_filename, intersects the GFF's "gene" entries with the
    table (pybedtools intersect with wb=True, s=True, f=1), and sets the
    ensg_id, refseq_id and gsymbol attributes on each gene record of the
    GFF. The annotated GFF is written through gffwriter.GFFWriter to the
    same path as the input GFF (in_place=True).

    For each of the three attributes the value is, in order of
    preference: the comma-joined unique non-null IDs found by the
    intersection, the value already present on the gene record, or "NA".

    Args:
      args: object with gff_filename and table_filename attributes

    Raises:
      Exception: if either input file does not exist
    """
    # Attribute keys shared by the genes table entries and the GFF output
    id_keys = ("ensg_id", "refseq_id", "gsymbol")
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        raise Exception("Cannot find %s" % (gff_fname))
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception("Cannot find %s" % (table_fname))
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    print("Determining overlap between events and genes...")
    # Intersect event genes with gene txStart/txEnd
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Map event genes to their IDs
    #
    #  event_gene1 -> refseq_id -> [values]
    #              -> ensg_id   -> [values]
    #              -> gsymbol   -> [values]
    #  event_gene2 -> ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip trailing semicolon of the table's attributes field
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[0:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        for key in id_keys:
            value = gene_info[key]
            # Skip null entries
            if not is_null_id(value):
                event_genes_to_info[event_gene_str][key].append(value)
    # Incorporate the gene information into the GFF and output
    # it using gffutils
    print("Loading events into GFF database...")
    events_db = gffutils.create_db(gff_fname, ":memory:",
                                   verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname,
                                     in_place=True)
    print(" - Outputting annotated GFF to: %s" % (output_fname))

    def new_recs():
        # Yield every record of every gene, with the gene record's
        # ID attributes updated from the intersection results
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            for key in id_keys:
                # Use existing IDs if present. The membership test must
                # use the same key that is read and written; testing a
                # differently-named key silently discards existing values.
                if key in gene_rec.attributes:
                    value = gene_rec.attributes[key][0]
                else:
                    value = "NA"
                if event_id in event_genes_to_info:
                    found_ids = \
                        utils.unique_list(event_genes_to_info[event_id][key])
                    if len(found_ids) > 0 and found_ids[0] != "NA":
                        value = ",".join(found_ids)
                gene_rec.attributes[key] = [value]
            # Yield all the gene's records
            for g in gene_recs:
                yield g

    t1 = time.time()
    print("Creating annotated GFF database...")
    annotated_db = gffutils.create_db(new_recs(), ":memory:",
                                      verbose=False)
    t2 = time.time()
    print("Creation took %.2f secs" % (t2 - t1))
    # Write to file
    print("Writing annotated GFF to file...")
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()