예제 #1
0
def load_gff(gff):
    """Load a GFF file and group its features by gene name.

    Lines that start with '#' or contain the text 'contig' are skipped.
    Every other line is parsed with ``GFF(line)``.  Each feature's ``id`` is
    set to its "ID" attribute, except for CDS features that lie entirely
    inside a previously seen exon of the same gene: those are named
    ``<exon ID>_CDS`` (the last matching exon wins).

    Args:
        gff: Path of the GFF file to read.

    Returns:
        A ``defaultdict(list)`` mapping ``feature.genename`` to the list of
        features for that gene, in file order.

    Raises:
        SystemExit: with a non-zero status if the file cannot be read.
    """
    genes = defaultdict(list)
    # gene name -> {exon ID: (start, end)}; used to name a CDS after its exon.
    gene_exon_positions = defaultdict(dict)
    try:
        with open(gff) as g:
            for line in g:
                if line.startswith('#') or 'contig' in line:
                    continue
                feature = GFF(line)
                gene_id = get_gene_attribute(feature, "ID")
                # Exons without an ID cannot lend their name to a CDS;
                # registering them would make `exon_id + "_CDS"` fail below.
                if feature.featuretype == 'exon' and gene_id is not None:
                    gene_exon_positions[feature.genename][gene_id] = (
                        feature.start, feature.end)
                if feature.featuretype == 'CDS':
                    exons = gene_exon_positions[feature.genename]
                    for exon_id, (exon_start, exon_end) in exons.items():
                        if (exon_start <= feature.start <= exon_end
                                and exon_start <= feature.end <= exon_end):
                            gene_id = exon_id + "_CDS"
                if gene_id is None:
                    print("No gene id for CDS found", feature, end="")
                feature.id = gene_id
                genes[feature.genename].append(feature)

    except IOError:
        # Report on stderr and exit non-zero so callers and shell pipelines
        # can tell that loading failed.
        print("Failed to load GFF file {}".format(gff), file=sys.stderr)
        sys.exit(1)

    return genes
예제 #2
0
def load_crossmap(crossmapout):
    """Load CrossMap output and group the original features by gene name.

    Two kinds of line are recognised:

    * lines containing ``fail``: the text before ``'\\tfail'`` is the
      original feature, which gets no CrossMap hit;
    * lines containing ``'\\t->\\t'``: the text before the separator is the
      original feature and the text after it is the CrossMap hit, which is
      parsed with ``GFF`` and tagged with ``;Note=CrossMap``.

    Lines matching neither pattern (blank lines, headers, ...) are skipped.
    Each feature's ``crossmap`` attribute holds its hit or ``None``.  A CDS
    whose hit has a different length from the original loses its hit.
    Feature IDs are assigned as in ``load_gff``: the "ID" attribute, or
    ``<exon ID>_CDS`` for a CDS lying inside a previously seen exon of the
    same gene (the last matching exon wins).

    Args:
        crossmapout: Path of the CrossMap output file to read.

    Returns:
        A ``defaultdict(list)`` mapping ``feature.genename`` to the list of
        original features for that gene, in file order.

    Raises:
        SystemExit: with a non-zero status if the file cannot be read.
    """
    genes = defaultdict(list)
    # gene name -> {exon ID: (start, end)}; used to name a CDS after its exon.
    gene_exon_positions = defaultdict(dict)
    try:
        with open(crossmapout) as c:
            for line in c:
                cm = None
                # Reset per line so an unrecognised line can never reuse the
                # feature text parsed from the previous line.
                featureline = None
                if 'fail' in line:
                    featureline = line.split('\tfail')[0]
                # partition() tolerates zero or several separators, where
                # unpacking the result of split() would raise ValueError.
                original, arrow, mapped = line.partition('\t->\t')
                if arrow:
                    featureline = original
                    cm = GFF(mapped)
                    cm.attributes += ';Note=CrossMap'
                if featureline is None:
                    continue
                feature = GFF(featureline)
                feature.crossmap = cm
                gene_id = get_gene_attribute(feature, "ID")
                # Exons without an ID cannot lend their name to a CDS;
                # registering them would make `exon_id + "_CDS"` fail below.
                if feature.featuretype == 'exon' and gene_id is not None:
                    gene_exon_positions[feature.genename][gene_id] = (
                        feature.start, feature.end)
                if feature.featuretype == 'CDS':
                    exons = gene_exon_positions[feature.genename]
                    for exon_id, (exon_start, exon_end) in exons.items():
                        if (exon_start <= feature.start <= exon_end
                                and exon_start <= feature.end <= exon_end):
                            gene_id = exon_id + "_CDS"

                    # Lose CrossMap features where the CDSs aren't the same
                    # length.
                    if feature.crossmap:
                        cds_len = feature.end - feature.start + 1
                        cm_len = (feature.crossmap.end
                                  - feature.crossmap.start + 1)
                        if cds_len != cm_len:
                            feature.crossmap = None
                if gene_id is None:
                    print("No gene id for CDS found", feature, end="")
                feature.id = gene_id

                genes[feature.genename].append(feature)

    except IOError:
        # Report on stderr and exit non-zero so callers and shell pipelines
        # can tell that loading failed.
        print("Failed to load CrossMap output {}".format(crossmapout),
              file=sys.stderr)
        sys.exit(1)

    return genes
예제 #3
0
def make_output_feature(outstart, outend, feature, new_parts):
    """Build the transferred GFF record for *feature*, or None if unplaced.

    Args:
        outstart: Transferred start coordinate, or None if it was not found.
        outend: Transferred end coordinate, or None if it was not found.
        feature: The original feature; its source, featuretype, score,
            strand, phase and attributes are carried over.
        new_parts: Parts covering the feature.  Only the first one is used:
            its ``oldname`` becomes the output sequence name and its
            ``strand`` decides whether the feature strand is flipped.

    Returns:
        A new ``GFF`` object, or None when either coordinate is None.
    """
    if outstart is None or outend is None:
        return None

    # Coordinates may arrive reversed; always emit start <= end.
    if outstart > outend:
        low, high = outend, outstart
    else:
        low, high = outstart, outend

    out_strand = feature.strand
    anchor = new_parts[0]
    # A part with strand -1 flips '+' and '-'; any other strand value is
    # passed through unchanged.
    if anchor.strand == -1 and out_strand in ('+', '-'):
        out_strand = '-' if out_strand == '+' else '+'

    return GFF(anchor.oldname, feature.source, feature.featuretype,
               low, high, feature.score, out_strand, feature.phase,
               feature.attributes)
예제 #4
0
def _crossmap_feature(hit):
    """Return a new GFF record copied field by field from CrossMap hit *hit*."""
    return GFF(hit.scaffold, hit.source, hit.featuretype, hit.start, hit.end,
               hit.score, hit.strand, hit.phase, hit.attributes)


def transfer_gff_feature(feature, genome, crossmap=False):
    """Transfer one feature to new coordinates and report how it went.

    The parts overlapping the feature are looked up with
    ``get_parts(feature.scaffold, feature.start, feature.end, genome)``.
    If they do not yield a usable placement, the feature's CrossMap hit (if
    any) is used instead.

    Args:
        feature: The feature to transfer.  Its ``crossmap`` attribute, when
            present and truthy, is a CrossMap hit for the feature.
        genome: Passed through unchanged to ``get_parts``.
        crossmap: If true, a feature that has a CrossMap hit is returned from
            that hit directly, without consulting ``get_parts``.

    Returns:
        A ``(GFF, status)`` tuple.  ``status`` is one of:

        * ``'ok'`` - placed from the parts;
        * ``'crossmap'`` - placed from the CrossMap hit;
        * ``'missing'`` - no parts and no haps were returned;
        * ``haps[0].parttype`` - no parts, but haps were returned;
        * ``'removed'`` - the single covering part is marked removed;
        * ``'multiscaffold'`` - the feature ends fall on different old
          sequences;
        * ``'broken'`` - a CDS spanning several parts, or any other failure
          to place both ends.

        For every status other than ``'ok'`` and ``'crossmap'`` the returned
        GFF is a copy of the input feature with its original coordinates.
    """
    # Features that were not loaded from CrossMap output may have no
    # `crossmap` attribute at all; treat that the same as "no hit".
    cm_hit = getattr(feature, 'crossmap', None)

    if crossmap and cm_hit:
        return _crossmap_feature(cm_hit), 'crossmap'

    outstart = outend = status = None
    new_parts, haps = get_parts(feature.scaffold, feature.start, feature.end,
                                genome)

    if not new_parts:
        status = haps[0].parttype if haps else 'missing'
    elif len(new_parts) == 1 and not haps:
        only_part = new_parts[0]
        if 'removed' in only_part.parttype:
            status = 'removed'
        else:
            outstart, outend = only_part.oldstart, only_part.oldend
            status = 'ok'
    else:
        scaffolds = set()
        for part in new_parts:
            if 'removed' in part.parttype:
                continue
            # On a reversed part (strand != 1) the feature start maps to the
            # part's old end and the feature end to the part's old start.
            if part.newstart <= feature.start <= part.newend:
                outstart = part.oldstart if part.strand == 1 else part.oldend
                scaffolds.add(part.oldname)
            if part.newstart <= feature.end <= part.newend:
                outend = part.oldend if part.strand == 1 else part.oldstart
                scaffolds.add(part.oldname)
        if outstart is not None and outend is not None:
            status = 'ok'
        if len(scaffolds) > 1:
            outstart = outend = None
            status = 'multiscaffold'
        # Do not allow CDSs to span multiple parts, but try to find a
        # CrossMap hit
        if feature.featuretype == 'CDS':
            outstart = outend = None
            status = 'broken'

    output_feature = make_output_feature(outstart, outend, feature, new_parts)

    if output_feature is None and cm_hit:
        output_feature = _crossmap_feature(cm_hit)
        status = 'crossmap'

    if output_feature is not None:
        return output_feature, status

    # Nothing worked: hand back a copy of the untouched feature.
    if status is None:
        status = 'broken'
    return GFF(feature.scaffold, feature.source, feature.featuretype,
               feature.start, feature.end, feature.score, feature.strand,
               feature.phase, feature.attributes), status