Example #1
0
def main():
    """Rewrite Rfam match records from a GFF stream with richer attributes.

    Command-line arguments are parsed with ``cli``. Every non-comment line
    of ``args.infile`` is parsed as a GFF record whose custom attributes
    (``mdlaccn``, ``mdlfrom``, ``mdlto``, ``evalue``, ``mdl``, ``gc``,
    ``bias``, ``trunc``, ``olp``, ``desc`` and optionally ``clan``) are
    repackaged into Name/Dbxref/Target/Note/Ontology_term attributes plus
    a reduced set of custom attributes. The rewritten record is printed to
    ``args.outfile``.

    When ``args.best`` is set, records whose ``olp`` attribute is ``"="``
    are dropped; otherwise they are kept and flagged with
    ``overlap_with_better_score``.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    # Optional lookup from model accession to ontology terms.
    go = {} if args.go is None else parse_rfam2go(args.go)

    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        fields = record.attributes.custom

        if args.best and fields["olp"] == "=":
            continue

        # The incoming feature type is kept as the record's Name.
        name = record.type
        accession = fields["mdlaccn"]

        dbxrefs = ["Rfam:" + accession, "Rfam:" + name]
        if "clan" in fields:
            dbxrefs.append("RfamClan:" + fields["clan"])

        ontology_terms = go.get(accession, [])
        notes = [fields["desc"]]

        target = Target(
            accession,
            int(fields["mdlfrom"]),
            int(fields["mdlto"]),
        )

        evalue = fields["evalue"]
        custom = {
            "evalue": evalue,
            "model_type": fields["mdl"],
            "gc": fields["gc"],
            "bias": fields["bias"],
            # The original score column is preserved here because the score
            # column itself is overwritten with the e-value below.
            "bitscore": record.score,
        }

        if fields["trunc"] == "yes":
            custom["truncated_match"] = "true"

        if fields["olp"] == "=":
            custom["overlap_with_better_score"] = "true"

        record.source = args.source
        record.type = args.type
        record.score = float(evalue)
        record.attributes = GFFAttributes(
            name=name,
            dbxref=dbxrefs,
            target=target,
            note=notes,
            ontology_term=ontology_terms,
            custom=custom,
        )

        print(record, file=args.outfile)
Example #2
0
def decode_gff(infiles, outfile, map_, column):
    """Rewrite one field of every GFF record using ``map_`` and write it out.

    The input files are combined with ``join_files``. Lines starting with
    ``#`` are passed through (stripped of surrounding whitespace); every
    other line is parsed as a GFF record and handed, together with
    ``map_``, to the replacement function selected by ``column``.

    Args:
        infiles: Input handles/paths, forwarded to ``join_files``.
        outfile: Writable text handle receiving the rewritten GFF.
        map_: Mapping object forwarded to the replacement function.
        column: One of ``"id"``, ``"name"`` or ``"seqid"``.

    Raises:
        ValueError: If ``column`` is not one of the supported values.
    """
    inhandles = join_files(infiles, header=False)

    if column == "id":
        trans_function = replace_gff_id
    elif column == "name":
        trans_function = replace_gff_name
    elif column == "seqid":
        trans_function = replace_gff_seqid
    else:
        raise ValueError("This shouldn't ever happen")

    record_chunk = list()
    for i, line in enumerate(inhandles):
        if line.startswith("#"):
            record_chunk.append(line.strip())
            continue

        old_record = GFFRecord.parse(line)
        new_record = trans_function(old_record, map_)
        record_chunk.append(str(new_record))

        # Output is buffered and flushed periodically to limit write calls.
        if i % 10000 == 0:
            outfile.write("\n".join(record_chunk))
            # Terminate the chunk: join() only puts newlines *between*
            # lines, so without this the last line of one chunk would be
            # fused with the first line of the next.
            outfile.write("\n")
            record_chunk = list()

    if len(record_chunk) > 0:
        outfile.write("\n".join(record_chunk))
        outfile.write("\n")

    return
Example #3
0
def encode_gff(
    infiles,
    outfile,
    mapfile,
    column,
    id_conv,
):
    """Replace one field of every GFF record with ids drawn from ``id_conv``.

    The input files are combined with ``join_files``. Comment lines
    (starting with ``#``) are passed through stripped; every other line is
    parsed and handed to the replacement function selected by ``column``
    together with a shared ``seen`` dict, a list that collects id pairs,
    and a callable that yields the next value of ``id_conv``.

    Rewritten records go to ``outfile`` one per line. Collected pairs go
    to ``mapfile`` as tab-separated lines. Both are buffered and flushed
    periodically.

    Raises:
        ValueError: If ``column`` is not ``"id"``, ``"name"`` or ``"seqid"``.
    """
    inhandles = join_files(infiles, header=False)
    seen = dict()

    if column == "id":
        trans_function = replace_gff_id
    elif column == "name":
        trans_function = replace_gff_name
    elif column == "seqid":
        trans_function = replace_gff_seqid
    else:
        raise ValueError("This shouldn't ever happen")

    def write_records(lines):
        # Emit buffered GFF lines, newline-terminated.
        outfile.write("\n".join(lines))
        outfile.write("\n")

    def write_mapping(pairs):
        # Emit buffered id pairs as two tab-separated columns.
        mapfile.write("".join(f"{n}\t{o}\n" for n, o in pairs))

    pending_ids = []
    pending_lines = []

    for index, line in enumerate(inhandles):
        if line.startswith("#"):
            pending_lines.append(line.strip())
            continue

        parsed = GFFRecord.parse(line)
        converted = trans_function(
            parsed,
            seen,
            pending_ids,
            lambda x: next(id_conv),
        )
        pending_lines.append(str(converted))

        if index % 10000 != 0:
            continue

        # Periodic flush; fresh buffers are started after each write.
        if pending_lines:
            write_records(pending_lines)
            pending_lines = []

        if pending_ids:
            write_mapping(pending_ids)
            pending_ids = []

    # Whatever is left after the last periodic flush.
    if pending_lines:
        write_records(pending_lines)

    if pending_ids:
        write_mapping(pending_ids)
Example #4
0
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert tab-separated rRNA predictions into gene/rRNA GFF pairs.

    For each non-comment line of ``args.infile`` the ninth column is read
    as the rRNA kind and translated through ``TYPE_MAP[args.kingdom]``.
    The source column is replaced with ``args.source``, the type column
    with the translated type, and the ninth column is blanked. Each
    resulting record is duplicated as an ``rRNA_gene`` parent. Records are
    then visited via ``GFF.traverse_children(sort=True)``, given numbered
    IDs and parent links, and printed to ``args.outfile``.
    """
    records: List[GFFRecord] = []

    for raw_line in args.infile:
        if raw_line.startswith("#"):
            continue

        columns = raw_line.strip().split("\t")
        kind = columns[8]
        mapped_type = TYPE_MAP[args.kingdom][kind.lower()]
        columns[1] = args.source
        columns[2] = mapped_type
        columns[8] = "."

        rna = GFFRecord.parse("\t".join(columns))
        # The gene is a copy of the rRNA record with a different type.
        gene = deepcopy(rna)
        gene.type = "rRNA_gene"
        gene.add_child(rna)

        records.extend((gene, rna))

    gene_count = 0
    for record in GFF(records).traverse_children(sort=True):
        attr = record.attributes
        if attr is None:
            attr = GFFAttributes()
            record.attributes = attr

        if record.type != "rRNA_gene":
            # Non-gene records reuse the number of the most recent gene.
            attr.id = f"rRNA{gene_count}"
            parent_ids = []
            for parent in record.parents:
                if parent.attributes is None:
                    continue
                if parent.attributes.id is None:
                    continue
                parent_ids.append(parent.attributes.id)
            attr.parent = parent_ids
        else:
            gene_count += 1
            attr.id = f"rRNA_gene{gene_count}"

        print(record, file=args.outfile)
Example #5
0
def main():
    """Rewrite mRNA-typed GFF features as repeat_region/helitron records.

    Command-line arguments are parsed with ``cli``. Every non-comment line
    of ``args.infile`` is parsed as a GFF record; its Name becomes its ID
    and the Name and custom attributes are cleared. After missing parents
    are inferred, each ``mRNA`` feature with at least two children is
    emitted to ``args.outfile`` as:

    * a ``repeat_region`` (a deep copy of the feature),
    * the feature itself retyped as ``helitron`` with the region as parent,
    * its two flanking children, chosen by ID suffix (``.3`` and ``.5.1``),
      retyped as three/five prime flanking regions.

    Raises:
        IndexError: If a selected feature lacks a child with the expected
            ``.3`` or ``.5.1`` ID suffix.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = list()
    for line in args.infile:
        if line.startswith("#"):
            continue
        record = GFFRecord.parse(line)
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1

    for mrna in gff.select_type("mRNA"):
        # Both flanks are required, so skip features with fewer children.
        if len(mrna.children) < 2:
            continue

        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # Must be a plain list of terms, like the other ontology_term
        # assignments below; a trailing comma here would wrap it in a tuple.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # Flanks are recognised by the suffix of the child ID.
        flank3 = [c for c in mrna.children
                  if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children
                  if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)
        # The order in which the flanks are printed depends on the strand.
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1

    return
Example #6
0
def deal_with_block(block: List[str], gene_num: int) -> List[GFF3Record]:
    """Turn one block of GTF-style lines into gene, mRNA and CDS records.

    The lines are parsed with ``GTFAttributes`` and grouped by feature
    type. The block must contain exactly one ``gene`` and one
    ``similarity`` record. ``exon`` records are re-emitted as ``CDS``
    features, and an ``mRNA`` is inferred from those CDSs and attached to
    the gene. All records use the source ``"exonerate"`` and IDs derived
    from ``gene_num``.

    Args:
        block: Raw lines belonging to a single prediction.
        gene_num: Number used to build the gene/mRNA/CDS identifiers.

    Returns:
        ``[gene, mrna, *cdss]``.
    """
    parsed: Dict[str, List[GFFRecord[GTFAttributes]]] = dict()
    for line in block:
        rec = GFFRecord.parse(line, attr=GTFAttributes)
        parsed.setdefault(rec.type, []).append(rec)

    assert len(parsed["gene"]) == 1
    assert len(parsed["similarity"]) == 1
    gene_parsed = parsed["gene"][0]
    similarity_parsed = parsed["similarity"][0]

    # Attributes carried over onto the new gene record.
    custom: Dict[str, str] = dict()
    if similarity_parsed.attributes is not None:
        custom["query"] = similarity_parsed.attributes.custom["Query"]

    if gene_parsed.attributes is not None:
        for key in ("identity", "similarity"):
            custom[key] = gene_parsed.attributes.custom[key]

    gene = GFF3Record(
        gene_parsed.seqid,
        "exonerate",
        type="gene",
        start=gene_parsed.start,
        end=gene_parsed.end,
        score=gene_parsed.score,
        strand=gene_parsed.strand,
        phase=gene_parsed.phase,
        attributes=GFF3Attributes(
            id=f"gene{gene_num}",
            custom=custom,
        )
    )

    # Every exon becomes a CDS; all of them share one ID and one parent.
    cdss = []
    for exon in parsed["exon"]:
        if exon.attributes is not None:
            exon_custom = exon.attributes.custom
        else:
            exon_custom = None

        cds = GFF3Record(
            exon.seqid,
            "exonerate",
            "CDS",
            exon.start,
            exon.end,
            exon.score,
            exon.strand,
            exon.phase,
            attributes=GFF3Attributes(
                id=f"CDS{gene_num}",
                parent=[f"mRNA{gene_num}"],
                custom=exon_custom,
            )
        )
        cdss.append(cds)

    if gene.attributes is not None:
        for cds in cdss:
            # This is safe because we added attributes.
            assert cds.attributes is not None
            cds.attributes.custom["query"] = gene.attributes.custom["query"]

    mrna = GFF3Record.infer_from_children(
        cdss,
        id=f"mRNA{gene_num}",
        seqid=gene.seqid,
        source="exonerate",
        type="mRNA",
        strand=gene.strand,
        score=gene.score,
    )

    mrna.add_parent(gene)

    gene_attrs = gene.attributes
    if gene_attrs is not None:
        # This is safe because infer_from_children adds an ID to attributes.
        assert mrna.attributes is not None
        if gene_attrs.id is not None:
            mrna.attributes.parent = [gene_attrs.id]
        mrna.attributes.custom["query"] = gene_attrs.custom["query"]

    return [gene, mrna, *cdss]