Python Gene.fill_parentsの例、ppanggolin.genome.Gene.fill_parents Pythonの例

コード例 #1

0

ファイルを表示

ファイル: annotate.py プロジェクト: pythseq/PPanGGOLiN

def create_gene(org, contig, geneCounter, rnaCounter, ID, dbxref, start, stop,
                strand, gene_type, position=None, gene_name="", product="",
                genetic_code=11, protein_id=""):
    """Create a Gene (for a CDS) or an RNA (for anything else) and attach it to ``contig``.

    When one of the ``dbxref`` entries contains 'MaGe', the text following the
    first ':' of the first such entry becomes the local identifier; in that
    case the original ``ID`` is reused as the gene name if no name was given.

    The new object is named ``<org.name>_CDS_<geneCounter>`` or
    ``<org.name>_RNA_<rnaCounter>`` (counter zero-padded to 4 digits). The
    counters are only read here, never incremented.

    :param org: organism the feature belongs to (its ``name`` prefixes the identifier)
    :param contig: contig receiving the feature through ``addGene`` / ``addRNA``
    :param geneCounter: number used to build a CDS identifier
    :param rnaCounter: number used to build an RNA identifier
    :param ID: identifier read from the annotation file
    :param dbxref: iterable of database cross-reference strings
    :param gene_type: "CDS" creates a Gene, any other value creates an RNA
    :param protein_id: fallback local identifier for a CDS whose ``ID`` is empty
    :return: None
    """
    # The first MaGe cross-reference, if any, overrides the identifier.
    mage_ref = next((ref for ref in dbxref if 'MaGe' in ref), None)
    if mage_ref is not None:
        if gene_name == "":
            gene_name = ID
        ID = mage_ref.split(':')[1]

    # Annotations passed identically for both kinds of feature.
    shared = dict(start=start,
                  stop=stop,
                  strand=strand,
                  geneType=gene_type,
                  name=gene_name,
                  product=product)

    if gene_type != "CDS":
        # Everything that is not a CDS is stored as an RNA.
        feature = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
        feature.fill_annotations(**shared)
        contig.addRNA(feature)
    else:
        if ID == "":
            # On rare occasions a downloaded .gbk file has no 'locus_tag', so
            # the protein_id field is used instead (it is not supposed to be
            # unique, but was in the cases encountered so far).
            ID = protein_id
        feature = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))
        feature.fill_annotations(position=position,
                                 genetic_code=genetic_code,
                                 local_identifier=ID,
                                 **shared)
        contig.addGene(feature)
    feature.fill_parents(org, contig)

コード例 #2

0

ファイルを表示

def create_gene(org, contig, ID, dbxref, start, stop, strand, gene_type,
                position=None, gene_name="", product="", genetic_code=11):
    """Create a Gene (for a CDS) or an RNA (for any other type) named ``ID``.

    If a ``dbxref`` entry contains 'MaGe', the first such entry supplies the
    identifier (the text after its first ':'), and the original ``ID`` is
    kept as the gene name when no name was provided.

    The created object is registered on ``contig`` and linked to its parents
    through ``fill_parents``.

    :param org: organism the feature belongs to
    :param contig: contig receiving the feature through ``addGene`` / ``addRNA``
    :param ID: identifier of the feature
    :param dbxref: iterable of database cross-reference strings
    :param gene_type: "CDS" creates a Gene, any other value creates an RNA
    :return: None
    """
    for ref in dbxref:
        if 'MaGe' not in ref:
            continue
        # Only the first MaGe cross-reference is taken into account.
        if gene_name == "":
            gene_name = ID
        ID = ref.split(':')[1]
        break

    is_cds = gene_type == "CDS"
    if is_cds:
        feature = Gene(ID)
        feature.fill_annotations(start=start,
                                 stop=stop,
                                 strand=strand,
                                 geneType=gene_type,
                                 position=position,
                                 name=gene_name,
                                 product=product,
                                 genetic_code=genetic_code)
        contig.addGene(feature)
    else:
        # Not a CDS: the feature is stored as an RNA.
        feature = RNA(ID)
        feature.fill_annotations(start=start,
                                 stop=stop,
                                 strand=strand,
                                 geneType=gene_type,
                                 name=gene_name,
                                 product=product)
        contig.addRNA(feature)
    feature.fill_parents(org, contig)

コード例 #3

0

ファイルを表示

ファイル: test_Region.py プロジェクト: labgem/PPanGGOLiN

def l_genes(o_org, o_contig):
    """Create a small gene set for testing.

    Builds eight genes on the '+' strand, all attached to the same contig and
    the same organism. Gene number ``i`` starts at ``10 + 35 * i`` and stops
    30 positions later, and is given its own GeneFamily placed in the
    "c-cloud" partition.

    :return: the list of created genes, in creation order
    """
    names = ("toto", "tata", "titi", "tutu", "lolo", "lala", "lili", "lulu")
    created = []
    for rank, name in enumerate(names):
        begin = 10 + 35 * rank
        gene = Gene(name)
        gene.fill_annotations(begin, begin + 30, "+", position=rank)
        gene.fill_parents(o_org, o_contig)
        o_contig.addGene(gene)
        gene.family = GeneFamily(rank, name)
        gene.family.addPartition("c-cloud")
        created.append(gene)
    return created

コード例 #4

0

ファイルを表示

ファイル: test_GeneFamily.py プロジェクト: vinisalazar/PPanGGOLiN

def test_mkBitArray_with_org(o_family):
    """A family holding one gene of an organism sets exactly that organism's bit.

    For each tested index ``i`` the organism is mapped to position ``i`` and
    the resulting bitarray is expected to equal ``1 << i``.
    """
    org_key = "organism"
    gene = Gene(33)
    gene.fill_parents(org_key, None)
    o_family.addGene(gene)

    for position in (1, 3, 7, 12):
        o_family.mkBitarray({org_key: position})
        assert o_family.bitarray == 1 << position

コード例 #5

0

ファイルを表示

ファイル: test_Pangenome.py プロジェクト: zhaoc1/PPanGGOLiN

    def _make_gene_pair(org, gene_id1, gene_id2):
        """Return two genes of the same organism, each added to its own family.

        Every gene gets ``org`` as organism and no contig; its family is
        created with the gene identifier as both constructor arguments.
        """
        pair = []
        for identifier in (gene_id1, gene_id2):
            gene = Gene(identifier)
            gene.fill_parents(org, None)
            pair.append(gene)

            GeneFamily(identifier, identifier).addGene(gene)

        return tuple(pair)

コード例 #6

0

ファイルを表示

ファイル: test_Region.py プロジェクト: zhaoc1/PPanGGOLiN

def l_glist(o_org, o_contig):
    """Create a small testing context: four genes on one contig of one organism.

    Gene number ``i`` lies on the '+' strand, starts at ``10 + 35 * i`` and
    stops 30 positions later. The ``family`` attribute of each gene is set to
    the gene identifier itself (a plain string).

    :return: the list of created genes, in creation order
    """
    created = []
    for rank, name in enumerate(("toto", "tata", "titi", "tutu")):
        begin = 10 + 35 * rank
        gene = Gene(name)
        gene.fill_annotations(begin, begin + 30, "+", position=rank)
        gene.fill_parents(o_org, o_contig)
        o_contig.addGene(gene)
        gene.family = name
        created.append(gene)
    return created

コード例 #7

0

ファイルを表示

    def _make_gene_pair(org, gene_id1, gene_id2):
        """Create two genes from ``org``, each one belonging to its own family.

        The genes have no contig. They are returned as a tuple, in the order
        of the identifiers given.
        """
        def _gene_with_family(identifier):
            # One gene, attached to org, placed alone in a fresh family.
            gene = Gene(identifier)
            gene.fill_parents(org, None)
            family = GeneFamily(identifier, identifier)
            family.addGene(gene)
            return gene

        first = _gene_with_family(gene_id1)
        second = _gene_with_family(gene_id2)
        return (first, second)

コード例 #8

0

ファイルを表示

def test_cstr():
    """An Edge built from two genes of one family and one organism is well formed.

    Both genes get ``None`` as organism and contig and share a single family;
    the edge must expose that family as source and target, and index the gene
    pair under the ``None`` organism.
    """
    source_gene = Gene('source')
    target_gene = Gene('target')
    endpoints = (source_gene, target_gene)

    # Organism and contig are both None for the two genes.
    for gene in endpoints:
        gene.fill_parents(None, None)

    # A single family, built with None identifiers, holds both genes.
    shared_family = GeneFamily(None, None)
    for gene in endpoints:
        shared_family.addGene(gene)

    edge = Edge(source_gene, target_gene)
    assert isinstance(edge, Edge)

    assert edge.source == source_gene.family
    assert edge.target == target_gene.family
    assert dict(edge.organisms) == {None: [(source_gene, target_gene)]}

コード例 #9

0

ファイルを表示

def test_cstr_error():
    """Edge construction must fail while the genes are not properly set up.

    Three situations are checked in turn: no gene has a family, only one gene
    has a family, and the two genes were given different organisms.
    """
    source_gene = Gene('source')
    target_gene = Gene('target')

    # Neither gene has a family yet.
    with pytest.raises(Exception):
        Edge(source_gene, target_gene)

    # Only the source gene has a family: both genes should have one.
    family = GeneFamily(None, None)
    family.addGene(source_gene)
    with pytest.raises(Exception):
        Edge(source_gene, target_gene)

    # Both genes have a family, but they should belong to the same organism.
    family.addGene(target_gene)
    source_gene.fill_parents("", None)
    target_gene.fill_parents(None, None)
    with pytest.raises(Exception):
        Edge(source_gene, target_gene)

コード例 #10

0

ファイルを表示

ファイル: test_GeneFamily.py プロジェクト: vinisalazar/PPanGGOLiN

def filled_families():
    """
    Build a random number (between 3 and 10, inclusive) of single-gene families.

    Gene and family number ``k`` both use ``k`` as identifier; the genes have
    neither organism nor contig.

    :return: a pair ``(families, genes)`` of lists sharing the same order
    """
    families = []
    genes = []
    for ident in range(randint(3, 10)):
        gene = Gene(ident)
        gene.fill_parents(None, None)

        family = GeneFamily(ident, ident)
        family.addGene(gene)

        genes.append(gene)
        families.append(family)

    return families, genes

コード例 #11

0

ファイルを表示

ファイル: readBinaries.py プロジェクト: zhaoc1/PPanGGOLiN

def readOrganism(pangenome, orgName, contigDict, circularContigs, link=False):
    """Rebuild one organism from stored annotation rows and add it to ``pangenome``.

    :param pangenome: pangenome receiving the organism; when ``link`` is true
        it is also queried (``getGene``) for the already existing genes
    :param orgName: name of the organism to create
    :param contigDict: mapping contig name -> iterable of rows. Each row is
        indexed by field name ("ID", "type", "start", "stop", "strand",
        "position", "genetic_code", "name", "product", "is_fragment" and
        optionally "local"); text fields are bytes and are decoded here.
    :param circularContigs: mapping contig name -> whether the contig is circular
    :param link: if true, genes are fetched from ``pangenome`` instead of
        being created
    :raises Exception: if a row has a type that is neither "CDS" nor an RNA type
    """
    org = Organism(orgName)
    for contigName, geneList in contigDict.items():
        contig = org.getOrAddContig(contigName,
                                    is_circular=circularContigs[contigName])
        for row in geneList:
            # The type is needed whatever the value of `link`: it selects the
            # class to instantiate and the contig container to fill. Decoding
            # it once up front keeps it defined when `link` is true.
            gene_type = row["type"].decode()
            is_cds = gene_type == "CDS"
            if not is_cds and "RNA" not in gene_type:
                # Reject unknown types before touching any gene object, so a
                # previously read gene is never overwritten by this row.
                raise Exception(
                    f"A strange type '{gene_type}', which we do not know what to do with, was met."
                )

            gene_id = row["ID"].decode()
            if link:  # the gene families are already computed/loaded, so the gene exists.
                # NOTE(review): RNA rows are looked up through getGene too, as
                # before - confirm that the pangenome indexes them.
                gene = pangenome.getGene(gene_id)
            elif is_cds:
                gene = Gene(gene_id)
            else:
                gene = RNA(gene_id)

            try:
                local = row["local"].decode()
            except ValueError:
                # presumably raised when the stored rows have no "local" field
                local = ""
            gene.fill_annotations(start=row["start"],
                                  stop=row["stop"],
                                  strand=row["strand"].decode(),
                                  geneType=gene_type,
                                  position=row["position"],
                                  genetic_code=row["genetic_code"],
                                  name=row["name"].decode(),
                                  product=row["product"].decode(),
                                  local_identifier=local)
            gene.is_fragment = row["is_fragment"]
            gene.fill_parents(org, contig)
            if is_cds:
                contig.addGene(gene)
            else:
                contig.addRNA(gene)
    pangenome.addOrganism(org)

コード例 #12

0

ファイルを表示

ファイル: annotate.py プロジェクト: Alastor-pentious/PPanGGOLiN

def read_org_gff(organism,
                 gff_file_path,
                 circular_contigs,
                 getSeq,
                 pseudo=False):
    """Read a GFF3 annotation file and build the corresponding Organism.

    CDS features become Gene objects and features whose type contains "RNA"
    become RNA objects; both are attached to the contig named in the first
    GFF column. When the file embeds a ``##FASTA`` section and ``getSeq`` is
    true, the DNA sequence of every gene and RNA is extracted as well.

    :param organism: name of the organism to create
    :param gff_file_path: path to the (possibly compressed) gff file
    :param circular_contigs: collection of the names of the circular contigs
    :param getSeq: whether the embedded FASTA section has to be read
    :param pseudo: whether CDS flagged as pseudogenes are kept
    :return: the Organism, and whether a FASTA section was read
    :rtype: tuple
    """
    (GFF_seqname, _,
     GFF_type, GFF_start, GFF_end, _, GFF_strand, _, GFF_attribute) = range(
         0, 9)  # missing values : source, score, frame. They are unused.

    def getGffAttributes(gff_fields):
        """
            Parses the gff attribute's line and outputs the attributes in a dict structure.
            Keys are stored upper-cased.
            :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
            :type list:
            :return: attributes:
            :rtype: dict
        """
        attributes_field = [
            f for f in gff_fields[GFF_attribute].strip().split(';')
            if len(f) > 0
        ]
        attributes = {}
        for att in attributes_field:
            try:
                (key, value) = att.strip().split('=')
                attributes[key.upper()] = value
            except ValueError:
                pass  # we assume that it is a strange, but useless field for our analysis
        return attributes

    def getIDAttribute(attributes):
        """
            Gets the ID of the element from which the provided attributes were extracted.
            Logs an error and exits with status 1 if no ID is found.
            :param attribute:
            :type dict:
            :return: ElementID:
            :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            # Same effect as the interactive `exit(1)` helper, without
            # depending on the `site` module being loaded.
            raise SystemExit(1)
        return ElementID

    hasFasta = False
    fasta_lines = []  # joined once at the end rather than concatenated per line
    org = Organism(organism)
    contig = None  # no contig is known until a directive or a feature names one
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                fasta_lines.append(line)
                continue
            if line.startswith('##'):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:  # if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(fields[1],
                                                fields[1] in circular_contigs)
                continue
            if line.startswith('#') or not line.strip():
                # '#!' special refseq comment lines (versioning of softs,
                # assemblies and annotations), plain comments and blank lines
                # carry no feature.
                continue
            gff_fields = [el.strip() for el in line.split('\t')]
            attributes = getGffAttributes(gff_fields)
            seqname = gff_fields[GFF_seqname]
            feature_type = gff_fields[GFF_type]
            if feature_type == 'region':
                if seqname in circular_contigs:
                    # Flag the contig named on this line, which is not
                    # necessarily the one seen last.
                    contig = org.getOrAddContig(seqname, True)
                    contig.is_circular = True
            elif feature_type == 'CDS' or "RNA" in feature_type:
                # if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.
                geneID = attributes.get("PROTEIN_ID")
                if geneID is None:
                    # otherwise we get the one under the 'ID' field, which must exist (otherwise not a gff3 compliant file)
                    geneID = getIDAttribute(attributes)

                name = attributes.pop('NAME', None)
                if name is None:
                    name = attributes.pop('GENE', "")

                # Attribute keys are upper-cased by getGffAttributes, so the
                # flags have to be looked up upper-cased too.
                pseudogene = "PSEUDO" in attributes or "PSEUDOGENE" in attributes
                product = attributes.pop('PRODUCT', "")
                genetic_code = attributes.pop("TRANSL_TABLE", "11")

                if contig is None or contig.name != seqname:
                    contig = org.getOrAddContig(seqname)  # get the current contig

                if feature_type == "CDS" and (not pseudogene or pseudo):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))

                    # here contig is filled in order, so position is the number of genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=feature_type,
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in feature_type:
                    # "_RNA_" keeps RNA identifiers distinct from the
                    # "_CDS_" identifiers given to genes, both counters
                    # starting at 0.
                    rna = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=feature_type,
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1
    ### GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fasta_lines:
        contigSequences = read_fasta(org, "".join(fasta_lines).split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(
                    get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name],
                                             rna))
    return org, hasFasta