Example #1
def getGeneSequencesFromFastas(pangenome, fasta_file):
    fastaDict = {}
    for line in read_compressed_or_not(fasta_file):
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            logging.getLogger().error(
                "No tabulation separator found in organisms file")
            exit(1)
        org = pangenome.addOrganism(elements[0])
        with read_compressed_or_not(elements[1]) as currFastaFile:
            fastaDict[org] = read_fasta(org, currFastaFile)
    if not set(pangenome.organisms) <= set(fastaDict.keys()):
        missing = len(pangenome.organisms) - len(
            set(pangenome.organisms) & set(fastaDict.keys()))
        raise Exception(
            f"Not all of your pangenome's organisms are present within the provided fasta file. {missing} are missing (out of {len(pangenome.organisms)})."
        )

    for org in pangenome.organisms:
        try:
            for contig in org.contigs:
                for gene in contig.genes:
                    gene.add_dna(
                        get_dna_sequence(fastaDict[org][contig.name], gene))
                for rna in contig.RNAs:
                    rna.add_dna(
                        get_dna_sequence(fastaDict[org][contig.name], rna))
        except KeyError:
            msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} that was read from the annotation file. "
            msg += f"The provided contigs in the fasta were : { ', '.join([contig for contig in fastaDict[org].keys()])}."
            raise KeyError(msg)
    pangenome.status["geneSequences"] = "Computed"
Example #2
def writeRegionsSequences(pangenome, output, compress, regions, fasta, anno, disable_bar=False):
    organisms_file = fasta if fasta is not None else anno
    org_dict = {}
    for line in read_compressed_or_not(organisms_file):
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            logging.getLogger().error(
                f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'")
            exit(1)
        org_dict[elements[0]] = elements[1]

    logging.getLogger().info(f"Writing {regions} rgp genomic sequences...")
    regions_to_write = []
    if regions == "complete":
        for region in pangenome.regions:
            if not region.isContigBorder:
                regions_to_write.append(region)
    else:
        regions_to_write = pangenome.regions

    regions_to_write = sorted(regions_to_write, key=lambda x: x.organism.name)
    # order regions by organism, so that we only have to read one genome at the time

    outname = output + f"/{regions}_rgp_genomic_sequences.fasta"
    with write_compressed_or_not(outname, compress) as fasta:
        loaded_genome = ""
        bar = tqdm(regions_to_write, unit="rgp", disable=disable_bar)
        for region in bar:
            if region.organism.name != loaded_genome:
                loaded_genome = region.organism.name
                genome_sequence = read_genome_file(org_dict, loaded_genome)
            fasta.write(f">{region.name}\n")
            fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60))
        bar.close()
    logging.getLogger().info(f"Done writing the regions nucleotide sequences: '{outname}'")
Example #3
def read_fasta_gbk(filename):
    """Extracts the contig sequences of a gbk/gbff file into a dict keyed by contig ID."""
    sequence_dict = {}
    lines = read_compressed_or_not(filename).readlines()[::-1]
    while len(lines) != 0:
        line = lines.pop()
        # beginning of contig
        if line.startswith('LOCUS'):
            contigLocusID = line.split()[1]
            contigID = ""
            # The contig ID may not be given in a later VERSION line (e.g. with Prokka); in that case we fall back to the LOCUS ID.
            while not line.startswith('FEATURES'):
                if line.startswith('VERSION'):
                    contigID = line[12:].strip()
                line = lines.pop()
        if contigID == "":
            contigID = contigLocusID
        while not line.startswith("ORIGIN"):
            line = lines.pop()  # skip the feature table
        line = lines.pop()  # first sequence line.
        sequence = ""
        while not line.startswith('//'):
            sequence += line[10:].replace(" ", "").strip().upper()
            line = lines.pop()
        # store the full contig sequence.
        sequence_dict[contigID] = sequence
        # end of contig
    return sequence_dict
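
A hypothetical call (the file name is illustrative only) that summarises the contigs parsed by read_fasta_gbk:

contig_sequences = read_fasta_gbk("genome.gbff.gz")
for contig_id, seq in contig_sequences.items():
    print(f"{contig_id}: {len(seq)} bp")
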
Example #4
def annotate_organism(orgName, fileName, circular_contigs, code, kingdom, norna, tmpdir, overlap):
    """
        Function to annotate a single organism
    """
    org = Organism(orgName)

    fastaFile = read_compressed_or_not(fileName)
    contigSequences = read_fasta(org, fastaFile)
    if is_compressed(fileName):
        fastaFile = write_tmp_fasta(contigSequences, tmpdir)

    genes = syntaxic_annotation(org, fastaFile, norna, kingdom, code, tmpdir)
    genes = overlap_filter(genes, contigSequences, overlap)

    for contigName, contig_genes in genes.items():
        contig = org.getOrAddContig(contigName)
        if contig.name in circular_contigs:
            contig.is_circular = True
        for gene in contig_genes:
            gene.add_dna(get_dna_sequence(contigSequences[contig.name], gene))
            gene.fill_parents(org, contig)
            if isinstance(gene, Gene):
                contig.addGene(gene)
            elif isinstance(gene, RNA):
                contig.addRNA(gene)
    return org
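
annotate_organism relies on a write_tmp_fasta helper (used when the input is compressed) that is not shown. A minimal sketch, assuming it only needs to dump the contig-name-to-sequence dict into a temporary uncompressed fasta and return the open handle:

import tempfile

def write_tmp_fasta_sketch(contig_sequences, tmpdir):
    # hypothetical helper: write each contig as 60-character fasta lines so external tools can read it
    tmp_file = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, suffix=".fasta")
    for name, sequence in contig_sequences.items():
        tmp_file.write(f">{name}\n")
        for i in range(0, len(sequence), 60):
            tmp_file.write(sequence[i:i + 60] + "\n")
    tmp_file.flush()
    return tmp_file
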
Example #5
def readAnnotations(pangenome,
                    organisms_file,
                    cpu,
                    getSeq=True,
                    pseudo=False,
                    show_bar=True):
    logging.getLogger().info("Reading " + organisms_file +
                             " the list of organism files ...")

    pangenome.status["geneSequences"] = "Computed"
    # we assume there are gene sequences in the annotation files,
    # unless a gff file without fasta is met (which is the only case where sequences can be absent)
    args = []
    for line in read_compressed_or_not(organisms_file):
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            logging.getLogger().error(
                f"No tabulation separator found in given --fasta file: '{organisms_file}'"
            )
            exit(1)
        args.append((elements[0], elements[1], elements[2:], getSeq, pseudo))
    bar = tqdm(range(len(args)), unit="file", disable=not show_bar)
    with Pool(cpu) as p:
        for org, flag in p.imap_unordered(launchReadAnno, args):
            pangenome.addOrganism(org)
            if not flag:
                pangenome.status["geneSequences"] = "No"
            bar.update()
    bar.close()

    pangenome.status["genomesAnnotated"] = "Computed"
    pangenome.parameters["annotation"] = {}
    pangenome.parameters["annotation"]["read_annotations_from_file"] = True
Example #6
def annotatePangenome(pangenome, fastaList, tmpdir, cpu, translation_table="11", kingdom="bacteria", norna=False, overlap=True):
    logging.getLogger().info(f"Reading {fastaList} the list of organism files")

    arguments = []
    for line in read_compressed_or_not(fastaList):
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            logging.getLogger().error("No tabulation separator found in organisms file")
            exit(1)
        arguments.append((elements[0], elements[1], elements[2:], translation_table, kingdom, norna, tmpdir, overlap))
    if len(arguments) == 0:
        raise Exception("There are no genomes in the provided file")
    logging.getLogger().info(f"Annotating {len(arguments)} genomes using {cpu} cpus...")
    with Pool(processes=cpu) as p:
        bar = tqdm(range(len(arguments)), unit="genome")
        for organism in p.imap_unordered(launchAnnotateOrganism, arguments):
            bar.update()
            pangenome.addOrganism(organism)
        p.close()
        p.join()
    bar.close()

    logging.getLogger().info("Done annotating genomes")
    pangenome.status["genomesAnnotated"] = "Computed"#the pangenome is now annotated.
    pangenome.status["geneSequences"] = "Computed"#the gene objects have their respective gene sequences.
    pangenome.parameters["annotation"] = {}
    pangenome.parameters["annotation"]["remove_Overlapping_CDS"] = overlap
    pangenome.parameters["annotation"]["annotate_RNA"] = True if not norna else False
    pangenome.parameters["annotation"]["kingdom"] = kingdom
    pangenome.parameters["annotation"]["translation_table"] = translation_table
    pangenome.parameters["annotation"]["read_annotations_from_file"] = False
Example #7
def readAnnotations(pangenome, organisms_file, cpu, pseudo=False, disable_bar=False):
    logging.getLogger().info("Reading " + organisms_file + " the list of organism files ...")

    pangenome.status["geneSequences"] = "Computed"
    # we assume there are gene sequences in the annotation files,
    # unless a gff file without fasta is met (which is the only case where sequences can be absent)
    args = []
    for line in read_compressed_or_not(organisms_file):
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            raise Exception(f"No tabulation separator found in given --fasta file: '{organisms_file}'")
        args.append((elements[0], elements[1], elements[2:], pseudo))
    bar = tqdm(range(len(args)), unit="file", disable=disable_bar)
    with get_context('fork').Pool(cpu) as p:
        for org, flag in p.imap_unordered(launchReadAnno, args):
            pangenome.addOrganism(org)
            if not flag:
                pangenome.status["geneSequences"] = "No"
            bar.update()
    bar.close()

    # decide whether to use the local gene identifiers or PPanGGOLiN-generated ones.
    used_local_identifiers = choseGeneIdentifiers(pangenome)
    if used_local_identifiers:
        logging.getLogger().info("gene identifiers used in the provided annotation files were unique, PPanGGOLiN will use them.")
    else:
        logging.getLogger().info("gene identifiers used in the provided annotation files were not unique, PPanGGOLiN will use self-generated identifiers.")

    pangenome.status["genomesAnnotated"] = "Computed"
    pangenome.parameters["annotation"] = {}
    pangenome.parameters["annotation"]["used_local_identifiers"] = used_local_identifiers
    pangenome.parameters["annotation"]["read_pseudogenes"] = pseudo
    pangenome.parameters["annotation"]["read_annotations_from_file"] = True
Example #8
def readClustering(pangenome,
                   families_tsv_file,
                   infer_singletons=False,
                   force=False):
    """
        Creates the pangenome, the gene families and the genes with an associated gene family.
        Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
    """
    checkPangenomeFormerClustering(pangenome, force)
    checkPangenomeInfo(pangenome, needAnnotations=True)

    logging.getLogger().info("Reading " + families_tsv_file +
                             " the gene families file ...")
    filesize = os.stat(families_tsv_file).st_size
    families_tsv_file = read_compressed_or_not(families_tsv_file)
    frag = False
    # the genome annotations are necessarily loaded.
    nbGeneWtFam = 0
    bar = tqdm(total=filesize, unit="bytes")
    for line in families_tsv_file:
        bar.update(len(line))
        elements = [el.strip()
                    for el in line.split()]  # 2 or 3 fields expected
        if len(elements) <= 1:
            logging.getLogger().error(
                "No tabulation separator found in gene families file")
            exit(1)
        (fam_id, gene_id,
         is_frag) = elements if len(elements) == 3 else elements + [None]

        geneObj = pangenome.getGene(gene_id)
        if geneObj is not None:
            nbGeneWtFam += 1
            fam = pangenome.addGeneFamily(fam_id)
            geneObj.is_fragment = is_frag == "F"
            fam.addGene(geneObj)
        if is_frag == "F":
            frag = True
    bar.close()
    families_tsv_file.close()
    if nbGeneWtFam < len(
            pangenome.genes):  #not all genes have an associated cluster
        if nbGeneWtFam == 0:
            raise Exception(
                "No gene ID in the cluster file matched any gene ID from the annotation step. Please ensure that the annotations that you loaded previously and the clustering results that you have use the same gene IDs."
            )
        else:
            if infer_singletons:
                inferSingletons(pangenome)
            else:
                raise Exception(
                    "Some genes did not have an associated cluster. Either change your cluster file so that each gene has a cluster, or use the --infer_singletons option to infer a cluster for each non-clustered gene."
                )
    pangenome.status["genesClustered"] = "Computed"
    if frag:  # if there was fragment information in the file.
        pangenome.status["defragmented"] = "Computed"
    pangenome.parameters["cluster"] = {}
    pangenome.parameters["cluster"]["read_clustering_from_file"] = True
    pangenome.parameters["cluster"]["infer_singletons"] = infer_singletons
Example #9
def detect_filetype(filename):
    """ detects whether the current file is gff3, gbk/gbff or unknown. If unknown, it will raise an error"""
    with read_compressed_or_not(filename) as f:
        firstLine = f.readline()
    if firstLine.startswith("LOCUS       "):#then this is probably a gbff/gbk file
        return "gbff"
    elif firstLine.startswith("##gff-version 3"):
        return 'gff'
    else:
        raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') nor gbff/gbk (file starts with 'LOCUS       '). Only those two file formats are supported (for now).")
Example #10
def align(pangenome,
          proteinFile,
          output,
          tmpdir,
          identity=0.8,
          coverage=0.8,
          defrag=False,
          cpu=1,
          getinfo=False,
          draw_related=False):
    if pangenome.status["geneFamilySequences"] not in [
            "inFile", "Loaded", "Computed"
    ]:
        raise Exception(
            "Cannot use this function as your pangenome does not have gene families representatives associated to it. For now this works only if the clustering is realised by PPanGGOLiN."
        )
    if getinfo:
        checkPangenomeInfo(pangenome,
                           needAnnotations=True,
                           needFamilies=True,
                           needRGP=True,
                           needPartitions=True,
                           needSpots=True)
    else:
        checkPangenomeInfo(pangenome, needFamilies=True)

    newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
    tmpPangFile = tempfile.NamedTemporaryFile(mode="w", dir=newtmpdir.name)

    writeGeneFamSequences(pangenome, tmpPangFile)

    with read_compressed_or_not(proteinFile) as protFileObj:
        protSet = getProt(protFileObj)
        alignFile = alignSeqToPang(tmpPangFile, protFileObj, output, newtmpdir,
                                   cpu, defrag, identity, coverage)

    prot2pang = readAlignments(alignFile, pangenome)

    if getinfo:
        getProtInfo(prot2pang, pangenome, output, cpu, draw_related)
    else:
        partProj = projectPartition(
            prot2pang, protSet, output)  # write only the partition assignment
        logging.getLogger().info(
            f"proteins partition projection : '{partProj}'")
    logging.getLogger().info(
        f"{len(prot2pang)} proteins over {len(protSet)} have at least one hit in the pangenome."
    )
    logging.getLogger().info(
        f"Blast-tab file of the alignment : '{alignFile}'")

    tmpPangFile.close()
    newtmpdir.cleanup()
Example #11
def get_seq2pang(pangenome,
                 sequenceFile,
                 output,
                 tmpdir,
                 cpu=1,
                 no_defrag=False,
                 identity=0.8,
                 coverage=0.8):
    """
    Assign a pangenome gene family to the input sequences.

    :param pangenome: Pangenome with gene families to align with the given input sequences
    :type pangenome: Pangenome
    :param sequenceFile: Sequences in a .fasta file to align with the given Pangenome
    :type sequenceFile: str
    :param output: Output directory
    :type output: str
    :param tmpdir: Temporary directory
    :type tmpdir: tempfile.TemporaryDirectory
    :param cpu: number of CPU cores to use 
    :type cpu: int
    :param no_defrag: do not use the defrag workflow if true
    :type no_defrag: Boolean
    :param identity: minimal identity threshold for the alignment
    :type identity: float
    :param coverage: minimal coverage threshold for the alignment
    :type coverage: float

    :return: sequence set, blast-tab result file string, and sequences aligned with families
    :rtype: set, str, dict
    """
    tmpPangFile = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name)

    writeGeneFamSequences(pangenome, tmpPangFile, add="ppanggolin_")

    with read_compressed_or_not(sequenceFile) as seqFileObj:
        seqSet = getSeq(seqFileObj)
        alignFile = alignSeqToPang(tmpPangFile, seqFileObj, output, tmpdir,
                                   cpu, no_defrag, identity, coverage)

    seq2pang, alignFile = readAlignments(alignFile, pangenome)

    tmpPangFile.close()

    return seqSet, alignFile, seq2pang
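
A hypothetical call matching the documented signature, assuming the pangenome already carries gene family representative sequences and that the output directory exists:

import tempfile

tmp = tempfile.TemporaryDirectory()
seq_set, align_file, seq2pang = get_seq2pang(pangenome, "query_sequences.fasta",
                                             "outdir", tmp, cpu=4,
                                             identity=0.8, coverage=0.8)
print(f"{len(seq2pang)} of {len(seq_set)} sequences hit a gene family")
tmp.cleanup()
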
Example #12
def writeGbffRegions(filename, regions, output):
    ContigRegions = defaultdict(set)
    for region in regions:
        ContigRegions[region.contig.name].add(region)

    for contigName in ContigRegions.keys():
        ContigRegions[contigName] = sorted(ContigRegions[contigName],
                                           key=lambda x: x.start,
                                           reverse=True)

    foutfile = open(output + "/genome_annotation.gbff", "w")

    curr_contig = None
    with read_compressed_or_not(filename) as fannot:
        for line in fannot:
            if line.startswith("VERSION"):
                curr_contig = line.split()[1]
            if curr_contig in ContigRegions and len(
                    ContigRegions[curr_contig]) > 0:
                if line[0:5].strip() == "" and line[0:20].strip() != "" \
                        and len(line[20:].split("..")) == 2:  # should be a FEATURE line with its position
                    start = line[20:].replace("complement(", "").replace(")", "").split("..")[0]
                    if int(start) == ContigRegions[curr_contig][-1].start:
                        reg = ContigRegions[curr_contig].pop()
                        foutfile.write("     misc_feature    " +
                                       str(reg.start) + ".." + str(reg.stop) +
                                       "\n")
                        foutfile.write(
                            '                     /note="Region of genomic plasticity"\n'
                        )

            foutfile.write(line)

    foutfile.close()
    for val in ContigRegions.values():
        if len(val) != 0:
            logging.getLogger().warning(
                "Some regions were not written in the new gbff file for unknown reasons"
            )
    logging.getLogger().info(
        f"RGP have been written in the following file : '{output + '/genome_annotation.gbff'}' "
    )
Example #13
def writeGffRegions(filename, regions, output):
    ContigRegions = defaultdict(set)
    for region in regions:
        ContigRegions[region.contig.name].add(region)

    for contigName in ContigRegions.keys():
        ContigRegions[contigName] = sorted(ContigRegions[contigName],
                                           key=lambda x: x.start,
                                           reverse=True)

    foutfile = open(output + "/genome_annotation.gff", "w")

    with read_compressed_or_not(filename) as fannot:
        for line in fannot:
            if line[0] == "#":
                pass
            else:
                features = line.split("\t")
                if len(features) == 9:  # gff annotation lines are supposed to be 9 columns long
                    start = int(features[3])
                    if features[0] in ContigRegions:
                        if len(ContigRegions[features[0]]) > 0 \
                                and ContigRegions[features[0]][-1].start == start:
                            reg = ContigRegions[features[0]].pop()
                            foutfile.write('\t'.join(
                                map(str, [
                                    features[0], "panRGP", "sequence_feature",
                                    reg.start, reg.stop, reg.score, '+', '.',
                                    f'ID={reg.name};note=Region of genomic plasticity;gbkey=misc_feature'
                                ])) + "\n")
            foutfile.write(line)

    foutfile.close()
    for val in ContigRegions.values():
        if len(val) != 0:
            logging.getLogger().warning(
                "Some regions were not written in the new gff file for unknown reasons"
            )
    logging.getLogger().info(
        f"RGP have been written in the following file : '{output + '/genome_annotation.gff'}' "
    )
Example #14
def read_fasta_or_gff(filename):
    """Reads the sequences of a fasta file, or of the fasta section of a gff file, into a dict keyed by sequence name."""
    sequence_dict = {}
    seqname = ""
    seq = ""
    in_fasta_part = False
    with read_compressed_or_not(filename) as f:
        for line in f:
            if line.startswith(">"):
                in_fasta_part = True
            if in_fasta_part:
                if line.startswith('>'):
                    if seq != "":
                        sequence_dict[seqname] = seq
                    seqname = line[1:].strip().split()[0]
                    seq = ""  # reset the buffer for the next sequence
                else:
                    seq += line.strip()
        if seq != "":
            sequence_dict[seqname] = seq
    return sequence_dict
Example #15
def readAnnotations(pangenome, organisms_file, getSeq=True, pseudo=False):
    logging.getLogger().info("Reading "+organisms_file+" the list of organism files ...")

    bar = tqdm(read_compressed_or_not(organisms_file),total=get_num_lines(organisms_file), unit = "annotation file")
    pangenome.status["geneSequences"] = "Computed"#we assume there are gene sequences in the annotation files, unless a gff file without fasta is met (which is the only case where sequences can be asbent)
    for line in bar:
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            logging.getLogger().error(f"No tabulation separator found in given --fasta file: '{organisms_file}'")
            exit(1)
        bar.set_description("Processing "+elements[1].split("/")[-1])
        bar.refresh()
        filetype = detect_filetype(elements[1])
        if filetype == "gff":
            read_org_gff(pangenome, elements[0], elements[1], elements[2:], getSeq, pseudo)
        elif filetype == "gbff":
            read_org_gbff(pangenome, elements[0], elements[1], elements[2:], getSeq, pseudo)
    bar.close()
    pangenome.status["genomesAnnotated"] = "Computed"
    pangenome.parameters["annotation"] = {}
    pangenome.parameters["annotation"]["read_annotations_from_file"] = True
Example #16
def read_org_gbff(organism,
                  gbff_file_path,
                  circular_contigs,
                  getSeq,
                  pseudo=False):
    """ reads a gbff file and fills Organism, Contig and Genes objects based on information contained in this file """
    org = Organism(organism)

    logging.getLogger().debug(
        "Extracting gene information from the given gbff")
    # revert the order of the file, to read the first line first.
    lines = read_compressed_or_not(gbff_file_path).readlines()[::-1]
    geneCounter = 0
    rnaCounter = 0
    while len(lines) != 0:
        line = lines.pop()
        # beginning of contig
        if line.startswith('LOCUS'):
            is_circ = False
            if "CIRCULAR" in line.upper(
            ):  #this line contains linear/circular word telling if the dna sequence is circularized or not
                is_circ = True
            contigLocusID = line.split(
            )[1]  #If contigID is not specified in VERSION afterwards like with Prokka, in that case we use the one in LOCUS.
            setContig = False
            while not line.startswith('FEATURES'):
                if line.startswith('VERSION'):
                    contigID = line[12:].strip()
                    if contigID != "":
                        if contigID in circular_contigs:
                            is_circ = True
                        contig = org.getOrAddContig(contigID, is_circ)
                        setContig = True
                line = lines.pop()
        if not setContig:
            # no contig ID was found in a VERSION line, so we use the one from LOCUS. It should be unique in a dataset,
            # but after an assembly update the contig ID might stay the same even though it should not(?)
            if contigLocusID in circular_contigs:
                is_circ = True
            contig = org.getOrAddContig(contigLocusID, is_circ)
        # start of the feature object.
        dbxref = set()
        gene_name = ""
        product = ""
        locus_tag = ""
        objType = ""
        protein_id = ""
        genetic_code = ""
        usefulInfo = False
        start = None
        end = None
        strand = None
        line = lines.pop()
        while not line.startswith("ORIGIN"):
            currType = line[5:21].strip()
            if currType != "":
                if usefulInfo:
                    create_gene(org, contig, geneCounter, rnaCounter,
                                locus_tag, dbxref, start, end, strand, objType,
                                len(contig.genes), gene_name, product,
                                genetic_code, protein_id)
                    if objType == "CDS":
                        geneCounter += 1
                    else:
                        rnaCounter += 1
                usefulInfo = False
                objType = currType
                if objType in ['CDS', 'rRNA', 'tRNA']:
                    dbxref = set()
                    gene_name = ""
                    try:
                        if 'join' not in line[21:]:
                            usefulInfo = True
                            if line[21:].startswith('complement('):
                                strand = "-"
                                start, end = line[32:].replace(')', '').split("..")
                            else:
                                strand = "+"
                                start, end = line[21:].strip().split('..')
                            if '>' in start or '<' in start or '>' in end or '<' in end:
                                usefulInfo = False
                    except ValueError:
                        # the feature uses a frameshift mechanism; we don't know what to do with it, so it is ignored for now.
                        pass
            elif usefulInfo:  # current info goes to current objtype, if it's useful.
                if line[21:].startswith("/db_xref"):
                    dbxref.add(line.split("=")[1].replace('"', '').strip())
                elif line[21:].startswith("/locus_tag"):
                    locus_tag = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith("/protein_id"):
                    protein_id = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith('/gene'):  #gene name
                    gene_name = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith('/transl_table'):
                    genetic_code = line.split("=")[1].replace('"', '').strip()
                elif line[21:].startswith('/product'):
                    # the product description may span more than one line
                    product = line.split('=')[1].replace('"', '').strip()
                    if line.count('"') == 1:  # the closing quote is on a later line
                        line = lines.pop()
                        product += line.strip().replace('"', '')
                        while line.count('"') != 1:
                            line = lines.pop()
                            product += line.strip().replace('"', '')
                #if it's a pseudogene, we're not keeping it.
                elif line[21:].startswith("/pseudo") and not pseudo:
                    usefulInfo = False
                # that's probably a 'stop' codon recoded into selenocysteine.
                elif line[21:].startswith("/transl_except"):
                    usefulInfo = False
            line = lines.pop()
            #end of contig
        if usefulInfo:  #saving the last element...
            create_gene(org, contig, geneCounter, rnaCounter,
                        locus_tag, dbxref, start, end, strand, objType,
                        len(contig.genes), gene_name, product, genetic_code,
                        protein_id)
            if objType == "CDS":
                geneCounter += 1
            else:
                rnaCounter += 1
        if getSeq:
            line = lines.pop()  #first sequence line.
            #if the seq was to be gotten, it would be here.
            sequence = ""
            while not line.startswith('//'):
                sequence += line[10:].replace(" ", "").strip().upper()
                line = lines.pop()
            #get each gene's sequence.
            for gene in contig.genes:
                gene.add_dna(get_dna_sequence(sequence, gene))

    return org, True  #There are always fasta sequences in a gbff
Example #17
def readClustering(pangenome,
                   families_tsv_file,
                   infer_singletons=False,
                   force=False,
                   disable_bar=False):
    """
        Creates the pangenome, the gene families and the genes with an associated gene family.
        Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
    """
    checkPangenomeFormerClustering(pangenome, force)
    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       disable_bar=disable_bar)

    logging.getLogger().info("Reading " + families_tsv_file +
                             " the gene families file ...")
    filesize = os.stat(families_tsv_file).st_size
    families_tsv_file = read_compressed_or_not(families_tsv_file)
    frag = False
    # the genome annotations are necessarily loaded.
    nbGeneWithFam = 0
    localDict = mkLocal2Gene(pangenome)
    bar = tqdm(total=filesize, unit="bytes", disable=disable_bar)
    lineCounter = 0
    for line in families_tsv_file:
        lineCounter += 1
        bar.update(len(line))
        try:
            elements = [el.strip()
                        for el in line.split()]  # 2 or 3 fields expected
            if len(elements) <= 1:
                raise ValueError(
                    "No tabulation separator found in gene families file")
            (fam_id, gene_id,
             is_frag) = elements if len(elements) == 3 else elements + [None]
            try:
                geneObj = pangenome.getGene(gene_id)
            except KeyError:
                geneObj = localDict.get(gene_id)
            if geneObj is not None:
                nbGeneWithFam += 1
                fam = pangenome.addGeneFamily(fam_id)
                geneObj.is_fragment = is_frag == "F"
                fam.addGene(geneObj)
            if is_frag == "F":
                frag = True
        except Exception as e:
            raise Exception(
                f"line {lineCounter} of the file '{families_tsv_file.name}' raised an error."
            ) from e
    bar.close()
    families_tsv_file.close()
    if nbGeneWithFam < len(
            pangenome.genes):  # not all genes have an associated cluster
        if nbGeneWithFam == 0:
            raise Exception(
                "No gene ID in the cluster file matched any gene ID from the annotation step."
                " Please ensure that the annotations that you loaded previously and the clustering results "
                "that you have used the same gene IDs. If you use .gff files it is the identifier stored in"
                " the field 'ID'. If you use .gbff files it is the identifier stored in 'locus_tag'."
            )
        else:
            if infer_singletons:
                inferSingletons(pangenome)
            else:
                raise Exception(
                    f"Some genes ({len(pangenome.genes) - nbGeneWithFam}) did not have an associated "
                    f"cluster. Either change your cluster file so that each gene has a cluster, "
                    f"or use the --infer_singletons option to infer a cluster for each non-clustered gene."
                )
    pangenome.status["genesClustered"] = "Computed"
    if frag:  # if there was fragment information in the file.
        pangenome.status["defragmented"] = "Computed"
    pangenome.parameters["cluster"] = {}
    pangenome.parameters["cluster"]["read_clustering_from_file"] = True
    pangenome.parameters["cluster"]["infer_singletons"] = infer_singletons
Example #18
def read_org_gff(organism,
                 gff_file_path,
                 circular_contigs,
                 getSeq,
                 pseudo=False):
    (GFF_seqname, _, GFF_type, GFF_start, GFF_end, _,
     GFF_strand, _, GFF_attribute) = range(0, 9)  # skipped columns: source, score, frame. They are unused.

    def getGffAttributes(gff_fields):
        """
            Parses the gff attribute's line and outputs the attributes in a dict structure.
            :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
            :type list:
            :return: attributes:
            :rtype: dict
        """
        attributes_field = [
            f for f in gff_fields[GFF_attribute].strip().split(';')
            if len(f) > 0
        ]
        attributes = {}
        for att in attributes_field:
            try:
                (key, value) = att.strip().split('=')
                attributes[key.upper()] = value
            except ValueError:
                pass  #we assume that it is a strange, but useless field for our analysis
        return attributes

    def getIDAttribute(attributes):
        """
            Gets the ID of the element from which the provided attributes were extracted. Raises an error if no ID is found.
            :param attributes:
            :type dict:
            :return: ElementID:
            :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            exit(1)
        return ElementID

    hasFasta = False
    fastaString = ""
    org = Organism(organism)
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                fastaString += line
                continue
            elif line.startswith('##', 0, 2):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:  #if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(
                        fields[1], fields[1] in circular_contigs)
                continue
            elif line.startswith('#!', 0, 2):
                # special refseq comment lines for versioning software, assemblies and annotations.
                continue
            gff_fields = [el.strip() for el in line.split('\t')]
            attributes = getGffAttributes(gff_fields)
            pseudogene = False
            if gff_fields[GFF_type] == 'region':
                if gff_fields[GFF_seqname] in circular_contigs:
                    contig.is_circular = True
            elif gff_fields[GFF_type] == 'CDS' or "RNA" in gff_fields[GFF_type]:
                # if there is a 'PROTEIN_ID' attribute, it is where NCBI stores the actual gene ID, so we use that.
                geneID = attributes.get("PROTEIN_ID")
                if geneID is None:
                    # if it's not found, we take the 'ID' field, which must exist (otherwise the file is not gff3 compliant)
                    geneID = getIDAttribute(attributes)
                try:
                    name = attributes.pop('NAME')
                except KeyError:
                    try:
                        name = attributes.pop('GENE')
                    except KeyError:
                        name = ""
                if "pseudo" in attributes or "pseudogene" in attributes:
                    pseudogene = True
                try:
                    product = attributes.pop('PRODUCT')
                except KeyError:
                    product = ""

                try:
                    genetic_code = attributes.pop("TRANSL_TABLE")
                except KeyError:
                    genetic_code = "11"
                if contig.name != gff_fields[GFF_seqname]:
                    contig = org.getOrAddContig(
                        gff_fields[GFF_seqname])  #get the current contig
                if gff_fields[GFF_type] == "CDS" and (not pseudogene or
                                                      (pseudogene and pseudo)):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))

                    #here contig is filled in order, so position is the number of genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=gff_fields[GFF_type],
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in gff_fields[GFF_type]:
                    rna = RNA(org.name + "_CDS_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=gff_fields[GFF_type],
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1
    ### GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fastaString != "":
        contigSequences = read_fasta(org, fastaString.split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(
                    get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name],
                                             rna))
    return org, hasFasta