Beispiel #1
0
    def create_features_from_contig(
            seqrecord,
            source,
            type_map={x: x
                      for x in NCBI.ftypes},
            extract_annotation_feature=lambda feature: feature):
        """Build ``Feature`` documents for the annotated features of a contig.

        Each entry of ``seqrecord.features`` is first passed through
        ``extract_annotation_feature``; the result is kept only when its
        ``type`` is one of the keys of ``type_map``.

        :param seqrecord: object exposing a ``features`` iterable.
        :param source: when truthy, stored as ``source`` on every document.
        :param type_map: mapping from annotation feature type to the type
            stored on the resulting document.
        :param extract_annotation_feature: callable that receives a raw
            feature and returns the feature that is actually inspected.
        :return: tuple ``(features, gene_ids)``: the list of created
            documents and a dict mapping each document's ``locus_tag`` to
            its ``_id``.
        """
        accepted_types = set(type_map)
        # Qualifiers that override the identifier, lowest priority first:
        # every key that is present overwrites the previous choice.
        overriding_keys = ("ID", "product", "gene_id", "gene", "protein_id")
        features = []
        gene_ids = {}

        for feature in seqrecord.features:
            f = extract_annotation_feature(feature)
            if f.type not in accepted_types:
                continue

            quals = f.qualifiers
            if "description" in quals:
                fid = quals["description"][0]
            else:
                fid = f.id
            for key in overriding_keys:
                if key in quals:
                    fid = quals[key][0]
            if "tRNA_anti-codon" in quals:
                fid = fid + " -> " + quals["tRNA_anti-codon"][0]

            position = Location(start=f.location.start,
                                end=f.location.end,
                                strand=f.location.strand)
            # NOTE(review): the type is looked up with the *raw* feature's
            # type while the filter above tests the extracted feature's
            # type - confirm this asymmetry is intended.
            fdoc = Feature(_id=ObjectId(),
                           identifier=fid,
                           location=position,
                           type=type_map[feature.type])

            if "locus_tag" in quals:
                # The locus tag, when available, replaces the identifier.
                locus_tag = quals["locus_tag"][0]
                fdoc.identifier = locus_tag
                fdoc.locus_tag = locus_tag
                fdoc.alias.append(fdoc.locus_tag)
            else:
                fdoc.locus_tag = fid
            gene_ids[fdoc.locus_tag] = fdoc._id

            for alias_key in ("gene", "protein_id"):
                if alias_key in quals:
                    fdoc.alias.append(quals[alias_key][0])
            if "old_locus_tag" in quals:
                fdoc.alias = fdoc.alias + quals["old_locus_tag"]

            if source:
                fdoc.source = source
            features.append(fdoc)

        return features, gene_ids
Beispiel #2
0
def load_hmm(organism,
             hmm_file,
             transform_query_regexp=None,
             transform_hit_regexp=None):
    """Attach the domain hits of a HMMER3 text report to stored proteins.

    Every HSP of every hit is turned into a ``polypeptide_domain`` feature
    and appended to each ``Protein`` of ``organism`` whose ``alias`` matches
    the query name. A previously stored domain with the same identifier and
    coordinates is removed first.

    :param organism: value used for the ``organism`` filter of the query.
    :param hmm_file: path of the ``hmmer3-text`` report; must exist.
    :param transform_query_regexp: optional regular expression; its first
        group (case-insensitive search on ``query.id``) is used as the
        protein alias instead of the raw query id.
    :param transform_hit_regexp: optional regular expression; its first
        group (case-insensitive search on ``hit.id``) is used as the hit
        name when looking for an already stored domain.
    :raises AttributeError: when a supplied regular expression does not
        match (``re.search`` returns ``None``).
    """
    assert os.path.exists(hmm_file)
    for query in tqdm(bpsio.parse(hmm_file, 'hmmer3-text')):
        for hit in query:
            for hsp in hit:
                gene = query.id
                if transform_query_regexp:
                    gene = re.search(transform_query_regexp, query.id,
                                     re.IGNORECASE).group(1)

                hit_name = hit.id
                # Guard on the hit regexp itself: it used to be guarded by
                # the query regexp, so it was ignored when passed alone and
                # crashed on ``None`` when only the query regexp was given.
                if transform_hit_regexp:
                    hit_name = re.search(transform_hit_regexp, hit_name,
                                         re.IGNORECASE).group(1)

                proteins = Protein.objects(
                    organism=organism, alias=gene).no_cache().timeout(False)
                for protein in proteins:
                    # NOTE(review): stored features use ``hsp.hit.id`` as
                    # identifier while this lookup compares against the
                    # (possibly transformed) ``hit_name`` - confirm.
                    already_stored = [
                        d for d in protein.domains()
                        if (d.identifier == hit_name) and (
                            d.location.start == hsp.query_start) and (
                                d.location.end == hsp.query_end)
                    ]
                    if already_stored:
                        protein.features.remove(already_stored[0])

                    annotation = hsp.aln_annotation
                    hsp_feature = Feature(
                        _id=ObjectId(),
                        location=Location(start=hsp.query_start,
                                          end=hsp.query_end),
                        aln=SimpleAlignment(
                            evalue=hsp.evalue,
                            aln_query=AlnLine(name=hsp.query_id,
                                              seq=str(hsp.aln[0].seq),
                                              start=hsp.query_start,
                                              end=hsp.query_end),
                            aln_hit=AlnLine(name=hsp.hit.id,
                                            seq=str(hsp.aln[1].seq),
                                            start=hsp.hit_start,
                                            end=hsp.hit_end),
                            aln_cd=annotation["CS"]
                            if "CS" in annotation else "",
                            aln_pp=annotation["PP"]
                            if "PP" in annotation else ""),
                        identifier=hsp.hit.id,
                        type=SO_TERMS["polypeptide_domain"])

                    protein.features.append(hsp_feature)
                    protein.save()
0
    def load_from_interpro(self, organism, interprot_gff):
        """Load the annotations of an InterPro-style GFF file into proteins.

        The file is read line by line until the first line starting with
        ``>``. Comment and blank lines are skipped, as are rows whose
        feature column is ``polypeptide``. Every other row becomes a
        ``Feature`` appended to the ``Protein`` of ``organism`` whose
        ``gene`` equals the first column; ``Ontology_term`` and ``Dbxref``
        entries are appended, lower-cased, to the protein's ``ontologies``.

        :param organism: value used for the ``organism`` filter.
        :param interprot_gff: path of the tab-separated GFF file.
        :raises ValueError: when a data row has fewer than eight columns or
            non-integer coordinates.
        :raises KeyError: when a row has no ``Name`` attribute.
        """
        # ``with`` guarantees the handle is closed, also on errors.
        with open(interprot_gff) as handle:
            for raw_line in tqdm(handle):
                if raw_line.startswith(">"):
                    break
                # Drop the line terminator so it cannot leak into the last
                # attribute value.
                line = raw_line.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue
                line = line.replace("EC=", "EC ")
                columns = line.split("\t")
                (locus_tag, source, feature_type, start, end, _score,
                 _strand, _frame) = columns[:8]
                raw_attributes = " ".join(columns[8:])

                if feature_type == "polypeptide":
                    continue

                start, end = int(start), int(end)

                if "signature_desc=" in raw_attributes:
                    # Escape separators inside the free-text description so
                    # that it survives the ";" / "=" splitting below.
                    description = raw_attributes.split(
                        "signature_desc=")[1].split(";Name=")[0]
                    raw_attributes = raw_attributes.replace(
                        description,
                        description.replace("=", "%3D").replace(";", "%3B"))

                attributes = {}
                for item in raw_attributes.split(";"):
                    key, separator, value = item.partition("=")
                    if separator:  # ignore empty / malformed items
                        attributes[key] = value

                feature = Feature(_id=ObjectId(),
                                  location=Location(start=start, end=end),
                                  identifier=attributes["Name"],
                                  type=source)
                prot = Protein.objects(organism=organism,
                                       gene=locus_tag).get()

                if "signature_desc" in attributes:
                    feature.qualifiers = {
                        "description": attributes["signature_desc"]
                    }
                for attribute_name in ("Ontology_term", "Dbxref"):
                    if attribute_name in attributes:
                        for ont in attributes[attribute_name].split(","):
                            ont = ont.replace('"', "").strip()
                            prot.ontologies.append(ont.lower())

                prot.features.append(feature)
                prot.save()
Beispiel #4
0
 def feature_from_hsp(hsp, feature_type):
     """Wrap an HSP into a ``Feature`` located on the query sequence.

     The feature spans ``hsp.query_start``..``hsp.query_end``, is identified
     by ``hsp.hit.id`` and carries the pairwise alignment (query line, hit
     line, e-value and, when present, the ``similarity`` annotation).

     :param hsp: object exposing the query/hit coordinates, ``aln``,
         ``evalue`` and ``aln_annotation``.
     :param feature_type: value stored as the feature ``type``.
     :return: the newly created ``Feature``.
     """
     span = Location(start=hsp.query_start, end=hsp.query_end)

     query_line = AlnLine(name=hsp.query.id,
                          seq=str(hsp.aln[0].seq),
                          start=hsp.query_start,
                          end=hsp.query_end)
     hit_line = AlnLine(name=hsp.hit.id,
                        seq=str(hsp.aln[1].seq),
                        start=hsp.hit_start,
                        end=hsp.hit_end)

     # The similarity line is optional; fall back to an empty string.
     if "similarity" in hsp.aln_annotation:
         midline = hsp.aln_annotation["similarity"]
     else:
         midline = ""

     alignment = SimpleAlignment(evalue=hsp.evalue,
                                 aln_query=query_line,
                                 aln_hit=hit_line,
                                 aln_mid=midline)
     return Feature(_id=ObjectId(),
                    location=span,
                    aln=alignment,
                    identifier=hsp.hit.id,
                    type=feature_type)
Beispiel #5
0
    def _process_prot(self, prot, r, i):
        """Attach one ``Aanensen2016`` resistance variant to ``prot``.

        The position is parsed from the first comma-separated item of
        ``r.Substitution`` by dropping its first and last character and
        converting the remainder to a zero-based integer. The literal value
        ``"Deletions"`` marks the whole protein instead. Rows that cannot be
        parsed are logged and skipped.

        :param prot: protein document; ``seq``, ``features``, ``search``,
            ``id`` and ``save()`` are used.
        :param r: row exposing ``Substitution``, ``Antibiotic``,
            ``Reference`` and ``"Core gene"`` (presumably a pandas Series -
            confirm against the caller).
        :param i: value appended to the feature identifier.
        """
        try:
            pos = int(r.Substitution.split(",")[0][1:-1]) - 1
        except (AttributeError, IndexError, TypeError, ValueError):
            # Only parsing failures are expected here; a bare ``except``
            # would also swallow KeyboardInterrupt / SystemExit.
            if r.Substitution == "Deletions":
                start = 0
                end = len(prot.seq) - 1
            else:
                # ``warn`` is a deprecated alias of ``warning``.
                _log.warning("error parsing subtitution position: %s -> %s" %
                             (r["Core gene"], r["Substitution"]))
                return
        else:
            start = end = pos

        quals = {
            "drug": r.Antibiotic,
            "change": r.Substitution,
            "gene": r["Core gene"]
        }
        if r.Reference:
            quals["reference"] = r.Reference

        fvariant = Feature(_id=ObjectId(),
                           location=Location(start=start, end=end),
                           type="Aanensen2016",
                           identifier="Aanensen2016_ " + str(i),
                           qualifiers=quals)
        prot.features.append(fvariant)

        if self.user == "demo":
            # Flags live on the document itself: persist the feature and
            # the flags with a single save instead of two.
            prot.search.resistance = True
            prot.search[r.Antibiotic.lower()] = True
            prot.save()
        else:
            prot.save()
            # Per-user flags are written straight into the collection.
            self.db.proteins.update({"_id": prot.id}, {
                "$set": {
                    "search." + self.user + ".resistance": True,
                    "search." + self.user + "." + r.Antibiotic.lower(): True
                }
            })
Beispiel #6
0
def from_ref_seq(
        name,
        ann_path,
        seqs=None,
        tax=None,
        tmp_dir=None,
        extract_annotation_feature=lambda feature: (
            feature.sub_features[0]
            if feature.type == "gene" and hasattr(feature, "sub_features")
            and len(feature.sub_features) else feature),
        accept_protein_feature=lambda f: (
            (f.type == "CDS") and ("translation" in f.qualifiers)),
        extract_sequence=lambda c, f: (
            f.qualifiers["translation"][0]
            if "translation" in f.qualifiers
            else f.extract(c).seq.translate()),
        cpus=1):
    """Import an annotated genome: collection, contigs and proteins.

    A genome document is created from the first contig and saved, every
    contig is stored, and one protein document is inserted for each
    accepted feature that carries a ``locus_tag`` qualifier.

    :param name: genome name; also stored as ``organism`` on each protein.
    :param ann_path: annotation path handed to ``sp``.
    :param seqs: optional FASTA path; when given its records (id -> seq) are
        passed to ``sp`` as ``seqs``.
    :param tax: forwarded to ``BioDocFactory.create_genome`` when
        ``has_tax`` is truthy.
    :param tmp_dir: working directory, created with ``mkdir``; defaults to
        ``/tmp/<name>/``.
    :param extract_annotation_feature: maps a raw feature to the feature
        that is inspected (by default the first sub-feature of a gene).
    :param accept_protein_feature: predicate forwarded to ``_protein_iter``.
    :param extract_sequence: callable ``(contig, feature)`` forwarded to
        ``_protein_iter``.
    :param cpus: currently unused (only the disabled annotation step
        referenced it).
    :return: the saved genome document.
    :raises ValueError: when the annotation yields no contigs.
    :raises Exception: when a protein is longer than 30000 residues.
    """
    if seqs:
        seqs = {r.id: r.seq for r in bpio.parse(seqs, "fasta")}

    iter_seqs = list(sp(ann_path, seqs=seqs) if seqs else sp(ann_path))
    if not iter_seqs:
        # Previously this ended in an UnboundLocalError on ``seqCol``.
        raise ValueError("no contigs found in %s" % (ann_path, ))

    # The genome document is built from the first contig only.
    # NOTE(review): ``has_tax`` and ``Tax`` are not defined in this
    # function; presumably module-level names - confirm.
    first_contig = iter_seqs[0]
    if has_tax:
        seqCol = BioDocFactory.create_genome(name, first_contig, tax, Tax)
    else:
        seqCol = BioDocFactory.create_genome(name, first_contig)
    seqCol.save()

    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)

    gene_ids = {}
    with tqdm(iter_seqs) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            if len(contig.seq) > 15000000:
                # Very long contigs are stored without their sequence.
                contig.seq = ""
            contigDoc, contig_gene_ids = BioDocFactory.create_contig(
                contig,
                seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: NCBI.f_mRNA,
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA"
                },
                extract_annotation_feature=extract_annotation_feature,
            )
            gene_ids.update(contig_gene_ids)
            contigDoc.save()

    batch_size = 1000
    prots = []

    with tqdm(
            _protein_iter(
                iter_seqs,
                accept_feature=accept_protein_feature,
                extract_annotation_feature=extract_annotation_feature,
                extract_sequence=extract_sequence)) as pbar:
        for (protein, cds_f) in pbar:
            if "locus_tag" not in cds_f.qualifiers:
                continue
            locus_tag = cds_f.qualifiers["locus_tag"][0]

            protDoc = BioDocFactory.create_protein(protein, cds_f)
            if len(protDoc.seq) > 30000:
                raise Exception("No existen proteinas tan largas...")
            if protDoc.seq.count("*") > 1:
                print(f"{locus_tag}: Too many stop codons!")
                continue
            if protDoc.seq.count("+") > 1:
                print(f"{locus_tag}: + signs found...!")
                continue

            protDoc.gene_id = gene_ids[locus_tag]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            for f in protein.features:
                protDoc.features.append(
                    Feature(identifier=f.qualifiers["Ontology_term"][0],
                            type=f.type,
                            location=Location(start=int(f.location.start),
                                              end=int(f.location.end))))

            prots.append(protDoc)
            # Flush on the number of pending documents: ``pbar.n`` is only
            # refreshed lazily by tqdm, so it is not a reliable counter.
            if len(prots) >= batch_size:
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)

    # _common_annotations(name, tmp_dir, cpu=cpus)
    return seqCol
Beispiel #7
0
    def load_in_sndg(self, organism="H37Rv"):
        """Store the variants of ``self._df`` as ``tbdream`` protein features.

        Steps, in order:

        1. remove every existing ``tbdream`` feature from the proteins of
           ``organism``;
        2. reset the ``search.<name>`` flags to ``False`` and register the
           ``resistance`` parameter plus one parameter per entry of
           ``TBDream.drugs`` on the sequence collection when missing;
        3. for each ``rv`` group of ``self._df`` whose gene exists
           (case-insensitive match), append one feature per row and mark
           the protein as resistance-associated.

        Rows whose position cannot be determined are logged and skipped.

        :param organism: organism / sequence collection name.
        """
        from SNDG.BioMongo.Model.Protein import Protein
        from SNDG.BioMongo.Model.Feature import Feature, Location
        from SNDG.BioMongo.Model.SeqCollection import SeqCollection
        from SNDG.BioMongo.Model.SeqColDruggabilityParam import SeqColDruggabilityParamTypes, SeqColDruggabilityParam

        from bson.objectid import ObjectId

        search_params = [("resistance", "Associated with resistance", "variant-db",
                          SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")]
        search_params = search_params + [
            (x, "Associated with " + x + " resistance", "variant-db",
             SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")
            for x in TBDream.drugs
        ]

        Protein.objects(organism=organism).update(
            __raw__={"$pull": {"features": {"type": "tbdream"}}})
        collection = SeqCollection.objects(name=organism).get()
        for (name, description, target, _type, options, defaultValue,
             defaultOperation, defaultGroupOperation) in search_params:
            Protein.objects(organism=organism).update(
                __raw__={"$set": {"search." + name: False}})
            if not collection.has_druggability_param(name):
                dp = SeqColDruggabilityParam(name=name, description=description, target=target,
                                             type=_type, uploader="demo")
                dp.options = options
                dp.defaultValue = defaultValue
                dp.defaultOperation = defaultOperation
                dp.defaultGroupOperation = defaultGroupOperation
                collection.druggabilityParams.append(dp)
        collection.save()

        for rv, rows in self._df.groupby("rv"):
            matches = list(Protein.objects(organism=organism, gene__iexact=rv))
            if not matches:
                continue
            prot = matches[0]
            for _, r in rows.iterrows():
                mut = None
                if r.change:
                    change = str(r.change[0]) + "/" + str(r.change[1])
                    mut = SeqUtils.seq1(r.change[1])
                else:
                    change = r.AminoAcid

                if math.isnan(r.codon):
                    # No codon: fall back to the AminoAcid column.
                    try:
                        pos = int(r.AminoAcid)
                    except (TypeError, ValueError):
                        # ``warn`` is a deprecated alias of ``warning``.
                        _log.warning("couldnt find the variant position")
                        continue
                else:
                    pos = int(r.codon)

                # "resistant/total" isolates; any malformed or missing
                # value simply leaves both qualifiers empty.
                try:
                    res, t = r.RTotalIsolates.strip().split("/")
                    r_div_total_coef = int(res) * 1.0 / int(t)
                    r_div_total = r.RTotalIsolates.strip()
                except (AttributeError, TypeError, ValueError,
                        ZeroDivisionError):
                    r_div_total = None
                    r_div_total_coef = None

                quals = {
                    "drug": r.Drug,
                    "change": change,
                    "gene": r.GeneID,
                    "pattern": r.ResistancePattern,
                    "additional": r.AdditionalMutations,
                    "r_div_total": r_div_total,
                    "r_div_total_coef": r_div_total_coef,
                    "mic": r.MIC}
                if mut:
                    quals["mut"] = mut
                fvariant = Feature(_id=ObjectId(), location=Location(start=pos, end=pos), type="tbdream",
                                   identifier="TBDream id " + r.ID,
                                   qualifiers=quals)
                prot.features.append(fvariant)
                prot.search.resistance = True
                prot.search[r.Drug] = True
            prot.save()