Ejemplo n.º 1
0
    def matching_features(self, gfe1, gfe2, structures):
        """
        Builds Feature objects for the features shared by two GFE notations

        Both notations are split with ``self.breakup_gfe``. A feature is
        considered shared when its key is present in both results and
        maps to the same accession in each.

        :param gfe1: first GFE notation (handed to ``breakup_gfe``).
        :param gfe2: second GFE notation (handed to ``breakup_gfe``).
        :param structures: mapping from feature key to its sequence.

        :return: list of Feature objects, in the iteration order of the
                 features of ``gfe1``.
        """
        # UTR keys carry no "-rank" suffix; they are reported with rank 1
        utr_terms = ("FIVE_PRIME_UTR", "THREE_PRIME_UTR")

        # TODO: Update looping
        gfe_parts1 = self.breakup_gfe(gfe1)
        gfe_parts2 = self.breakup_gfe(gfe2)
        feat_list = []
        for feat in gfe_parts1:
            if feat not in gfe_parts2:
                continue
            if not gfe_parts1[feat] == gfe_parts2[feat]:
                continue

            if feat not in utr_terms:
                feat_term, feat_rank = feat.split('-')
                shared_feat = Feature(term=feat_term.upper(),
                                      rank=feat_rank,
                                      sequence=structures[feat],
                                      accession=gfe_parts1[feat])
            else:
                # The ranked key ("<term>-1") is preferred, as before, but
                # the comparison above was made on the plain key, which is
                # the only one guaranteed to exist; fall back to it instead
                # of raising KeyError.
                ranked = "-".join([feat, str(1)])
                seq_key = ranked if ranked in structures else feat
                acc_key = ranked if ranked in gfe_parts1 else feat
                shared_feat = Feature(term=feat,
                                      rank=1,
                                      sequence=structures[seq_key],
                                      accession=gfe_parts1[acc_key])
            feat_list.append(shared_feat)
        return feat_list
Ejemplo n.º 2
0
    def sequence_lookup(self, locus, sequence, imgtdb_version):
        """
        Looks up the allele, GFE notation and features documented for a
        sequence

        When ``self.seq2hla`` is a non-empty DataFrame it is the only
        source that is searched for the sequence (a miss there is not
        retried on the graph). Otherwise the graph is queried with
        ``sequence_search``.

        :param locus: string containing HLA locus.
        :param sequence: string containing sequence data.
        :param imgtdb_version: value matched against the ``DB`` column of
                               the cached tables.

        :return: ``[hla, gfe, features]`` taken from the first matching
                 row, or ``None`` when the sequence is not found.
        """

        def build_features(frame):
            # Turn the rows of a feature query result into Feature objects;
            # an empty result yields no features.
            if frame.empty:
                return []
            return [
                Feature(accession=frame['accession'][i],
                        rank=frame['rank'][i],
                        sequence=frame['sequence'][i],
                        term=lc(frame['term'][i]))
                for i in range(len(frame['term']))
            ]

        # TODO: just initalize seq2hla as DataFrame
        if isinstance(self.seq2hla, DataFrame) and not self.seq2hla.empty:
            cached = self.seq2hla
            df = cached[(cached['DB'] == imgtdb_version)
                        & (cached['LOC'] == locus)
                        & (cached['SEQ'] == sequence)]
            if df.empty:
                return None

            hla = df['HLA'].tolist()[0]
            gfe = df['GFE'].tolist()[0]

            # A GFE without a cached feature row falls through to the
            # graph, the same as a row whose FEATS entry is empty.
            feat_rows = self.gfe_feats[
                (self.gfe_feats['GFE'] == gfe)
                & (self.gfe_feats['DB'] == imgtdb_version)]['FEATS'].tolist()
            feats = feat_rows[0] if feat_rows else None

            if feats:
                features = [
                    Feature(accession=feat['accession'],
                            rank=feat['rank'],
                            sequence=feat['sequence'],
                            term=lc(feat['term'])) for feat in feats
                ]
            else:
                features = build_features(
                    self.graph.run(get_features(gfe)).to_data_frame())
            return [hla, gfe, features]

        lookup_query = sequence_search(locus, sequence)
        sequence_data = self.graph.run(lookup_query).to_data_frame()
        if sequence_data.empty:
            return None

        # Take both values from the first row so that they belong together
        # and the choice does not depend on set ordering.
        gfe = sequence_data["GFE"].tolist()[0]
        hla = sequence_data["HLA"].tolist()[0]
        features = build_features(
            self.graph.run(get_features(gfe)).to_data_frame())
        return [hla, gfe, features]
Ejemplo n.º 3
0
def gfeNotation_post(sequence, locus, gene):
    """
    gfeNotation_post
        GFE notations associated with the sequence

        :param sequence: request body; its ``'sequence'`` entry holds the
                         sequence string
        :param locus: Valid HLA locus
        :param gene: truthy to annotate with ``BioSeqAnn(kir=True)``
        :return: dict with the keys ``'gfe'`` and ``'feature'`` on success,
                 otherwise a ``(Error, 404)`` tuple carrying the log lines
                 captured during the call
    """
    sequence = SeqRecord(seq=Seq(sequence['sequence']))
    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

    # Capture INFO-and-above records emitted during this call so they can
    # be attached to an error response.
    ch = logging.StreamHandler(log_capture_string)
    ch.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)-35s - %(levelname)-5s '
        '- %(funcName)s %(lineno)d: - %(message)s'))
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    try:
        gfe = GFE()
        seqann = BioSeqAnn(kir=True) if gene else BioSeqAnn()

        try:
            annotation = seqann.annotate(sequence)
        except Exception as e:
            print(e)
            log_contents = log_capture_string.getvalue()
            return Error("An error occured during the annotation",
                         log=log_contents.split("\n")), 404

        try:
            res_feature, res_gfe = gfe.get_gfe(annotation, locus)
        except Exception as e:
            print(e)
            log_contents = log_capture_string.getvalue()
            return Error("An error occurred in getting the gfe of annotation",
                         log=log_contents.split("\n")), 404

        feats = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence) for f in res_feature
        ]
        return {'gfe': res_gfe, 'feature': feats}
    finally:
        # The handler sits on the root logger; without this every call
        # would leave another handler (and its buffer) attached.
        logger.removeHandler(ch)
def gfecreate_post(locus,
                   sequence,
                   imgt_version,
                   neo4j_url=neo_dict['neo4j_url'],
                   user=neo_dict['user'],
                   password=neo_dict['password']):  # noqa: E501
    """gfecreate_post

    Create the GFE notation of a sequence and return its features

    :param locus: Valid HLA locus
    :param sequence: request body; its ``'sequence'`` entry holds the
                     sequence string
    :param imgt_version: db version; dots are ignored, so ``"3.31.0"``
                         and ``"3310"`` are equivalent
    :param neo4j_url: URL handed to ``Graph``
    :param user: graph user
    :param password: graph password
    :return: dict with the keys ``'gfe'``, ``'feature'`` and
             ``'annotation_feature'`` on success, otherwise a
             ``(Error, 404)`` tuple carrying the captured log lines
    """
    global seqanns
    global gfe_feats
    global gfe2hla
    global seq2hla
    sequence = sequence['sequence']
    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

    # Capture INFO-and-above records emitted during this call so they can
    # be attached to an error response.
    ch = logging.StreamHandler(log_capture_string)
    ch.setFormatter(
        logging.Formatter('%(asctime)s - %(name)-35s - %(levelname)-5s'
                          ' - %(funcName)s %(lineno)d: - %(message)s'))
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    try:
        # Only the dot-free form of the version is used below.
        db = imgt_version.replace(".", "")
        if db in seqanns:
            seqann = seqanns[db]
        else:
            seqann = BioSeqAnn(verbose=True,
                               safemode=True,
                               dbversion=db,
                               verbosity=3)
            seqanns.update({db: seqann})

        try:
            graph = Graph(neo4j_url, user=user, password=password, bolt=False)
        except ServiceUnavailable as err:
            log_data = log_capture_string.getvalue().split("\n")
            log_data.append(str(err))
            return Error("Failed to connect to graph", log=log_data), 404

        if (not isinstance(gfe_feats, DataFrame)
                or not isinstance(seq2hla, DataFrame)):
            # First use: let pyGFE load the lookup tables and keep them in
            # the module-level globals for later calls.
            pygfe = pyGFE(graph=graph,
                          seqann=seqann,
                          load_gfe2hla=True,
                          load_seq2hla=True,
                          load_gfe2feat=True,
                          verbose=True)
            gfe_feats = pygfe.gfe_feats
            seq2hla = pygfe.seq2hla
            gfe2hla = pygfe.gfe2hla
        else:
            pygfe = pyGFE(graph=graph,
                          seqann=seqann,
                          gfe2hla=gfe2hla,
                          gfe_feats=gfe_feats,
                          seq2hla=seq2hla,
                          verbose=True)

        try:
            typing = pygfe.gfe_create(locus=locus,
                                      sequence=sequence,
                                      imgtdb_version=db)
        except Exception as e:
            print(e)
            log_contents = log_capture_string.getvalue()
            return Error("Type with alignment failed",
                         log=log_contents.split("\n")), 404

        if isinstance(typing, Error):
            log_contents = log_capture_string.getvalue()
            typing.log = log_contents.split("\n")
            return typing, 404

        if not typing:
            log_contents = log_capture_string.getvalue()
            return Error("Type with alignment failed",
                         log=log_contents.split("\n")), 404

        structure_feats = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence) for f in typing['structure']
        ]
        anno_feats = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence)
            for f in typing['annotation'].structure
        ]
        return {
            'gfe': typing['gfe'],
            'feature': structure_feats,
            'annotation_feature': anno_feats
        }
    finally:
        # The handler sits on the root logger; without this every call
        # would leave another handler (and its buffer) attached.
        logger.removeHandler(ch)
Ejemplo n.º 5
0
    def type_from_seq(self,
                      locus: str = None,
                      sequence: str = None,
                      imgtdb_version: str = "3.31.0",
                      nseqs: int = 20,
                      alignseqs: int = 10,
                      skip: List = []):
        """
        Types a sequence against a database version

        A sequence found by ``sequence_lookup`` is reported as
        ``"documented"``. Otherwise a GFE is created with ``gfe_create``,
        the status is set to ``"novel"`` (at least one unique feature) or
        ``"novel_combination"``, and ``find_similar`` is used to fill in
        the closest allele and GFE.

        :param locus: string containing HLA locus; predicted with
                      ``get_locus`` when not provided.
        :param sequence: sequence data as str, Seq or SeqRecord.
        :param imgtdb_version: db version; the dots are stripped before it
                               is stored on the result.
        :param nseqs: not used by this method.
        :param alignseqs: not used by this method.
        :param skip: not used by this method (never read or modified).

        :return: Typing object, or an Error object when no annotation
                 could be created.
        :raises Exception: when the sequence is not valid or the locus can
                           not be determined.
        """

        # TODO: Add full gene accession
        # TODO: reformt dbversion if missing .
        ac_object = Typing()
        ac_object.imgtdb_version = "".join(imgtdb_version.split("."))
        ac_object.pygfe_version = pygfe.__version__
        ac_object.seqann_version = seqann.__version__
        ac_object.gfedb_version = '0.0.2'

        # Biopython objects are reduced to their plain sequence string
        if isinstance(sequence, Seq):
            sequence = str(sequence)
        elif isinstance(sequence, SeqRecord):
            sequence = str(sequence.seq)

        if ac_object.imgtdb_version not in self.seqann:
            self.seqann.update({
                ac_object.imgtdb_version:
                BioSeqAnn(
                    dbversion=ac_object.imgtdb_version,
                    #store_features=self.store_features,
                    load_features=self.load_features,
                    cached_features=self.cached_features)
            })

        # No GFE notation can be created for a sequence that fails
        # checkseq, so it is rejected whether or not logging is verbose.
        if not checkseq(sequence):
            if self.verbose:
                self.logger.warning(self.logname + " Sequence alphabet " +
                                    "contains non DNA")
                self.logger.warning(self.logname +
                                    " No GFE string will be generated")
            raise Exception(
                "Input sequence was not valid! {}".format(sequence))

        # Check if the locus exists
        if not locus:
            if self.verbose:
                self.logger.info(self.logname + " No locus provided! ")

            # Guessing locus with blastn
            locus = get_locus(
                sequence,
                kir=self.kir,
                refdata=self.seqann[ac_object.imgtdb_version].refdata)

            if locus and self.verbose:
                self.logger.info(self.logname + " Locus prediction = " + locus)

            if not locus:
                if self.verbose:
                    self.logger.error(self.logname +
                                      " Locus could not be determined!")
                raise Exception(
                    "Locus could not be determined! {}".format(sequence))

        sequence = sequence.upper()
        sequence_typing = self.sequence_lookup(locus, sequence,
                                               ac_object.imgtdb_version)
        if sequence_typing:
            ac_object.status = "documented"
            ac_object.hla = sequence_typing[0]
            ac_object.gfe = sequence_typing[1]
            ac_object.closest_gfe = sequence_typing[1]
            ac_object.features = sequence_typing[2]

            if self.verbose:
                self.logger.info(self.logname + locus +
                                 " sequence documented for " + imgtdb_version +
                                 " | " + ac_object.gfe + " = " + ac_object.hla)

            return ac_object

        # time GFE creation
        time_start = time.time()
        gfe_o = self.gfe_create(locus, sequence, ac_object.imgtdb_version)
        # A falsy result (e.g. None) is treated like a missing annotation
        if not gfe_o or 'annotation' not in gfe_o:
            self.logger.error(self.logname +
                              "Failed to create annotation!!")
            error = Error("Failed to create annotation!!",
                          ac_object.pygfe_version, ac_object.gfedb_version,
                          imgtdb_version)
            return error

        if self.verbose:
            # time.time() differences are in seconds
            time_taken = int(time.time() - time_start)
            self.logger.info(self.logname + " gfe_create time for " +
                             locus + " " + imgtdb_version + " = " +
                             str(time_taken) + " seconds")
        annotation = gfe_o['annotation']
        ac_object.gfe = gfe_o['gfe']
        ac_object.features = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    sequence=f.sequence,
                    term=f.term) for f in gfe_o['structure']
        ]
        novel_features = self.unique_features(ac_object.features, locus,
                                              ac_object.imgtdb_version)
        if len(novel_features) != 0:
            if self.verbose:
                self.logger.info(self.logname + " # novel features = " +
                                 str(len(novel_features)))
            ac_object.novel_features = novel_features
            ac_object.status = "novel"
        else:
            self.logger.info(self.logname + " novel combination")
            ac_object.status = "novel_combination"

        # NOTE(review): find_similar and diff_seq receive the dotted
        # version while the calls above use the dot-free one - confirm
        # which form they expect.
        similar_results = self.find_similar(ac_object.gfe,
                                            ac_object.features,
                                            imgtdb_version)
        if similar_results:
            ac_object.hla = similar_results[0]
            ac_object.closest_gfe = similar_results[1]
            if self.seqann[ac_object.imgtdb_version].align:
                if self.verbose:
                    self.logger.info(self.logname +
                                     " finding sequence differences")
                ac_object.seqdiff = self.diff_seq(similar_results[0],
                                                  annotation,
                                                  imgtdb_version)
                ac_object.differences = len(ac_object.seqdiff)
        else:
            ac_object.hla = "NA"
            ac_object.closest_gfe = "NA"
            if self.verbose:
                self.logger.warning(self.logname + " No allele call made!")
        return ac_object
def gfeAnnotation_post(sequence, locus, gene=None, imgtdb_version="3.31.0"):
    """gfeAnnotation_post

        Annotate a sequence and return its features and GFE # noqa: E501

        :param sequence: request body; its ``'sequence'`` entry holds the
                         sequence string
        :param locus: Valid Locus
        :param gene: the string ``'KIR'`` (any case) selects the KIR
                     annotator; any other value selects the default (HLA)
        :param imgtdb_version: db version; a dot-free four character
                               version such as ``"3310"`` is reported back
                               as ``"3.31.0"``
        :return: Typing on success, otherwise a ``(Error, 404)`` tuple
                 carrying the captured log lines
        """
    global seqanns

    typing = Typing()
    sequence = SeqRecord(seq=Seq(sequence['sequence']))

    # Re-insert the dots of a dot-free version ("3310" -> "3.31.0")
    if "." not in imgtdb_version and len(imgtdb_version) == 4:
        imgtdb_version = ".".join([imgtdb_version[0],
                                   imgtdb_version[1:3],
                                   imgtdb_version[3]])

    db = "".join(imgtdb_version.split("."))
    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    # Capture INFO-and-above records emitted during this call so they can
    # be attached to an error response.
    ch = logging.StreamHandler(log_capture_string)
    ch.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)-35s - %(levelname)-5s '
        '- %(funcName)s %(lineno)d: - %(message)s'))
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    try:
        # TODO: Use `gene` or locus to figure out the gene-family
        is_kir = isinstance(gene, str) and gene.upper() == 'KIR'
        # KIR annotators get their own cache slot so that they are never
        # handed out for an HLA request of the same db version; HLA keeps
        # the plain db key.
        cache_key = db + "-KIR" if is_kir else db

        if cache_key in seqanns:
            seqann = seqanns[cache_key]
        else:
            if is_kir:
                seqann = BioSeqAnn(verbose=True, safemode=True,
                                   dbversion=db, verbosity=3, kir=True)
            else:
                # Defaults to HLA
                seqann = BioSeqAnn(verbose=True, safemode=True,
                                   dbversion=db, verbosity=3)
            seqanns.update({cache_key: seqann})

        try:
            annotation = seqann.annotate(sequence, locus)
        except Exception as e:
            print(e)
            log_contents = log_capture_string.getvalue()
            return Error("An error occurred during the annotation",
                         log=log_contents.split("\n")), 404

        if not annotation:
            log_contents = log_capture_string.getvalue()
            return Error("No annotation could be produced",
                         log=log_contents.split("\n")), 404

        if not hasattr(annotation, 'structure'):
            log_contents = log_capture_string.getvalue()
            return Error("No structure was produced",
                         log=log_contents.split("\n")), 404

        typing.features = [
            Feature(accession=f.accession, rank=f.rank,
                    term=f.term, sequence=f.sequence)
            for f in annotation.structure
        ]
        typing.gfe = annotation.gfe
        typing.imgtdb_version = imgtdb_version
        return typing
    finally:
        # The handler sits on the root logger; without this every call
        # would leave another handler (and its buffer) attached.
        logger.removeHandler(ch)
def annotate_get(sequence, locus=None, imgthla_version="3.31.0"):  # noqa: E501
    """annotate_get

    Annotate a sequence and return its features and GFE # noqa: E501

    :param sequence: Valid consensus sequence
    :type sequence: str
    :param locus: Valid locus
    :type locus: str
    :param imgthla_version: IMGT/HLA DB Version; a dot-free four character
        version such as ``"3310"`` is reported back as ``"3.31.0"``
    :type imgthla_version: str

    :return: Typing on success, otherwise a ``(Error, 404)`` tuple carrying
        the captured log lines
    :rtype: Typing
    """
    global seqanns

    typing = Typing()
    sequence = SeqRecord(seq=Seq(sequence))

    # Re-insert the dots of a dot-free version ("3310" -> "3.31.0")
    if "." not in imgthla_version and len(imgthla_version) == 4:
        imgthla_version = ".".join([
            imgthla_version[0], imgthla_version[1:3], imgthla_version[3]
        ])

    db = "".join(imgthla_version.split("."))
    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

    # Capture INFO-and-above records emitted during this call so they can
    # be attached to an error response.
    ch = logging.StreamHandler(log_capture_string)
    ch.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)-35s - %(levelname)-5s - %(funcName)s %(lineno)d: - %(message)s'
    ))
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    try:
        if db in seqanns:
            seqann = seqanns[db]
        else:
            seqann = BioSeqAnn(verbose=True,
                               safemode=True,
                               dbversion=db,
                               verbosity=3)
            seqanns.update({db: seqann})

        try:
            annotation = seqann.annotate(sequence, locus)
        except Exception as e:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not turned into an error response.
            print(e)
            log_contents = log_capture_string.getvalue()
            return Error("An error occured during the annotation",
                         log=log_contents.split("\n")), 404

        if not annotation:
            log_contents = log_capture_string.getvalue()
            return Error("No annotation could be produced",
                         log=log_contents.split("\n")), 404

        if not hasattr(annotation, 'structure'):
            log_contents = log_capture_string.getvalue()
            return Error("No structure was produced",
                         log=log_contents.split("\n")), 404

        typing.features = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence) for f in annotation.structure
        ]
        typing.gfe = annotation.gfe
        typing.imgtdb_version = imgthla_version
        return typing
    finally:
        # The handler sits on the root logger; without this every call
        # would leave another handler (and its buffer) attached.
        logger.removeHandler(ch)