Exemple #1
0
    def __init__(self, url="http://feature.nmdp-bioinformatics.org",
                 loci=None,
                 load_features=False, store_features=False,
                 cached_features=None,
                 verbose=False,
                 pid="NA",
                 verbosity=0):
        """
        Set up the feature-service client and the per-locus feature table.

        :param url: Host passed to ``ApiClient`` for the feature service.
        :type url: ``str``
        :param loci: Loci to track; each becomes a key of ``all_feats``.
            When ``None`` (the default) the built-in HLA/KIR list is used.
        :type loci: ``List``
        :param load_features: When true and no ``cached_features`` are
            given, ``self.load_features()`` is called at the end of
            construction.
        :type load_features: ``bool``
        :param store_features: Stored on the instance as ``store_features``.
        :type store_features: ``bool``
        :param cached_features: When truthy, replaces the empty per-locus
            ``all_feats`` table.
        :type cached_features: ``Dict``
        :param verbose: Flag for running in verbose mode.
        :type verbose: ``bool``
        :param pid: Identifier embedded in the log-message prefix.
        :type pid: ``str``
        :param verbosity: Numerical value stored as ``verbosity``.
        :type verbosity: ``int``
        """
        # A list literal in the signature would be created once and shared
        # by every instance that relies on the default, so mutating
        # ``self.loci`` on one object would leak into all later ones.
        # Build a fresh list per call instead.
        if loci is None:
            loci = [
                'KIR2DP1', 'KIR2DL5A', 'KIR2DS4', 'HLA-DPA1', 'HLA-DQA1',
                'HLA-DPB1', 'KIR2DS2', 'KIR3DP1', 'HLA-DRB4', 'KIR2DL1',
                'KIR2DS5', 'HLA-DRB3', 'KIR2DS3', 'KIR3DL1', 'HLA-A',
                'HLA-DRB5', 'KIR2DL4', 'HLA-DQB1', 'KIR3DL2', 'HLA-B',
                'KIR3DS1', 'KIR2DL5B', 'HLA-DRB1', 'KIR3DL3', 'KIR2DS1',
                'HLA-C',
            ]

        self.loci = loci
        self.verbose = verbose
        self.verbosity = verbosity
        self.store_features = store_features
        self.logger = logging.getLogger("Logger." + __name__)
        # Prefix prepended to every log message emitted by this object
        self.logname = "ID {:<10} - ".format(str(pid))
        client = ApiClient(host=url)
        api_instance = FeaturesApi(api_client=client)
        self.api = api_instance
        # One empty feature table per locus
        self.all_feats = {loc: {} for loc in loci}
        self.structures = get_structures()
        self.struct_order = get_structorder()

        # Cached features take precedence over the empty table and
        # over loading from the service.
        if cached_features:
            if verbose:
                self.logger.info(self.logname + "Using cached features")
            self.all_feats = cached_features

        # Load all features from feature service
        if load_features and not cached_features:
            if verbose:
                self.logger.info(self.logname + "Loading features...")

            # Calling load_features() to load
            # features at each locus
            self.load_features()
Exemple #2
0
    def __init__(self,
                 server: BioSeqDatabase = None,
                 datafile: str = None,
                 dbversion: str = '3310',
                 alleles: List = None,
                 seqdata: Dict = None,
                 hladata: Dict = None,
                 featuredata=None,
                 kir: bool = False,
                 alignments: bool = False,
                 verbose: bool = False,
                 verbosity: int = 0):
        """
        ReferenceData - a model defined in Swagger

        Populates the allele names, feature lengths, gene structures,
        optional annotated alignments and the reference sequence tables.
        Each table can be injected through a parameter; otherwise it is
        read from the ``../data`` directory next to this module.

        :param server: The server of this ReferenceData. When truthy the
            dat file is neither downloaded nor parsed.
        :type server: BioSeqDatabase
        :param datafile: The datafile of this ReferenceData.
        :type datafile: str
        :param dbversion: The dbversion of this ReferenceData; used to build
            the download URL and the names of the data files.
        :type dbversion: str
        :param alleles: Allele names to use instead of reading the
            allele-list file.
        :type alleles: List
        :param seqdata: Pre-built sequence -> allele-name table. Only used
            when ``hladata`` is given as well.
        :type seqdata: Dict
        :param hladata: Pre-built allele-name -> record table. Only used
            when ``seqdata`` is given as well.
        :type hladata: Dict
        :param featuredata: Feature-length table to use instead of reading
            the feature-length CSV file.
        :param kir: Use the KIR data files instead of the HLA ones.
        :type kir: bool
        :param alignments: Load the pickled annotated alignments for
            ``dbversion`` and derive their alignment coordinates.
        :type alignments: bool
        :param verbose: Flag for running in verbose mode.
        :type verbose: bool
        :param verbosity: Numerical value to indicate how verbose the
            output will be in verbose mode.
        :type verbosity: int
        """
        self.data_types = {
            'server': BioSeqDatabase,
            'datafile': str,
            'dbversion': str,
            'hla_names': List[str],
            'feature_lengths': Dict,
            'hlaref': Dict,
            'seqref': Dict,
            'structure_max': Dict,
            'struct_order': Dict,
            'structures': Dict,
            'blastdb': str,
            'server_avail': bool,
            'verbose': bool,
            'verbosity': int,
            'alignments': bool
        }

        self.attribute_map = {
            'seqdata': 'seqdata',
            'hlaref': 'hlaref',
            'seqref': 'seqref',
            'server': 'server',
            'datafile': 'datafile',
            'dbversion': 'dbversion',
            'hla_names': 'hla_names',
            'structure_max': 'structure_max',
            'feature_lengths': 'feature_lengths',
            'struct_order': 'struct_order',
            'structures': 'structures',
            'blastdb': 'blastdb',
            'hla_loci': 'hla_loci',
            'server_avail': 'server_avail',
            'kir': 'kir',
            'alignments': 'alignments',
            'verbose': 'verbose',
            'verbosity': 'verbosity'
        }
        self._seqref = {}
        self._hlaref = {}
        self._kir = kir
        self._verbose = verbose
        self._verbosity = verbosity
        self._dbversion = dbversion
        self._server = server
        self._datafile = datafile
        self._alignments = alignments
        self._server_avail = True if server else False

        self.logger = logging.getLogger("Logger." + __name__)

        hla_url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/' \
            + dbversion + '/hla.dat'
        kir_url = 'ftp://ftp.ebi.ac.uk/pub/databases/ipd/kir/KIR.dat'
        hla_loci = [
            'HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1', 'HLA-DPB1',
            'HLA-DQA1', 'HLA-DPA1', 'HLA-DRB3', 'HLA-DRB4', 'HLA-DRB5'
        ]

        if self.verbose and verbosity > 0:
            self.logger.info("IPD-IMGT/HLA release = " + str(dbversion))
            self.logger.info("HLA URL = " + hla_url)
            self.logger.info("KIR URL = " + kir_url)
            if self.server_avail:
                self.logger.info("Using BioSQL Server")
                # NOTE(review): this statement was mangled in the source as
                # received; `biosqluser` is presumed to be a module-level
                # setting like biosqlhost/biosqldb/biosqlport - confirm.
                self.logger.info("BIOSQLUSER = " + biosqluser)
                self.logger.info("BIOSQLHOST = " + biosqlhost)
                self.logger.info("BIOSQLDB = " + biosqldb)
                self.logger.info("BIOSQLPORT = " + str(biosqlport))

        # TODO: ** Have script seqann --setup (--latest|--release|--all)
        #           - downloads and creates all files
        #           - removes all data files except alignment files
        #           - Creates blast db
        #
        # TODO: Download! Don't have in package!
        hla_names = []
        data_dir = os.path.dirname(__file__)
        if kir:
            blastdb = data_dir + '/../data/blast/KIR'
            allele_list = data_dir + '/../data/allele_lists/Allelelist.' \
                                   + 'KIR.txt'
        else:
            blastdb = data_dir + '/../data/blast/' + dbversion
            allele_list = data_dir + '/../data/allele_lists/Allelelist.' \
                                   + dbversion + '.txt'

        if alleles:
            self._hla_names = alleles
        else:
            # Read the allele list: one "<accession> <name>" pair per line.
            # HLA names get the "HLA-" prefix, KIR names are used as-is.
            try:
                with open(allele_list, 'r') as f:
                    for line in f:
                        line = line.rstrip()
                        accession, name = line.split(" ")
                        if not kir:
                            hla_names.append("HLA-" + name)
                        else:
                            hla_names.append(name)
                if self.verbose and verbosity > 0:
                    self.logger.info("Loaded " + str(len(hla_names)) +
                                     " allele names")
            except OSError as err:
                # A missing/unreadable list is logged and leaves whatever
                # names were read so far (possibly none).
                self.logger.error("OS error: {0}".format(err))
            except Exception:
                # The message needs a placeholder for the extra argument,
                # otherwise logging reports a formatting error instead of
                # the actual exception type.
                self.logger.error("Unexpected error: %s", sys.exc_info()[0])
                raise
            self._hla_names = hla_names

        #if self.verbose:
        #    mem = "{:4.4f}".format(sys.getsizeof(self.all_feats) / 1000000)
        #    self.logger.info(self.logname + "Finished loading all features * all_feats = " + mem + " MB *")

        feature_lengths = {}

        if kir:
            featurelength_file = data_dir + "/../data/kir-feature_lengths.csv"
        else:
            featurelength_file = data_dir + "/../data/feature_lengths.csv"

        if featuredata:
            self._feature_lengths = featuredata
        else:
            # TODO: use pandas
            # Builds {locus: {feature: [mean, std, min, max]}}; the values
            # are kept as the strings read from the CSV.
            try:
                columns = ['mean', 'std', 'min', 'max']
                with open(featurelength_file, newline='') as csvfile:
                    reader = csv.DictReader(csvfile)
                    for row in reader:
                        ldata = [row[c] for c in columns]
                        feature_lengths.setdefault(
                            row['locus'], {})[row['feature']] = ldata
            except OSError as err:
                self.logger.error("OS error: {0}".format(err))
            except Exception:
                self.logger.error("Unexpected error: %s", sys.exc_info()[0])
                raise

            self._feature_lengths = feature_lengths

        self._blastdb = blastdb
        self._hla_loci = hla_loci

        self._structures = get_structures()
        self._struct_order = get_structorder()

        self._structure_max = {
            'KIR2DP1': 20,
            'KIR2DL5A': 20,
            'KIR2DS4': 20,
            'HLA-DPA1': 9,
            'HLA-DQA1': 9,
            'KIR2DL2': 20,
            'HLA-DPB1': 11,
            'KIR2DS2': 20,
            'KIR3DP1': 20,
            'HLA-DRB4': 13,
            'KIR2DL1': 20,
            'KIR2DS5': 20,
            'HLA-DRB3': 13,
            'KIR2DS3': 20,
            'KIR3DL1': 20,
            'HLA-A': 17,
            'HLA-DRB5': 13,
            'KIR2DL4': 20,
            'HLA-DQB1': 13,
            'KIR3DL2': 20,
            'HLA-B': 15,
            'KIR3DS1': 20,
            'KIR2DL5B': 20,
            'HLA-DRB1': 13,
            'KIR3DL3': 20,
            'KIR2DS1': 20,
            'HLA-C': 17
        }

        # Starting location of sequence for IPD-IMGT/HLA alignments
        self.location = {
            "HLA-A": -300,
            "HLA-B": -284,
            "HLA-C": -283,
            "HLA-DRB1": -599,
            "HLA-DRB3": -327,
            "HLA-DRB4": -313,
            "HLA-DQB1": -525,
            "HLA-DPB1": -366,
            "HLA-DPA1": -523,
            "HLA-DQA1": -746
        }

        self.align_coordinates = {}
        self.annoated_alignments = {}
        if alignments:
            pickle_dir = data_dir + '/../data/alignments/' + dbversion
            pickle_files = glob.glob(pickle_dir + '/*.pickle')
            for pickle_file in pickle_files:
                # The locus is the file name up to the first "." or "_";
                # basename() also copes with platform path separators.
                locus = os.path.basename(pickle_file) \
                    .split(".")[0].split("_")[0]
                if self.verbose:
                    self.logger.info("Loading " + pickle_file)
                # SECURITY: unpickling executes arbitrary code; these files
                # must only ever come from the trusted package data dir.
                with open(pickle_file, 'rb') as handle:
                    self.annoated_alignments.update(
                        {locus: pickle.load(handle)})
                # Any allele will do: only the feature lengths of the first
                # aligned entry are used to lay out the coordinates.
                allele = list(self.annoated_alignments[locus].keys())[0]
                if locus not in self.align_coordinates \
                        and "HLA-" + locus in self.struct_order:
                    start = 0
                    feat_order = sorted(
                        self.struct_order["HLA-" + locus].keys())
                    self.align_coordinates.update({locus: {}})
                    if self.verbose and self.verbosity > 2:
                        self.logger.info("* Alignment coordinates *")
                    # Walk the features in structural order and tag every
                    # alignment column with the feature that covers it.
                    for i in feat_order:
                        feat = self.struct_order["HLA-" + locus][i]
                        seq = self.annoated_alignments[locus][allele][feat][
                            'Seq']
                        end = start + len(seq)
                        if self.verbose and self.verbosity > 2:
                            self.logger.info(feat + " start = " + str(start) +
                                             " | end = " + str(end))
                        for j in range(start, end):
                            self.align_coordinates[locus][j] = feat
                        start = end

        # If no server is provided
        # download the dat file
        if seqdata and hladata:
            self._hlaref = hladata
            self._seqref = seqdata
        elif not self._server_avail:
            if kir:
                datfile = data_dir + '/../data/KIR.dat'
            else:
                datfile = data_dir + '/../data/' + dbversion + '.hla.dat'

            if not os.path.isfile(datfile):
                if kir:
                    if self.verbose:
                        self.logger.info("Downloading KIR data file - " +
                                         datfile)
                    download_dat(kir_url, datfile)
                else:
                    if self.verbose:
                        self.logger.info("Downloading HLA data file - " +
                                         datfile)
                    download_dat(hla_url, datfile)

            # Load HLA dat file
            # NOTE(review): the cache file names depend only on dbversion,
            # so KIR and HLA runs with the same dbversion appear to share
            # these pickles - confirm this is intended.
            seqref_pickle = data_dir \
                + '/../data/seqref.' + dbversion + ".pickle"

            hlaref_pickle = data_dir \
                + '/../data/hlaref.' + dbversion + ".pickle"

            if not os.path.isfile(seqref_pickle) or \
                    not os.path.isfile(hlaref_pickle):

                # Parse the dat file and keep only records whose locus has
                # a known structure; then cache both tables as pickles.
                records = SeqIO.parse(datfile, "imgt")
                for seqrec in records:
                    seqname = seqrec.description.split(",")[0]
                    locus = seqname.split("*")[0]
                    if locus in self.structure_max:
                        self._hlaref.update({seqname: seqrec})
                        self._seqref.update({str(seqrec.seq): seqname})

                if self.verbose:
                    self.logger.info("Finished loading dat file")
                    self.logger.info("Writing pickle of dat file")

                with open(seqref_pickle, 'wb') as handle:
                    pickle.dump(self._seqref,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                with open(hlaref_pickle, 'wb') as handle:
                    pickle.dump(self._hlaref,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
            else:
                if self.verbose:
                    self.logger.info("Loading pickle dat file")
                # SECURITY: see the note on pickle.load above; only load
                # cache files written by this package.
                with open(seqref_pickle, 'rb') as handle:
                    self._seqref = pickle.load(handle)
                with open(hlaref_pickle, 'rb') as handle:
                    self._hlaref = pickle.load(handle)
Exemple #3
0
def resolve_feats(feat_list,
                  seqin,
                  seqref,
                  start,
                  locus,
                  missing,
                  verbose=False,
                  verbosity=0):
    """
    resolve_feats - Resolves features from alignments

    Features listed in ``missing`` are re-located on the ungapped input
    sequence: alignment gaps ("-") seen so far are subtracted from the
    feature positions and ``start`` is added to them.

    :param feat_list: List of the found features; must hold exactly one
        mapping of feature name to feature, which is updated in place with
        the re-located features.
    :type feat_list: ``List``
    :param seqin: The input sequence
    :type seqin: ``str``
    :param seqref: The reference sequence (not used by this function)
    :param locus: The input locus
    :type locus: ``str``
    :param start: Where the sequence start in the alignment
    :type start: ``int``
    :param missing: List of the unmapped features
    :type missing: ``List``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :rtype: :ref:`ann`
    :return: An annotation of the resolved features, or an annotation with
        ``complete_annotation=False`` when ``feat_list`` does not contain
        exactly one entry or nothing could be resolved.
    """
    structures = get_structures()
    logger = logging.getLogger("Logger." + __name__)
    seq = SeqRecord(seq=Seq("".join(seqin), SingleLetterAlphabet()))

    # Exactly one feature mapping is supported. An empty list is rejected
    # here as well instead of failing with an IndexError further down.
    if len(feat_list) != 1:
        if verbose:
            logger.error("resolve_feats error")
        return Annotation(complete_annotation=False)

    seq_len = len(seq.seq)
    # Positions not yet covered by a resolved feature
    coordinates = dict.fromkeys(range(seq_len + 1), 1)
    # Position -> feature name (1 until a feature claims the position)
    mapping = dict.fromkeys(range(seq_len + 1), 1)

    features = feat_list[0]
    full_annotation = {}

    # Running count of alignment gaps seen in all features so far
    diff = 0
    # True until the first resolved feature that follows a gap; that one
    # feature keeps its unshifted start position.
    first_shift = True

    # Features have to be visited in structural order so that the gap
    # count only reflects what lies upstream of the current feature.
    ordered_feats = sorted(features.keys(),
                           key=lambda name: structures[locus][name])

    for feat in ordered_feats:
        f = features[feat]
        seqrec = f.extract(seq)
        gapped = str(seqrec.seq)
        gaps = gapped.count("-")
        diff += gaps

        # Already-annotated features only contribute their gap count
        if feat not in missing:
            continue

        if gaps:
            seqrec.seq = Seq(gapped.replace("-", ""), IUPAC.unambiguous_dna)

        # A feature made up of gaps only cannot be placed
        if not seqrec.seq:
            continue

        if first_shift and diff > 0:
            sp = f.location.start + start
            first_shift = False
        else:
            sp = f.location.start + start - diff
        ep = f.location.end + start - diff

        featn = SeqFeature(FeatureLocation(ExactPosition(sp),
                                           ExactPosition(ep),
                                           strand=1),
                           type=f.type)

        features.update({feat: featn})
        full_annotation.update({feat: seqrec})

        for i in range(featn.location.start, featn.location.end):
            coordinates.pop(i, None)
            mapping[i] = feat

    blocks = getblocks(coordinates)
    rmapping = {k + start: v for k, v in mapping.items()}

    # Report the outcome; "Failed to resolve" is only logged when nothing
    # at all was resolved.
    if verbose:
        if full_annotation:
            if verbosity > 0:
                logger.info("Features resolved:")
                for f in full_annotation:
                    logger.info(f)
        else:
            logger.info("Failed to resolve")

    if not full_annotation:
        if verbose:
            logger.info("Failed to align missing features")
        return Annotation(complete_annotation=False)

    return Annotation(annotation=full_annotation,
                      method="clustalo",
                      features=features,
                      mapping=rmapping,
                      blocks=blocks,
                      seq=seq)
Exemple #4
0
    def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
        """
        search_seqs - method for annotating a BioPython sequence without alignment

        :param seqrec: The reference sequence
        :type seqrec: SeqRecord
        :param locus: The gene locus associated with the sequence.
        :type locus: str
        :param in_seq: The input sequence
        :type in_seq: SeqRecord
        :param run: The number of runs that have been done
        :type run: int
        :param partial_ann: A partial annotation from a previous step
        :type partial_ann: :ref:`ann`
        :rtype: :ref:`ann`

        Example usage:

            >>> from Bio.Seq import Seq
            >>> from seqann.seq_search import SeqSearch
            >>> inseq = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
            >>> sqsrch = SeqSearch()
            >>> ann = sqsrch.search_seqs(refseqs, inseq)

        """
        # Extract out the sequences and feature names
        # from the reference sequences

        # The mapped features will be subtracted from seq_covered
        # so the final seq_covered number will reflect the remaining
        # number of base pairs that haven't been mapped.
        #
        # The coordinates and mapping will help determine what positions
        # in the sequence have been mapped and to what features. The
        # missing blocks variable will be generated using these.
        structures = get_structures()
        seq_covered = len(in_seq.seq)
        coordinates = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        mapping = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        ambig_map = {}
        found_feats = {}
        feat_missing = {}

        method = "nt_search" if not partial_ann else partial_ann.method

        # If the partial annotation is provided
        # then make the found_feats equal to
        # what has already been annotated
        feats = get_features(seqrec)
        if partial_ann:

            found_feats = partial_ann.features

            if self.verbose and self.verbosity > 4:
                self.logger.info("Found partial features:")
                for f in found_feats:
                    self.logger.info(f)

            # Skip references that only have features
            # that have already been annoated
            if len([f for f in feats if f in found_feats]) == len(feats):
                if self.verbose:
                    self.logger.info("Skipping incomplete refseq")
                return partial_ann

            if self.verbose and self.verbosity > 1:
                self.logger.info("Using partial annotation | " + locus + " " +
                                 str(len(partial_ann.features)))

            coordinates = dict(
                map(lambda l: [l, 1], [
                    item for sublist in partial_ann.blocks for item in sublist
                ]))
            seq_covered = partial_ann.covered
            mapping = partial_ann.mapping

            if self.verbose and self.verbosity > 2:
                self.logger.info("Partial sequence coverage = " +
                                 str(seq_covered))
                self.logger.info("Partial sequence metho = " + method)

        added_feat = {}
        deleted_coords = {}
        for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

            # skip if partial annotation is provided
            # and the feat name is not one of the
            # missing features
            if partial_ann and feat_name not in partial_ann.refmissing:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Skipping " + feat_name +
                                     " - Already annotated")
                continue

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running seqsearch for " + feat_name)

            # Search for the reference feature sequence in the
            # input sequence. Record the coordinates if it's
            # found and if it's found in multiple spots. If it
            # is not found, then record that feature as missing.
            seq_search = nt_search(str(in_seq.seq), str(feats[feat_name]))

            if len(seq_search) == 2:

                if self.verbose and self.verbosity > 0:
                    self.logger.info("Found exact match for " + feat_name)

                seq_covered -= len(str(feats[feat_name]))
                end = int(len(str(feats[feat_name])) + seq_search[1])

                if feat_name == 'three_prime_UTR' \
                        and len(str(in_seq.seq)) > end:
                    end = len(str(in_seq.seq))

                # If the feature is found and it's a five_prime_UTR then
                # the start should always be 0, so insertions at the
                # beinging of the sequence will be found.
                start = seq_search[1] if feat_name != 'five_prime_UTR' else 0
                si = seq_search[1]+1 if seq_search[1] != 0 and \
                    feat_name != 'five_prime_UTR' else 0

                # check if this features has already been mapped
                mapcheck = set(
                    [0 if i in coordinates else 1 for i in range(si, end + 1)])

                # Dont map features if they are out of order
                skip = False
                if found_feats and len(found_feats) > 0:
                    for f in found_feats:
                        o1 = structures[locus][feat_name]
                        o2 = structures[locus][f]
                        loctyp = loctype(found_feats[f].location.start,
                                         found_feats[f].location.end, start,
                                         end)

                        if o1 < o2 and loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)
                        elif o2 < o1 and not loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)

                if 1 not in mapcheck and not skip:
                    for i in range(si, end + 1):
                        if i in coordinates:
                            if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                                deleted_coords.update({i: coordinates[i]})
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here " + locus +
                                    " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update({
                        feat_name:
                        SeqFeature(FeatureLocation(ExactPosition(start),
                                                   ExactPosition(end),
                                                   strand=1),
                                   type=feat_name)
                    })

                    if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                        added_feat.update({feat_name: feats[feat_name]})
                    if self.verbose and self.verbosity > 3:
                        self.logger.info("Coordinates | Start = " +
                                         str(start) + " - End = " + str(end))

            elif (len(seq_search) > 2):
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Found " + str(len(seq_search)) +
                                     " matches for " + feat_name)

                new_seq = [seq_search[0]]
                for i in range(1, len(seq_search)):
                    tnp = seq_search[i] + 1
                    if seq_search[i] in coordinates or tnp in coordinates:
                        new_seq.append(seq_search[i])

                seq_search = new_seq
                if (partial_ann and feat_name == "exon_8" and run > 0):
                    missing_feats = sorted(list(partial_ann.missing.keys()))

                    # * HARD CODED LOGIC * #
                    # > exon8 in class I maps to multiple spots in a sequence,
                    #   often in the 3' UTR. These features need to be mapped
                    #   last to make sure it's not mapping exon8 incorrectly.
                    if (missing_feats == ['exon_8', 'three_prime_UTR']
                            and len(seq_search) <= 3):
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Resolving exon_8")

                        seq_covered -= len(str(feats[feat_name]))
                        end = int(len(str(feats[feat_name])) + seq_search[1])

                        # If the feature is found and it's a five_prime_UTR then
                        # the start should always be 0, so insertions at the
                        # beinging of the sequence will be found.
                        start = seq_search[1]
                        si = seq_search[1] + 1 if seq_search[1] != 0 else 0

                        # check if this features has already been mapped
                        mapcheck = set([
                            0 if i in coordinates else 1
                            for i in range(si, end + 1)
                        ])

                        for i in range(si, end + 1):
                            if i in coordinates:
                                del coordinates[i]
                            else:
                                if self.verbose:
                                    self.logger.error(
                                        "seqsearch - should't be here " +
                                        locus + " - " + " - " + feat_name)
                            mapping[i] = feat_name

                        found_feats.update({
                            feat_name:
                            SeqFeature(FeatureLocation(ExactPosition(start),
                                                       ExactPosition(end),
                                                       strand=1),
                                       type=feat_name)
                        })

                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Coordinates | Start = " +
                                             str(start) + " - End = " +
                                             str(end))
                    else:
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Adding ambig feature " +
                                             feat_name)
                        feat_missing.update({feat_name: feats[feat_name]})
                        ambig_map.update(
                            {feat_name: seq_search[1:len(seq_search)]})
                else:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature " + feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update(
                        {feat_name: seq_search[1:len(seq_search)]})
            else:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("No match for " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})

        blocks = getblocks(coordinates)
        exact_matches = list(found_feats.keys())

        # * HARD CODED LOGIC * #
        # >
        #
        #  HLA-DRB1 exon3 exact match - with intron1 and 3 missing
        if ('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1'
                and 'exon_2' in feat_missing
                and (len(blocks) == 1 or len(blocks) == 2)):

            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    featname = "intron_3"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                else:
                    featname = "exon_2"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class DRB1 II sequence")

                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        # If it's a class II sequence and
        # exon_2 is an exact match
        # * HARD CODED LOGIC * #
        # > It's common for exon2 to be fully sequenced
        #   but intron_2 and intron_1 to be partially sequenced,
        #   which can make it hard to annotate those to features.
        #   If there are two missing blocks that is small enough
        #   and they are before and after exon2, then it's very
        #   very likely to be intron_2 and intron_1.
        if 'exon_2' in exact_matches and len(blocks) == 2 \
                and is_classII(locus) and seq_covered < 300:

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running search for class II sequence")

            r = True
            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    x = b[0] - 1
                else:
                    x += 1
                f = mapping[x]
                if f != 'exon_2':
                    r = False
            if r:
                for b in blocks:
                    x = b[len(b) - 1]
                    if x == max(list(mapping.keys())):
                        featname = "intron_2"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    else:
                        featname = "intron_1"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class II sequence")

                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        annotated_feats, mb, mapping = self._resolve_unmapped(
            blocks, feat_missing, ambig_map, mapping, found_feats, locus,
            seq_covered)

        # * HARD CODED LOGIC * #
        if (not mb and blocks and len(feat_missing.keys()) == 0
                and len(ambig_map.keys()) == 0):
            mb = blocks

        if mb:

            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                    and 'exon_8' in exact_matches:
                for i in deleted_coords:
                    mapping[i] = 1
                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)

                # Delte from found features
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']

                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            refmissing = [
                f for f in structures[locus] if f not in annotated_feats
            ]

            if self.verbose and self.verbosity > 1:
                self.logger.info("* Annotation not complete *")

            # Print out what features were missing by the ref
            if self.verbose and self.verbosity > 2:
                self.logger.info("Refseq was missing these features = " +
                                 ",".join(list(refmissing)))

            # Print out what features were ambig matches
            if self.verbose and self.verbosity > 1 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 2 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 1 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 1 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    blocks=mb,
                                    method=method,
                                    refmissing=refmissing,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)
        else:

            mb = None
            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                    and 'exon_8' in exact_matches \
                    and 'three_prime_UTR' in annotated_feats\
                    and 'three_prime_UTR' not in exact_matches:

                for i in deleted_coords:
                    mapping[i] = 1

                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']
                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            if self.verbose:
                self.logger.info("* No missing blocks after seq_search *")

            # Print out what features were ambig matches
            if self.verbose and self.verbosity > 0 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 0 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 0 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 0 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    method=method,
                                    blocks=mb,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)

        return annotation
Exemple #5
0
    def _resolve_unmapped(self,
                          blocks,
                          feat_missing,
                          ambig_map,
                          mapping,
                          found_feats,
                          loc,
                          covered,
                          rerun=False):
        """
        Try to assign still-unmapped sequence blocks to features by
        looking at the features that flank each block.

        A block is assigned to a feature when the features found directly
        before and/or after the block are the ones the locus structure
        expects to surround that feature.  Ambiguously matched features
        (``ambig_map``) are tried first, then the features that had no
        match at all (``feat_missing``).

        :param blocks: Blocks of unmapped positions; each block is an
            indexable, iterable run of positions (first and last element
            are used as the block boundaries).
        :param feat_missing: Dictionary keyed by the names of the
            features that have not been located yet (only the keys are
            read here).
        :param ambig_map: Dictionary of feature name -> candidate start
            positions of the ambiguous matches for that feature.
        :param mapping: Dictionary of position -> feature name.  Blocks
            resolved from ``feat_missing`` are written back into it.
            Values are compared against ``1``, which the caller uses for
            positions that are not mapped to a feature.
        :param found_feats: Dictionary of feature name -> ``SeqFeature``;
            updated in place with every feature that gets resolved.
        :param loc: Locus name used to index the structure tables.
        :param covered: Coverage counter of the sequence; only used to
            decide how far away the flanking features are looked up.
        :param rerun: ``True`` when this is the second pass triggered by
            a first pass that left blocks unresolved.
        :return: Tuple ``(found_feats, missing_blocks, mapping)`` where
            ``missing_blocks`` lists the blocks that are still unmapped.
        """

        def _feature(name, start, end):
            # Build the forward-strand SeqFeature for a resolved block
            return SeqFeature(FeatureLocation(ExactPosition(start),
                                              ExactPosition(end),
                                              strand=1),
                              type=name)

        def _mark_missing(block):
            # Record a block as unresolved, once
            if block not in missing_blocks:
                missing_blocks.append(block)

        structures = get_structures()
        struct_order = get_structorder()
        structure_max = get_structmax()

        exon_only = True
        found_exons = 0
        for f in found_feats:
            if re.search("intron", f) or re.search("UTR", f):
                exon_only = False

            if re.search("exon", f):
                found_exons += 1

        # Count the number of exons for the given loci
        num_exons = sum(1 for f in structures[loc] if re.search("exon", f))

        # If all exons have been mapped
        # then it is not exon only data
        if found_exons == num_exons:
            exon_only = False

        # If it's exon only, then search two
        # features up rather than one
        add_num = 2 if exon_only and rerun and covered < 300 else 1

        block_mapped = []
        missing_blocks = []

        # ---- Pass 1: features with ambiguous matches ---- #
        for b in blocks:
            for featname in ambig_map.keys():
                locats = ambig_map[featname]
                start_i = b[0] - 1
                end_i = b[-1] + 1

                # TODO: Catch ERROR
                #if not end_i in mapping:
                feat_num = structures[loc][featname]
                prev_num = feat_num - add_num
                next_num = feat_num + add_num

                if next_num <= structure_max[loc] \
                        and prev_num >= 0 and start_i >= 0 \
                        and end_i <= len(mapping) - 1 \
                        and prev_num in struct_order[loc]:
                    # Feature has an expected neighbour on both sides
                    expected_p = struct_order[loc][prev_num]
                    expected_n = struct_order[loc][next_num]
                    previous_feat = mapping[start_i]
                    next_feat = mapping[end_i]
                    if expected_p == previous_feat \
                            and expected_n == next_feat \
                            and expected_p != 1 \
                            and b[0] - 1 in locats:
                        block_mapped.append(b)
                        found_feats[featname] = _feature(featname,
                                                         b[0] - 1, b[-1])
                elif next_num > structure_max[loc] \
                        and prev_num >= 0 and start_i >= 0 \
                        and end_i >= max(mapping) \
                        and prev_num in struct_order[loc]:
                    # Last feature(s) of the structure: only the
                    # previous neighbour can be checked
                    expected_p = struct_order[loc][prev_num]
                    previous_feat = mapping[start_i]
                    if expected_p == previous_feat \
                            and expected_p != 1 \
                            and b[0] - 1 in locats:
                        block_mapped.append(b)
                        found_feats[featname] = _feature(featname,
                                                         b[0] - 1, b[-1])
                # NOTE(review): this tests the *previous* index although
                # the branch reads the *next* one; it looks like it was
                # meant to test ``next_num``.  Kept as in the original so
                # that annotation results do not change - confirm intent.
                elif next_num <= structure_max[loc] \
                        and prev_num < 0 \
                        and prev_num in struct_order[loc]:
                    # First feature(s) of the structure: only the
                    # next neighbour can be checked
                    expected_n = struct_order[loc][next_num]
                    if end_i not in mapping:
                        next_feat = mapping[end_i - 1]
                    else:
                        next_feat = mapping[end_i]
                    # There is no previous feature in this branch, so the
                    # guard is on expected_n (the original referenced
                    # expected_p, which is never assigned here).
                    if expected_n == next_feat \
                            and expected_n != 1 \
                            and b[0] - 1 in locats:
                        block_mapped.append(b)
                        found_feats[featname] = _feature(featname,
                                                         b[0] - 1, b[-1])
                else:
                    missing_blocks.append(b)

        # ---- Pass 2: features without any match ---- #
        for b in blocks:
            for featname in feat_missing.keys():
                # Block may get mapped while looping over the features
                if b in block_mapped:
                    continue

                # Ambiguous features were already handled above
                if featname in ambig_map:
                    _mark_missing(b)
                    continue

                #featlen = feat_missing[featname]
                start_i = b[0] - 1
                end_i = b[-1] + 1
                feat_num = structures[loc][featname]

                if feat_num + add_num <= structure_max[loc] \
                        and feat_num - 1 >= 1 \
                        and end_i <= max(mapping.keys()) \
                        and start_i >= 0 \
                        and feat_num - add_num > 0:
                    # Feature has an expected neighbour on both sides
                    expected_p = struct_order[loc][feat_num - add_num]
                    expected_n = struct_order[loc][feat_num + add_num]
                    previous_feat = mapping[start_i]
                    next_feat = mapping[end_i]
                    if expected_p == previous_feat \
                            and expected_n == next_feat \
                            and expected_p != 1 \
                            and expected_n != 1:
                        if b in missing_blocks:
                            missing_blocks.remove(b)
                        for i in b:
                            mapping[i] = featname
                        block_mapped.append(b)
                        found_feats[featname] = _feature(featname,
                                                         b[0] - 1, b[-1])
                    else:
                        _mark_missing(b)
                elif feat_num + add_num > structure_max[loc] \
                        and feat_num - add_num >= 1 and start_i >= 0:
                    # Last feature(s) of the structure: only the
                    # previous neighbour can be checked
                    expected_p = struct_order[loc][feat_num - add_num]
                    previous_feat = mapping[start_i]
                    if expected_p == previous_feat \
                            and expected_p != 1:
                        if b in missing_blocks:
                            missing_blocks.remove(b)
                        block_mapped.append(b)
                        for i in b:
                            mapping[i] = featname
                        found_feats[featname] = _feature(featname,
                                                         b[0] - 1, b[-1])
                    else:
                        _mark_missing(b)
                elif (feat_num + add_num <= structure_max[loc]
                      and feat_num - add_num < 1 and end_i <= max(mapping)
                      and end_i in mapping):
                    # First feature(s) of the structure: only the
                    # next neighbour can be checked
                    expected_n = struct_order[loc][feat_num + add_num]
                    next_feat = mapping[end_i]
                    if expected_n == next_feat:
                        if b in missing_blocks:
                            missing_blocks.remove(b)
                        block_mapped.append(b)
                        for i in b:
                            mapping[i] = featname
                        # Unlike the other branches the feature starts at
                        # b[0], not b[0] - 1
                        found_feats[featname] = _feature(featname,
                                                         b[0], b[-1])
                    else:
                        _mark_missing(b)
                else:
                    _mark_missing(b)

        # If it failed to map all features when only looking
        # at the exons, then try again and look at all features
        if exon_only and not rerun and missing_blocks:
            if self.verbose:
                self.logger.info("Rerunning seqsearch to look at all features")
            return self._resolve_unmapped(missing_blocks,
                                          feat_missing,
                                          ambig_map,
                                          mapping,
                                          found_feats,
                                          loc,
                                          covered,
                                          rerun=True)

        return found_feats, missing_blocks, mapping