def __init__(self, url="http://feature.nmdp-bioinformatics.org",
             loci=None,
             load_features=False, store_features=False,
             cached_features=None, verbose=False,
             pid="NA", verbosity=0):
    """
    Set up the feature-service client and the per-locus feature cache.

    :param url: Host handed to ``ApiClient``.
    :type url: ``str``
    :param loci: Loci to track. When ``None`` (the default) a fresh copy of
        the built-in HLA/KIR locus list is used.
    :type loci: ``List``
    :param load_features: When true and no ``cached_features`` are given,
        ``self.load_features()`` is called at the end of construction.
    :type load_features: ``bool``
    :param store_features: Stored on the instance as ``store_features``.
    :type store_features: ``bool``
    :param cached_features: When truthy, replaces the empty per-locus
        feature dictionary.
    :type cached_features: ``Dict``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param pid: Identifier embedded in the log-line prefix.
    :type pid: ``str``
    :param verbosity: Numerical verbosity level, stored on the instance.
    :type verbosity: ``int``
    """
    # A list literal in the signature would be created once and shared by
    # every instance (it is stored on ``self.loci``), so build it per call.
    if loci is None:
        loci = ['KIR2DP1', 'KIR2DL5A', 'KIR2DS4', 'HLA-DPA1', 'HLA-DQA1',
                'HLA-DPB1', 'KIR2DS2', 'KIR3DP1', 'HLA-DRB4', 'KIR2DL1',
                'KIR2DS5', 'HLA-DRB3', 'KIR2DS3', 'KIR3DL1', 'HLA-A',
                'HLA-DRB5', 'KIR2DL4', 'HLA-DQB1', 'KIR3DL2', 'HLA-B',
                'KIR3DS1', 'KIR2DL5B', 'HLA-DRB1', 'KIR3DL3', 'KIR2DS1',
                'HLA-C']

    self.loci = loci
    self.verbose = verbose
    self.verbosity = verbosity
    self.store_features = store_features
    self.logger = logging.getLogger("Logger." + __name__)
    self.logname = "ID {:<10} - ".format(str(pid))

    client = ApiClient(host=url)
    self.api = FeaturesApi(api_client=client)

    # One (initially empty) feature dictionary per locus
    self.all_feats = {loc: {} for loc in loci}
    self.structures = get_structures()
    self.struct_order = get_structorder()

    if cached_features:
        if verbose:
            self.logger.info(self.logname + "Using cached features")
        self.all_feats = cached_features

    # Load all features from the feature service, unless a cache was given
    if load_features and not cached_features:
        if verbose:
            self.logger.info(self.logname + "Loading features...")
        # load_features() loads the features at each locus
        self.load_features()
def __init__(self, server: BioSeqDatabase = None,
             datafile: str = None,
             dbversion: str = '3310',
             alleles: List = None,
             seqdata: Dict = None,
             hladata: Dict = None,
             featuredata=None,
             kir: bool = False,
             alignments: bool = False,
             verbose: bool = False,
             verbosity: int = 0):
    """
    ReferenceData - a model defined in Swagger

    :param server: The server of this ReferenceData. Its truthiness
        decides whether the dat file is loaded locally.
    :type server: BioSeqDatabase
    :param datafile: The datafile of this ReferenceData.
    :type datafile: str
    :param dbversion: The dbversion of this ReferenceData; used to build
        the download URL and the names of the local data files.
    :type dbversion: str
    :param alleles: Allele names to use instead of reading the allele
        list file.
    :type alleles: List
    :param seqdata: Preloaded sequence -> name mapping. Only used when
        ``hladata`` is given as well.
    :type seqdata: Dict
    :param hladata: Preloaded name -> record mapping. Only used when
        ``seqdata`` is given as well.
    :type hladata: Dict
    :param featuredata: Preloaded feature lengths to use instead of
        reading the feature-length CSV file.
    :param kir: Use the KIR data files instead of the HLA ones.
    :type kir: bool
    :param alignments: Load the pickled alignments for ``dbversion``.
    :type alignments: bool
    :param verbose: Flag for running in verbose mode.
    :type verbose: bool
    :param verbosity: Numerical value to indicate how verbose the output
        will be in verbose mode.
    :type verbosity: int
    """
    self.data_types = {
        'server': BioSeqDatabase,
        'datafile': str,
        'dbversion': str,
        'hla_names': List[str],
        'feature_lengths': Dict,
        'hlaref': Dict,
        'seqref': Dict,
        'structure_max': Dict,
        'struct_order': Dict,
        'structures': Dict,
        'blastdb': str,
        'server_avail': bool,
        'verbose': bool,
        'verbosity': int,
        'alignments': bool
    }

    self.attribute_map = {
        'seqdata': 'seqdata',
        'hlaref': 'hlaref',
        'seqref': 'seqref',
        'server': 'server',
        'datafile': 'datafile',
        'dbversion': 'dbversion',
        'hla_names': 'hla_names',
        'structure_max': 'structure_max',
        'feature_lengths': 'feature_lengths',
        'struct_order': 'struct_order',
        'structures': 'structures',
        'blastdb': 'blastdb',
        'hla_loci': 'hla_loci',
        'server_avail': 'server_avail',
        'kir': 'kir',
        'alignments': 'alignments',
        'verbose': 'verbose',
        'verbosity': 'verbosity'
    }

    self._seqref = {}
    self._hlaref = {}
    self._kir = kir
    self._verbose = verbose
    self._verbosity = verbosity
    self._dbversion = dbversion
    self._server = server
    self._datafile = datafile
    self._alignments = alignments
    self._server_avail = True if server else False
    self.logger = logging.getLogger("Logger." + __name__)

    hla_url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/' \
        + dbversion + '/hla.dat'
    kir_url = 'ftp://ftp.ebi.ac.uk/pub/databases/ipd/kir/KIR.dat'

    hla_loci = [
        'HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1', 'HLA-DPB1',
        'HLA-DQA1', 'HLA-DPA1', 'HLA-DRB3', 'HLA-DRB4', 'HLA-DRB5'
    ]

    if self.verbose and verbosity > 0:
        self.logger.info("IPD-IMGT/HLA release = " + str(dbversion))
        self.logger.info("HLA URL = " + hla_url)
        self.logger.info("KIR URL = " + kir_url)
        if self.server_avail:
            self.logger.info("Using BioSQL Server")
            # NOTE(review): this statement was garbled in the reviewed
            # text; it is restored on the assumption that a module-level
            # `biosqluser` exists next to biosqlhost / biosqldb /
            # biosqlport - confirm before merging.
            self.logger.info("BIOSQLUSER = " + biosqluser)
            self.logger.info("BIOSQLHOST = " + biosqlhost)
            self.logger.info("BIOSQLDB = " + biosqldb)
            self.logger.info("BIOSQLPORT = " + str(biosqlport))

    # TODO: ** Have script seqann --setup (--latest|--release|--all)
    #       - downloads and creates all files
    #       - removes all data files except alignment files
    #       - Creates blast db
    #
    # TODO: Download! Don't have in package!
    hla_names = []
    data_dir = os.path.dirname(__file__)
    if kir:
        blastdb = data_dir + '/../data/blast/KIR'
        allele_list = data_dir + '/../data/allele_lists/Allelelist.' \
            + 'KIR.txt'
    else:
        blastdb = data_dir + '/../data/blast/' + dbversion
        allele_list = data_dir + '/../data/allele_lists/Allelelist.' \
            + dbversion + '.txt'

    if alleles:
        self._hla_names = alleles
    else:
        # Read "<accession> <name>" pairs from the allele list file.
        # An OSError is logged and leaves the name list empty; anything
        # else is logged and re-raised.
        try:
            with open(allele_list, 'r') as f:
                for line in f:
                    line = line.rstrip()
                    accession, name = line.split(" ")
                    if not kir:
                        hla_names.append("HLA-" + name)
                    else:
                        hla_names.append(name)
            if self.verbose and verbosity > 0:
                self.logger.info("Loaded " + str(len(hla_names))
                                 + " allele names")
        except OSError as err:
            self.logger.error("OS error: {0}".format(err))
        except Exception:
            # The exception type needs a %s placeholder; without one the
            # logging module reports a formatting error instead.
            self.logger.error("Unexpected error: %s", sys.exc_info()[0])
            raise

        self._hla_names = hla_names

    feature_lengths = {}
    columns = ['mean', 'std', 'min', 'max']
    if kir:
        featurelength_file = data_dir + "/../data/kir-feature_lengths.csv"
    else:
        featurelength_file = data_dir + "/../data/feature_lengths.csv"

    if featuredata:
        self._feature_lengths = featuredata
    else:
        # TODO: use pandas
        # Builds {locus: {feature: [mean, std, min, max]}} from the CSV
        try:
            with open(featurelength_file, newline='') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    ldata = [row[c] for c in columns]
                    feature_lengths.setdefault(
                        row['locus'], {})[row['feature']] = ldata
        except OSError as err:
            self.logger.error("OS error: {0}".format(err))
        except Exception:
            self.logger.error("Unexpected error: %s", sys.exc_info()[0])
            raise

        self._feature_lengths = feature_lengths

    self._blastdb = blastdb
    self._hla_loci = hla_loci
    self._structures = get_structures()
    self._struct_order = get_structorder()
    self._structure_max = {
        'KIR2DP1': 20, 'KIR2DL5A': 20, 'KIR2DS4': 20,
        'HLA-DPA1': 9, 'HLA-DQA1': 9, 'KIR2DL2': 20,
        'HLA-DPB1': 11, 'KIR2DS2': 20, 'KIR3DP1': 20,
        'HLA-DRB4': 13, 'KIR2DL1': 20, 'KIR2DS5': 20,
        'HLA-DRB3': 13, 'KIR2DS3': 20, 'KIR3DL1': 20,
        'HLA-A': 17, 'HLA-DRB5': 13, 'KIR2DL4': 20,
        'HLA-DQB1': 13, 'KIR3DL2': 20, 'HLA-B': 15,
        'KIR3DS1': 20, 'KIR2DL5B': 20, 'HLA-DRB1': 13,
        'KIR3DL3': 20, 'KIR2DS1': 20, 'HLA-C': 17
    }

    # Starting location of sequence for IPD-IMGT/HLA alignments
    self.location = {
        "HLA-A": -300, "HLA-B": -284, "HLA-C": -283,
        "HLA-DRB1": -599, "HLA-DRB3": -327, "HLA-DRB4": -313,
        "HLA-DQB1": -525, "HLA-DPB1": -366, "HLA-DPA1": -523,
        "HLA-DQA1": -746
    }

    self.align_coordinates = {}
    self.annoated_alignments = {}
    if alignments:
        # SECURITY NOTE: pickle.load executes arbitrary code from the
        # file; these files must only ever come from the package's own
        # data directory.
        pickle_dir = data_dir + '/../data/alignments/' + dbversion
        pickle_files = glob.glob(pickle_dir + '/*.pickle')
        for pickle_file in pickle_files:
            # "<locus>_....pickle" -> "<locus>"; basename() also copes
            # with platform-specific separators returned by glob.
            locus = os.path.basename(
                pickle_file).split(".")[0].split("_")[0]
            if self.verbose:
                self.logger.info("Loading " + pickle_file)
            with open(pickle_file, 'rb') as handle:
                self.annoated_alignments.update(
                    {locus: pickle.load(handle)})

            # The first allele of the alignment is used to derive the
            # position -> feature lookup for the whole locus.
            allele = list(self.annoated_alignments[locus].keys())[0]
            if locus not in self.align_coordinates \
                    and "HLA-" + locus in self.struct_order:
                start = 0
                feat_order = list(self.struct_order["HLA-" + locus].keys())
                feat_order.sort()
                self.align_coordinates.update({locus: {}})
                if self.verbose and self.verbosity > 2:
                    self.logger.info("* Alignment coordinates *")
                for i in feat_order:
                    feat = self.struct_order["HLA-" + locus][i]
                    seq = self.annoated_alignments[locus][allele][feat][
                        'Seq']
                    end = start + len(seq)
                    if self.verbose and self.verbosity > 2:
                        self.logger.info(feat + " start = " + str(start)
                                         + " | end = " + str(end))
                    for j in range(start, end):
                        self.align_coordinates[locus].update({j: feat})
                    start = end

    # If no server is provided
    # download the dat file
    if seqdata and hladata:
        self._hlaref = hladata
        self._seqref = seqdata
    elif not self._server_avail:
        if kir:
            datfile = data_dir + '/../data/KIR.dat'
        else:
            datfile = data_dir + '/../data/' + dbversion + '.hla.dat'

        # Each message names the data set that is actually downloaded
        # (the two messages used to be swapped).
        if not os.path.isfile(datfile) and not kir:
            if self.verbose:
                self.logger.info("Downloding HLA data file - " + datfile)
            download_dat(hla_url, datfile)
        elif not os.path.isfile(datfile) and kir:
            if self.verbose:
                self.logger.info("Downloding KIR data file - " + datfile)
            download_dat(kir_url, datfile)

        # Load HLA dat file
        # NOTE(review): the cache file names depend only on dbversion,
        # so a KIR run and an HLA run with the same dbversion share the
        # same pickle files - confirm this is intended.
        seqref_pickle = data_dir \
            + '/../data/seqref.' + dbversion + ".pickle"
        hlaref_pickle = data_dir \
            + '/../data/hlaref.' + dbversion + ".pickle"
        if not os.path.isfile(seqref_pickle) or \
                not os.path.isfile(hlaref_pickle):
            hladata = SeqIO.parse(datfile, "imgt")
            for seqrec in hladata:
                seqname = seqrec.description.split(",")[0]
                locus = seqname.split("*")[0]
                if locus in self.structure_max:
                    self._hlaref.update({seqname: seqrec})
                    self._seqref.update({str(seqrec.seq): seqname})

            if self.verbose:
                self.logger.info("Finished loading dat file")
                self.logger.info("Writing pickle of dat file")

            with open(seqref_pickle, 'wb') as handle:
                pickle.dump(self._seqref, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

            with open(hlaref_pickle, 'wb') as handle:
                pickle.dump(self._hlaref, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
        else:
            if self.verbose:
                self.logger.info("Loading pickle dat file")
            with open(seqref_pickle, 'rb') as handle:
                self._seqref = pickle.load(handle)

            with open(hlaref_pickle, 'rb') as handle:
                self._hlaref = pickle.load(handle)
def resolve_feats(feat_list, seqin, seqref, start, locus, missing,
                  verbose=False, verbosity=0):
    """
    resolve_feats - Resolves features from alignments

    :param feat_list: List of the found features. Exactly one feature
        dictionary is expected; any other length yields an incomplete
        annotation.
    :type feat_list: ``List``
    :param seqin: The input sequence (its items are joined into one string)
    :type seqin: ``str``
    :param seqref: Not used by this function; kept for interface
        compatibility.
    :param start: Where the sequence start in the alignment
    :type start: ``int``
    :param locus: The input locus
    :type locus: ``str``
    :param missing: List of the unmapped features
    :type missing: ``List``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output
        will be in verbose mode.
    :type verbosity: ``int``

    :rtype: :ref:`ann`
    """
    structures = get_structures()
    logger = logging.getLogger("Logger." + __name__)
    seq = SeqRecord(seq=Seq("".join(seqin), SingleLetterAlphabet()))

    # Every position starts out unmapped (value 1); resolved positions are
    # removed from `coordinates` and labelled in `mapping`.
    positions = range(0, len(seq.seq) + 1)
    coordinates = dict.fromkeys(positions, 1)
    mapping = dict.fromkeys(positions, 1)

    # An empty list used to fall through to feat_list[0] and raise
    # IndexError; it is now reported like the multi-entry case.
    if len(feat_list) != 1:
        if verbose:
            logger.error("resolve_feats error")
        return Annotation(complete_annotation=False)

    full_annotation = {}
    # NOTE: this is the caller's dictionary; resolved features are written
    # back into it below.
    features = feat_list[0]

    # Walk the features in structural order so that `diff` (the number of
    # gap characters removed so far) accumulates left to right.
    feature_list = sorted(features.keys(),
                          key=lambda f: structures[locus][f])

    diff = 0
    diff_f = True
    for feat in feature_list:
        f = features[feat]
        seqrec = f.extract(seq)

        # Strip alignment gaps and remember how many were removed
        raw = str(seqrec.seq)
        if "-" in raw:
            newseq = raw.replace("-", "")
            seqrec.seq = Seq(newseq, IUPAC.unambiguous_dna)
            diff += len(raw) - len(newseq)

        # Only features listed as missing are re-annotated, and only when
        # something is left after removing the gaps.
        if feat not in missing or not seqrec.seq:
            continue

        # The first resolved feature seen after gaps were removed keeps
        # its unshifted start; later ones are shifted by `diff`.
        if diff_f and diff > 0:
            sp = f.location.start + start
            diff_f = False
        else:
            sp = f.location.start + start - diff

        ep = f.location.end + start - diff
        featn = SeqFeature(FeatureLocation(ExactPosition(sp),
                                           ExactPosition(ep),
                                           strand=1), type=f.type)

        features.update({feat: featn})
        full_annotation.update({feat: seqrec})

        for i in range(featn.location.start, featn.location.end):
            if i in coordinates:
                del coordinates[i]
                mapping[i] = feat

    blocks = getblocks(coordinates)
    rmapping = {k + start: mapping[k] for k in mapping}

    # Print out what features were resolved
    # NOTE(review): "Failed to resolve" is also logged when features were
    # resolved but verbosity is 0 or only one feature was found; this is
    # the original behaviour and is left unchanged.
    if verbose and verbosity > 0 and len(full_annotation) > 1:
        logger.info("Features resolved:")
        for f in full_annotation:
            logger.info(f)
    elif verbose:
        logger.info("Failed to resolve")

    if not full_annotation:
        if verbose:
            logger.info("Failed to align missing features")
        return Annotation(complete_annotation=False)

    return Annotation(annotation=full_annotation,
                      method="clustalo",
                      features=features,
                      mapping=rmapping,
                      blocks=blocks,
                      seq=seq)
def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
    """
    search_seqs - method for annotating a BioPython sequence without alignment

    Each feature sequence of the reference record is searched for in the
    input sequence with ``nt_search``. Single hits are recorded as found
    features, multiple hits as ambiguous features and no hits as missing
    features. The stretches of the input that stay unmapped are then
    handed to ``self._resolve_unmapped``.

    :param seqrec: The reference sequence
    :type seqrec: SeqRecord
    :param in_seq: The input sequence (``in_seq.seq`` is read)
    :type in_seq: SeqRecord
    :param locus: The gene locus associated with the sequence.
    :type locus: str
    :param run: The number of runs that have been done
    :type run: int
    :param partial_ann: A partial annotation from a previous step
    :type partial_ann: :ref:`ann`

    :rtype: :ref:`ann`

    Example usage (illustrative; ``sqsrch`` is an instance of the
    enclosing class and ``refseq`` a reference record obtained
    elsewhere):

        >>> ann = sqsrch.search_seqs(refseq, inseq, 'HLA-A')

    """
    # Extract out the sequences and feature names
    # from the reference sequences

    # The mapped features will be subtracted from seq_covered
    # so the final seq_covered number will reflect the remaining
    # number of base pairs that haven't been mapped.
    #
    # The coordinates and mapping will help determine what positions
    # in the sequence have been mapped and to what features. The
    # missing blocks variable will be generated using these.
    structures = get_structures()
    seq_covered = len(in_seq.seq)
    # Both dictionaries start with every position 0..len(seq) set to 1,
    # the value used throughout for "not mapped yet".
    coordinates = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(in_seq.seq) + 1)]))
    mapping = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(in_seq.seq) + 1)]))

    ambig_map = {}
    found_feats = {}
    feat_missing = {}

    method = "nt_search" if not partial_ann else partial_ann.method

    # If the partial annotation is provided
    # then make the found_feats equal to
    # what has already been annotated
    feats = get_features(seqrec)
    if partial_ann:
        # NOTE(review): this binds the object returned by
        # partial_ann.features; if that is the stored dictionary, the
        # updates below also modify the partial annotation - confirm.
        found_feats = partial_ann.features

        if self.verbose and self.verbosity > 4:
            self.logger.info("Found partial features:")
            for f in found_feats:
                self.logger.info(f)

        # Skip references that only have features
        # that have already been annotated
        if len([f for f in feats if f in found_feats]) == len(feats):
            if self.verbose:
                self.logger.info("Skipping incomplete refseq")
            return partial_ann

        if self.verbose and self.verbosity > 1:
            self.logger.info("Using partial annotation | "
                             + locus + " "
                             + str(len(partial_ann.features)))

        # Only the positions inside the partial annotation's blocks
        # are still considered unmapped.
        coordinates = dict(
            map(lambda l: [l, 1], [
                item for sublist in partial_ann.blocks for item in sublist
            ]))
        seq_covered = partial_ann.covered
        mapping = partial_ann.mapping

        if self.verbose and self.verbosity > 2:
            self.logger.info("Partial sequence coverage = "
                             + str(seq_covered))
            self.logger.info("Partial sequence metho = " + method)

    # exon_8 / three_prime_UTR matches are remembered separately so that
    # they can be undone ("unmapped") after _resolve_unmapped has run.
    added_feat = {}
    deleted_coords = {}
    for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

        # skip if partial annotation is provided
        # and the feat name is not one of the
        # missing features
        if partial_ann and feat_name not in partial_ann.refmissing:
            if self.verbose and self.verbosity > 1:
                self.logger.info("Skipping " + feat_name
                                 + " - Already annotated")
            continue

        if self.verbose and self.verbosity > 1:
            self.logger.info("Running seqsearch for " + feat_name)

        # Search for the reference feature sequence in the
        # input sequence. Record the coordinates if it's
        # found and if it's found in multiple spots. If it
        # is not found, then record that feature as missing.
        #
        # The code below treats element 0 of the result as a header and
        # the remaining elements as hit positions, so a length of 2
        # means exactly one hit.
        seq_search = nt_search(str(in_seq.seq), str(feats[feat_name]))

        if len(seq_search) == 2:

            if self.verbose and self.verbosity > 0:
                self.logger.info("Found exact match for " + feat_name)

            seq_covered -= len(str(feats[feat_name]))
            end = int(len(str(feats[feat_name])) + seq_search[1])

            # A matched three_prime_UTR is extended to the end of the
            # input sequence.
            if feat_name == 'three_prime_UTR' \
                    and len(str(in_seq.seq)) > end:
                end = len(str(in_seq.seq))

            # If the feature is found and it's a five_prime_UTR then
            # the start should always be 0, so insertions at the
            # beginning of the sequence will be found.
            start = seq_search[1] if feat_name != 'five_prime_UTR' else 0
            si = seq_search[1]+1 if seq_search[1] != 0 and \
                feat_name != 'five_prime_UTR' else 0

            # check if this features has already been mapped
            # (the set contains 1 when any position in si..end is no
            # longer in coordinates)
            mapcheck = set(
                [0 if i in coordinates else 1 for i in range(si, end + 1)])

            # Dont map features if they are out of order
            skip = False
            if found_feats and len(found_feats) > 0:
                for f in found_feats:
                    o1 = structures[locus][feat_name]
                    o2 = structures[locus][f]
                    loctyp = loctype(found_feats[f].location.start,
                                     found_feats[f].location.end,
                                     start, end)

                    if o1 < o2 and loctyp:
                        skip = True
                        if self.verbose:
                            self.logger.info("Skipping map for "
                                             + feat_name)
                    elif o2 < o1 and not loctyp:
                        skip = True
                        if self.verbose:
                            self.logger.info("Skipping map for "
                                             + feat_name)

            if 1 not in mapcheck and not skip:
                for i in range(si, end + 1):
                    if i in coordinates:
                        if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                            deleted_coords.update({i: coordinates[i]})
                        del coordinates[i]
                    else:
                        if self.verbose:
                            self.logger.error(
                                "seqsearch - should't be here "
                                + locus + " - " + " - " + feat_name)
                    mapping[i] = feat_name

                found_feats.update({
                    feat_name:
                    SeqFeature(FeatureLocation(ExactPosition(start),
                                               ExactPosition(end),
                                               strand=1),
                               type=feat_name)
                })

                if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                    added_feat.update({feat_name: feats[feat_name]})

                if self.verbose and self.verbosity > 3:
                    self.logger.info("Coordinates | Start = "
                                     + str(start) + " - End = "
                                     + str(end))

        elif (len(seq_search) > 2):

            if self.verbose and self.verbosity > 1:
                self.logger.info("Found " + str(len(seq_search))
                                 + " matches for " + feat_name)

            # Keep only the hits whose position (or the position right
            # after it) is still unmapped.
            new_seq = [seq_search[0]]
            for i in range(1, len(seq_search)):
                tnp = seq_search[i] + 1
                if seq_search[i] in coordinates or tnp in coordinates:
                    new_seq.append(seq_search[i])

            seq_search = new_seq
            if (partial_ann and feat_name == "exon_8" and run > 0):
                missing_feats = sorted(list(partial_ann.missing.keys()))

                # * HARD CODED LOGIC * #
                # > exon8 in class I maps to multiple spots in a sequence,
                #   often in the 3' UTR. These features need to be mapped
                #   last to make sure it's not mapping exon8 incorrectly.
                if (missing_feats == ['exon_8', 'three_prime_UTR']
                        and len(seq_search) <= 3):
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Resolving exon_8")

                    seq_covered -= len(str(feats[feat_name]))
                    end = int(len(str(feats[feat_name])) + seq_search[1])

                    # The first remaining hit is used as the start of
                    # the feature.
                    start = seq_search[1]
                    si = seq_search[1] + 1 if seq_search[1] != 0 else 0

                    # check if this features has already been mapped
                    # NOTE(review): mapcheck is computed here but not
                    # read again in this branch.
                    mapcheck = set([
                        0 if i in coordinates else 1
                        for i in range(si, end + 1)
                    ])

                    for i in range(si, end + 1):
                        if i in coordinates:
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here "
                                    + locus + " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update({
                        feat_name:
                        SeqFeature(FeatureLocation(ExactPosition(start),
                                                   ExactPosition(end),
                                                   strand=1),
                                   type=feat_name)
                    })

                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Coordinates | Start = "
                                         + str(start) + " - End = "
                                         + str(end))
                else:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature "
                                         + feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update(
                        {feat_name: seq_search[1:len(seq_search)]})
            else:
                if self.verbose and self.verbosity > 0:
                    self.logger.info("Adding ambig feature " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})
                ambig_map.update(
                    {feat_name: seq_search[1:len(seq_search)]})
        else:
            if self.verbose and self.verbosity > 1:
                self.logger.info("No match for " + feat_name)
            feat_missing.update({feat_name: feats[feat_name]})

    # Contiguous runs of positions that are still unmapped
    blocks = getblocks(coordinates)
    exact_matches = list(found_feats.keys())

    # * HARD CODED LOGIC * #
    # >
    #
    #   HLA-DRB1 exon3 exact match - with intron1 and 3 missing
    if ('exon_3' in exact_matches and run == 99
            and locus == 'HLA-DRB1' and 'exon_2' in feat_missing
            and (len(blocks) == 1 or len(blocks) == 2)):

        for b in blocks:
            x = b[len(b) - 1]
            # The block that ends at the last mapped position becomes
            # intron_3, any other block becomes exon_2.
            if x == max(list(mapping.keys())):
                featname = "intron_3"
                found_feats.update({
                    featname:
                    SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                               ExactPosition(b[len(b) - 1]),
                                               strand=1),
                               type=featname)
                })
            else:
                featname = "exon_2"
                found_feats.update({
                    featname:
                    SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                               ExactPosition(b[len(b) - 1]),
                                               strand=1),
                               type=featname)
                })
            seq_covered -= len(b)

        if self.verbose and self.verbosity > 1:
            self.logger.info(
                "Successfully annotated class DRB1 II sequence")

        return Annotation(features=found_feats,
                          covered=seq_covered,
                          seq=in_seq,
                          missing=feat_missing,
                          ambig=ambig_map,
                          method=method,
                          mapping=mapping,
                          exact_match=exact_matches)

    # If it's a class II sequence and
    # exon_2 is an exact match
    # * HARD CODED LOGIC * #
    # > It's common for exon2 to be fully sequenced
    #   but intron_2 and intron_1 to be partially sequenced,
    #   which can make it hard to annotate those to features.
    #   If there are two missing blocks that is small enough
    #   and they are before and after exon2, then it's very
    #   very likely to be intron_2 and intron_1.
    if 'exon_2' in exact_matches and len(blocks) == 2 \
            and is_classII(locus) and seq_covered < 300:

        if self.verbose and self.verbosity > 1:
            self.logger.info("Running search for class II sequence")

        # r stays True only if each block is directly adjacent to a
        # position mapped to exon_2.
        r = True
        for b in blocks:
            x = b[len(b) - 1]
            if x == max(list(mapping.keys())):
                x = b[0] - 1
            else:
                x += 1
            f = mapping[x]
            if f != 'exon_2':
                r = False
        if r:
            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    featname = "intron_2"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                   ExactPosition(b[len(b) - 1]),
                                                   strand=1),
                                   type=featname)
                    })
                else:
                    featname = "intron_1"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                   ExactPosition(b[len(b) - 1]),
                                                   strand=1),
                                   type=featname)
                    })
                seq_covered -= len(b)

            if self.verbose and self.verbosity > 1:
                self.logger.info(
                    "Successfully annotated class II sequence")

            return Annotation(features=found_feats,
                              covered=seq_covered,
                              seq=in_seq,
                              missing=feat_missing,
                              ambig=ambig_map,
                              method=method,
                              mapping=mapping,
                              exact_match=exact_matches)

    # Try to assign the remaining blocks to the ambiguous / missing
    # features based on their neighbouring features.
    annotated_feats, mb, mapping = self._resolve_unmapped(
        blocks, feat_missing, ambig_map, mapping, found_feats, locus,
        seq_covered)

    # * HARD CODED LOGIC * #
    # Blocks remain although no feature is missing or ambiguous:
    # report those blocks as the missing blocks.
    if (not mb and blocks and len(feat_missing.keys()) == 0
            and len(ambig_map.keys()) == 0):
        mb = blocks

    if mb:

        # Unmap exon 8
        if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                and 'exon_8' in exact_matches:
            # Put the positions recorded in deleted_coords back into
            # the unmapped state and mark the features as missing again.
            for i in deleted_coords:
                mapping[i] = 1
            coordinates.update(deleted_coords)
            mb = getblocks(coordinates)
            feat_missing.update(added_feat)

            # Delete from found features
            del exact_matches[exact_matches.index('exon_8')]
            del found_feats['exon_8']

            if 'exon_8' in annotated_feats:
                del annotated_feats['exon_8']
            if 'three_prime_UTR' in found_feats:
                del found_feats['three_prime_UTR']
            if 'three_prime_UTR' in annotated_feats:
                del annotated_feats['three_prime_UTR']

        # Features of the locus structure that are not annotated
        refmissing = [
            f for f in structures[locus] if f not in annotated_feats
        ]

        if self.verbose and self.verbosity > 1:
            self.logger.info("* Annotation not complete *")

        # Print out what features were missing by the ref
        if self.verbose and self.verbosity > 2:
            self.logger.info("Refseq was missing these features = "
                             + ",".join(list(refmissing)))

        # Print out what features were ambig matches
        if self.verbose and self.verbosity > 1 and len(ambig_map) > 1:
            self.logger.info("Features with ambig matches = "
                             + ",".join(list(ambig_map)))

        # Print out what features were exact matches
        if self.verbose and self.verbosity > 2 and len(exact_matches) > 1:
            self.logger.info("Features exact matches = "
                             + ",".join(list(exact_matches)))

        # Print out what features have been annotated
        if self.verbose and self.verbosity > 1 and len(
                annotated_feats) > 1:
            self.logger.info("Features annotated = "
                             + ",".join(list(annotated_feats)))

        # Print out what features are missing
        if self.verbose and self.verbosity > 1 and len(feat_missing) > 1:
            self.logger.info("Features missing = "
                             + ",".join(list(feat_missing)))

        annotation = Annotation(features=annotated_feats,
                                covered=seq_covered,
                                seq=in_seq,
                                missing=feat_missing,
                                ambig=ambig_map,
                                blocks=mb,
                                method=method,
                                refmissing=refmissing,
                                mapping=mapping,
                                exact_match=exact_matches,
                                annotation=None)
    else:

        mb = None
        # Unmap exon 8
        if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                and 'exon_8' in exact_matches \
                and 'three_prime_UTR' in annotated_feats\
                and 'three_prime_UTR' not in exact_matches:

            for i in deleted_coords:
                mapping[i] = 1

            coordinates.update(deleted_coords)
            mb = getblocks(coordinates)
            feat_missing.update(added_feat)
            del exact_matches[exact_matches.index('exon_8')]
            del found_feats['exon_8']
            if 'exon_8' in annotated_feats:
                del annotated_feats['exon_8']
            if 'three_prime_UTR' in found_feats:
                del found_feats['three_prime_UTR']
            if 'three_prime_UTR' in annotated_feats:
                del annotated_feats['three_prime_UTR']

        if self.verbose:
            self.logger.info("* No missing blocks after seq_search *")

        # Print out what features were ambig matches
        if self.verbose and self.verbosity > 0 and len(ambig_map) > 1:
            self.logger.info("Features with ambig matches = "
                             + ",".join(list(ambig_map)))

        # Print out what features were exact matches
        if self.verbose and self.verbosity > 0 and len(exact_matches) > 1:
            self.logger.info("Features exact matches = "
                             + ",".join(list(exact_matches)))

        # Print out what features have been annotated
        if self.verbose and self.verbosity > 0 and len(
                annotated_feats) > 1:
            self.logger.info("Features annotated = "
                             + ",".join(list(annotated_feats)))

        # Print out what features are missing
        if self.verbose and self.verbosity > 0 and len(feat_missing) > 1:
            self.logger.info("Features missing = "
                             + ",".join(list(feat_missing)))

        annotation = Annotation(features=annotated_feats,
                                covered=seq_covered,
                                seq=in_seq,
                                missing=feat_missing,
                                ambig=ambig_map,
                                method=method,
                                blocks=mb,
                                mapping=mapping,
                                exact_match=exact_matches,
                                annotation=None)

    return annotation
def _resolve_unmapped(self, blocks, feat_missing, ambig_map, mapping,
                      found_feats, loc, covered, rerun=False):
    """
    Try to assign the unmapped blocks to ambiguous or missing features.

    A block is assigned to a feature when the features mapped directly
    before and/or after the block are the ones expected from the
    structural order of the locus.

    :param blocks: Runs of consecutive unmapped positions
    :type blocks: ``List``
    :param feat_missing: Features that were not found (keyed by name)
    :type feat_missing: ``Dict``
    :param ambig_map: Feature name -> candidate start positions
    :type ambig_map: ``Dict``
    :param mapping: Position -> feature name (1 for unmapped positions);
        updated in place for blocks assigned to missing features
    :type mapping: ``Dict``
    :param found_feats: Features found so far; updated in place
    :type found_feats: ``Dict``
    :param loc: The gene locus
    :type loc: ``str``
    :param covered: Number of bases not yet mapped
    :type covered: ``int``
    :param rerun: True on the second pass over exon-only data
    :type rerun: ``bool``

    :return: ``(found_feats, missing_blocks, mapping)``
    """
    structures = get_structures()
    struct_order = get_structorder()
    structure_max = get_structmax()

    def block_feature(first, last, name):
        # SeqFeature of type `name` spanning first..last on strand 1
        return SeqFeature(FeatureLocation(ExactPosition(first),
                                          ExactPosition(last),
                                          strand=1),
                          type=name)

    exon_only = True
    found_exons = 0
    for f in found_feats:
        if re.search("intron", f) or re.search("UTR", f):
            exon_only = False
        if re.search("exon", f):
            found_exons += 1

    # Count the number of exons for the given loci
    num_exons = 0
    for f in structures[loc]:
        if re.search("exon", f):
            num_exons += 1

    # If all exons have been mapped
    # then it is not exon only data
    if found_exons == num_exons:
        exon_only = False

    # If it's exon only, then search two
    # features up rather than one
    add_num = 2 if exon_only and rerun and covered < 300 else 1

    block_mapped = []
    missing_blocks = []

    # Pass 1: features that matched at several positions
    for b in blocks:
        for featname in ambig_map.keys():
            locats = ambig_map[featname]
            start_i = b[0] - 1
            end_i = b[len(b) - 1] + 1

            # TODO: Catch ERROR
            feat_num = structures[loc][featname]

            # Ranks of the expected neighbouring features. Both used to
            # be computed as feat_num - add_num, which left the lookup
            # of the following feature unguarded and made the "no
            # previous feature" branch below unreachable.
            next_num = feat_num + add_num
            prev_num = feat_num - add_num

            if next_num <= structure_max[loc] \
                    and prev_num >= 0 and start_i >= 0 \
                    and end_i <= len(mapping) - 1 \
                    and next_num in struct_order[loc] \
                    and prev_num in struct_order[loc]:
                # Neighbours expected on both sides
                expected_p = struct_order[loc][prev_num]
                expected_n = struct_order[loc][next_num]
                previous_feat = mapping[start_i]
                next_feat = mapping[end_i]
                if expected_p == previous_feat \
                        and expected_n == next_feat \
                        and expected_p != 1 \
                        and b[0] - 1 in locats:
                    block_mapped.append(b)
                    found_feats.update({
                        featname:
                        block_feature(b[0] - 1, b[len(b) - 1], featname)
                    })
            elif next_num > structure_max[loc] \
                    and prev_num >= 0 and start_i >= 0 \
                    and end_i >= max(mapping) \
                    and prev_num in struct_order[loc]:
                # No following feature: only the previous one is checked
                expected_p = struct_order[loc][prev_num]
                previous_feat = mapping[start_i]
                if expected_p == previous_feat \
                        and expected_p != 1 \
                        and b[0] - 1 in locats:
                    block_mapped.append(b)
                    found_feats.update({
                        featname:
                        block_feature(b[0] - 1, b[len(b) - 1], featname)
                    })
            elif next_num <= structure_max[loc] \
                    and prev_num < 0 \
                    and next_num in struct_order[loc]:
                # No previous feature: only the following one is checked
                expected_n = struct_order[loc][next_num]
                if end_i not in mapping:
                    next_feat = mapping[end_i - 1]
                else:
                    next_feat = mapping[end_i]
                # This used to test expected_p, which is never assigned
                # in this branch.
                if expected_n == next_feat \
                        and expected_n != 1 \
                        and b[0] - 1 in locats:
                    block_mapped.append(b)
                    found_feats.update({
                        featname:
                        block_feature(b[0] - 1, b[len(b) - 1], featname)
                    })
            else:
                missing_blocks.append(b)

    # Pass 2: features that were not found at all
    for b in blocks:
        for featname in feat_missing.keys():
            # Ambiguous features were handled in pass 1
            if featname in ambig_map and b not in block_mapped:
                if b not in missing_blocks:
                    missing_blocks.append(b)
                continue

            if b not in block_mapped:
                start_i = b[0] - 1
                end_i = b[len(b) - 1] + 1
                feat_num = structures[loc][featname]

                if feat_num + add_num <= structure_max[loc] \
                        and feat_num - 1 >= 1 \
                        and end_i <= max(mapping.keys()) \
                        and start_i >= 0 \
                        and feat_num - add_num > 0:
                    # Neighbours expected on both sides
                    expected_p = struct_order[loc][feat_num - add_num]
                    expected_n = struct_order[loc][feat_num + add_num]
                    previous_feat = mapping[start_i]
                    next_feat = mapping[end_i]
                    if expected_p == previous_feat \
                            and expected_n == next_feat \
                            and expected_p != 1 \
                            and expected_n != 1:
                        if b in missing_blocks:
                            del missing_blocks[missing_blocks.index(b)]
                        for i in b:
                            mapping.update({i: featname})
                        block_mapped.append(b)
                        found_feats.update({
                            featname:
                            block_feature(b[0] - 1, b[len(b) - 1],
                                          featname)
                        })
                    else:
                        if b not in missing_blocks:
                            missing_blocks.append(b)
                elif feat_num + add_num > structure_max[loc] \
                        and feat_num - add_num >= 1 and start_i >= 0:
                    # No following feature
                    expected_p = struct_order[loc][feat_num - add_num]
                    previous_feat = mapping[start_i]
                    if expected_p == previous_feat \
                            and expected_p != 1:
                        if b in missing_blocks:
                            del missing_blocks[missing_blocks.index(b)]
                        block_mapped.append(b)
                        for i in b:
                            mapping.update({i: featname})
                        found_feats.update({
                            featname:
                            block_feature(b[0] - 1, b[len(b) - 1],
                                          featname)
                        })
                    else:
                        if b not in missing_blocks:
                            missing_blocks.append(b)
                elif (feat_num + add_num <= structure_max[loc]
                      and feat_num - add_num < 1
                      and end_i <= max(mapping)
                      and end_i in mapping):
                    # No previous feature
                    expected_n = struct_order[loc][feat_num + add_num]
                    next_feat = mapping[end_i]
                    if expected_n == next_feat:
                        if b in missing_blocks:
                            del missing_blocks[missing_blocks.index(b)]
                        block_mapped.append(b)
                        for i in b:
                            mapping.update({i: featname})
                        # Unlike the other branches the feature starts
                        # at b[0], not b[0] - 1.
                        found_feats.update({
                            featname:
                            block_feature(b[0], b[len(b) - 1], featname)
                        })
                    else:
                        if b not in missing_blocks:
                            missing_blocks.append(b)
                else:
                    if b not in missing_blocks:
                        missing_blocks.append(b)

    # If it failed to map all features when only looking
    # at the exons, then try again and look at all features
    if exon_only and not rerun and missing_blocks:
        if self.verbose:
            self.logger.info("Rerunning seqsearch to look at all features")
        return self._resolve_unmapped(missing_blocks,
                                      feat_missing,
                                      ambig_map,
                                      mapping,
                                      found_feats,
                                      loc, covered, rerun=True)
    else:
        return found_feats, missing_blocks, mapping