def refresh_predictions(self, limit: int = 2000,
                        batch_size: int = 1000) -> None:
    with sessionLock:
        samples: List[ClassificationSample] = list(
            ClassificationSample.query.find(
                dict(model=self.model_name)).sort([('seqHash', -1)
                                                   ]).limit(limit))
        seqs = [s.seq for s in samples]
    for i in range(0, len(seqs), batch_size):
        sample_probs = self.classify(seqs[i:i + batch_size])
        with sessionLock:
            # Use a separate index for the batch so the outer offset `i`
            # is not shadowed.
            for j, seq in enumerate(seqs[i:i + batch_size]):
                sample: ClassificationSample = (
                    ClassificationSample.query.get(model=self.model_name,
                                                   seqHash=hasher(seq)))
                if sample:
                    sample.predicted_labels = (
                        Classifier.quality_to_predicted_labels(
                            sample_probs[j]))
                else:
                    print('ERROR: lost sample')
            session.flush()
            # This is harsh, but it seems otherwise some cache builds up
            # inside ming and eventually OOM's the application...
            # Thankfully, due to sessionLock this should be safe.
            session.clear()
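# A minimal usage sketch, not part of the method above. It assumes the
# classifier instance is obtained via ClassifierCache.get as in add_samples
# further below; the limit/batch_size values are arbitrary.
c = ClassifierCache.get(app.config['BASE_CLASSIFIER_DIR'], 'upr-info_issues')
c.refresh_predictions(limit=200, batch_size=100)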
def get_embedding(self, seqs: List[str]) -> List[np.array]:
    if len(seqs) == 0:
        return []
    if len(seqs) > 5000:
        raise Exception(
            'You should never handle more than 5000 berts at the same time!')
    hashed_seq_to_indices: Dict[str, List[int]] = {}
    for i, seq in enumerate(seqs):
        if hasher(seq) not in hashed_seq_to_indices:
            hashed_seq_to_indices[hasher(seq)] = [i]
        else:
            hashed_seq_to_indices[hasher(seq)].append(i)
    result: List[np.array] = [None] * len(seqs)
    # fetch from cache
    with sessionLock:
        for entry in Embedding.query.find(
                dict(bert=self.bert,
                     seqHash={'$in': list(hashed_seq_to_indices.keys())}),
                projection=('seqHash', 'embedding')):
            for i in hashed_seq_to_indices[entry.seqHash]:
                result[i] = pickle.loads(entry.embedding)
    undone_seqs: List[str] = []
    for seq in seqs:
        if result[hashed_seq_to_indices[hasher(seq)][0]] is None:
            undone_seqs.append(seq)
    self.logger.debug(
        'Using %d of %d embedding matrices fetched from MongoDB.' %
        (len(seqs) - len(undone_seqs), len(seqs)))
    if len(undone_seqs) == 0:
        return result

    self.logger.info('Building %d embedding matrices with TensorFlow...' %
                     len(undone_seqs))
    done_seqs = self._build_embedding(undone_seqs)

    with sessionLock:
        for seq, matrix in zip(undone_seqs, done_seqs):
            seqHash = hasher(seq)
            for i in hashed_seq_to_indices[seqHash]:
                result[i] = matrix
            # Prevent duplicate key errors since another thread might
            # have added this embedding.
            if not Embedding.query.get(bert=self.bert, seqHash=seqHash):
                # Pickle the numpy matrix for storage in MongoDB.
                Embedding(bert=self.bert,
                          seq=seq,
                          seqHash=seqHash,
                          embedding=pickle.dumps(matrix))
        try:
            session.flush()
        except DuplicateKeyError:
            pass
        self.logger.info('Stored %d embedding matrices in MongoDB.' %
                         len(done_seqs))
    return result
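# Standalone illustration of the hash -> indices fan-out used above: duplicate
# inputs are looked up or computed once and the result is written back to every
# position they occupy. Plain strings stand in for hasher(seq) here.
from typing import Dict, List


def dedup_indices(items: List[str]) -> Dict[str, List[int]]:
    """Map each distinct item to every index it occupies in the input."""
    index: Dict[str, List[int]] = {}
    for i, item in enumerate(items):
        index.setdefault(item, []).append(i)
    return index


print(dedup_indices(['a', 'b', 'a']))  # {'a': [0, 2], 'b': [1]}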
def delete_samples() -> Any:
    # request.args: &model=upr-info_issues&seq=*
    args = request.args
    # Use .get() so a missing parameter triggers our error message instead of
    # a bare KeyError from the request MultiDict.
    if not args.get('model'):
        raise Exception('You need to pass &model=...')
    if not args.get('seq'):
        raise Exception('You need to pass &seq=...')
    with sessionLock:
        if args['seq'] == '*':
            ClassificationSample.query.remove({'model': args['model']})
        else:
            ClassificationSample.query.remove({
                'model': args['model'],
                'seqHash': hasher(args['seq'])
            })
        session.flush()
    return jsonify({})
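# Hedged client-side sketch: the route path and HTTP verb are set by the Flask
# decorator, which is not shown here, so both are assumptions; only the query
# parameters come from the handler above. seq='*' removes every sample for the
# model.
import requests

requests.post('http://localhost:5000/delete_samples',
              params={'model': 'upr-info_issues', 'seq': '*'})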
def importData(path: str, text_col: str, label_col: str,
               sharedId_col: str) -> None:
    with open(path, 'r') as csvFile, sessionLock:
        newly_created: int = 0
        updated: int = 0
        for row in csv.DictReader(csvFile):
            seq = row[text_col]
            seqHash = hasher(seq)
            training_labels: List[Dict[str, float]] = []
            if label_col != '':
                # The label column is expected to hold a Python-style list
                # literal, e.g. "['Murder', 'Justice']".
                training_label_list = eval(row[label_col])
                training_labels = [dict(topic=l) for l in training_label_list]
            sharedId = ''
            if sharedId_col != '':
                sharedId = row[sharedId_col]
            existing: ClassificationSample = ClassificationSample.query.get(
                model=FLAGS.model, seqHash=seqHash)
            if not existing:
                existing = ClassificationSample(
                    model=FLAGS.model,
                    seq=seq,
                    seqHash=seqHash,
                    training_labels=training_labels,
                    sharedId=sharedId)
                newly_created += 1
            else:
                if label_col != '':
                    existing.training_labels = training_labels
                if sharedId_col != '':
                    existing.sharedId = sharedId
                if label_col != '' or sharedId_col != '':
                    updated += 1
            existing.use_for_training = len(training_labels) > 0
        print('CSV Data Import:\nNewly created entries: {}\nUpdated entries: {}'
              .format(newly_created, updated))
        session.flush()
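# Hypothetical invocation, assuming a CSV whose label column holds a
# Python-style list literal; the file and column names below are made up.
#
# samples.csv:
#   text,labels,id
#   "hello world","['Murder', 'Justice']",asda12
importData('samples.csv', text_col='text', label_col='labels',
           sharedId_col='id')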
def add_samples() -> Any:
    # request.args: &model=upr-info_issues
    # request.get_json: {'samples': [{'seq': 'hello world',
    #                                 'sharedId': 'asda12',
    #                                 'training_labels'?: [
    #                                     {'topic': 'Murder'},
    #                                     {'topic': 'Justice'}]},
    #                                ...],
    #                    'refresh_predictions': true }
    # returns {'samples': [{'seq': '',
    #                       'sharedId': 'asda12',
    #                       'predicted_labels': [...]}]}
    data = request.get_json()
    args = request.args
    # Use .get() so a missing parameter triggers our error message instead of
    # a bare KeyError from the request MultiDict.
    if not args.get('model'):
        raise Exception('You need to pass &model=...')
    processed: Set[str] = set()
    response = []
    c = ClassifierCache.get(app.config['BASE_CLASSIFIER_DIR'], args['model'])
    refresh_predictions = (data['refresh_predictions']
                           if 'refresh_predictions' in data else False)

    # First pass: collect the unique sequences that actually need to be
    # (re)classified, deduplicated by hash.
    seq_hash_to_seq_index: Dict[str, int] = {}
    seqs_to_classify: List[str] = []
    for i, sample in enumerate(data['samples']):
        if not sample['seq']:
            continue
        seqHash = hasher(sample['seq'])
        if seqHash in seq_hash_to_seq_index:
            continue
        with sessionLock:
            existing1: ClassificationSample = ClassificationSample.query.get(
                model=args['model'], seqHash=seqHash)
            if (refresh_predictions or not existing1 or
                    not existing1.predicted_labels):
                seqs_to_classify.append(sample['seq'])
                seq_hash_to_seq_index[seqHash] = len(seqs_to_classify) - 1

    classified_seqs: List[Dict[str, float]]
    if seqs_to_classify:
        classified_seqs = c.classify(seqs_to_classify)

    # Second pass: create or update the stored samples and assemble the
    # response.
    for i, sample in enumerate(data['samples']):
        if not sample['seq']:
            continue
        seqHash = hasher(sample['seq'])
        sharedId = (sample['sharedId'] if 'sharedId' in sample else '')
        sample_labels = (sample['training_labels']
                         if 'training_labels' in sample else [])
        # Reset per sample so a previous iteration's value can never leak in.
        response_sample = None
        with sessionLock:
            existing: ClassificationSample = ClassificationSample.query.get(
                model=args['model'], seqHash=seqHash)
            if existing:
                response_sample = existing
                if 'training_labels' in sample:
                    existing.training_labels = sample_labels
                    existing.use_for_training = len(sample_labels) > 0
                if 'sharedId' in sample:
                    existing.sharedId = sharedId
            elif seqHash not in processed:
                response_sample = ClassificationSample(
                    model=args['model'],
                    seq=sample['seq'],
                    seqHash=seqHash,
                    training_labels=sample_labels,
                    sharedId=sharedId,
                    use_for_training=len(sample_labels) > 0)
            session.flush()
        processed.add(seqHash)

        if response_sample:
            if not response_sample.predicted_labels or refresh_predictions:
                predicted_labels = (Classifier.quality_to_predicted_labels(
                    classified_seqs[seq_hash_to_seq_index[seqHash]]))
                with sessionLock:
                    response_sample.predicted_labels = predicted_labels
                    session.flush()
            response.append(
                dict(seq='' if sharedId else sample['seq'],
                     sharedId=sharedId,
                     predicted_labels=response_sample.predicted_labels))

    with sessionLock:
        session.clear()
    return jsonify(dict(samples=response))
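# Hedged client-side example mirroring the request shape documented in the
# comments above; the route path and HTTP verb are assumptions.
import requests

payload = {
    'samples': [{'seq': 'hello world',
                 'sharedId': 'asda12',
                 'training_labels': [{'topic': 'Justice'}]}],
    'refresh_predictions': True,
}
resp = requests.post('http://localhost:5000/add_samples',
                     params={'model': 'upr-info_issues'},
                     json=payload)
print(resp.json())  # {'samples': [{'seq': '', 'sharedId': 'asda12', ...}]}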