def refresh_predictions(self,
                            limit: int = 2000,
                            batch_size: int = 1000) -> None:
        with sessionLock:
            samples: List[ClassificationSample] = list(
                ClassificationSample.query.find(
                    dict(model=self.model_name)).sort([('seqHash', -1)
                                                       ]).limit(limit))
            seqs = [s.seq for s in samples]

        # Process the sequences in fixed-size batches.
        for start in range(0, len(seqs), batch_size):
            batch_seqs = seqs[start:start + batch_size]
            sample_probs = self.classify(batch_seqs)

            with sessionLock:
                for j, seq in enumerate(batch_seqs):
                    sample: ClassificationSample = (
                        ClassificationSample.query.get(model=self.model_name,
                                                       seqHash=hasher(seq)))
                    if sample:
                        sample.predicted_labels = (
                            Classifier.quality_to_predicted_labels(
                                sample_probs[j]))
                    else:
                        self.logger.error('Lost sample: %s' % hasher(seq))
                session.flush()
                # This is harsh, but it seems otherwise some cache builds up
                # inside ming and eventually OOM's the application...
                # Thankfully, due to sessionLock this should be safe.
                session.clear()

    def get_embedding(self, seqs: List[str]) -> List[np.ndarray]:
        if len(seqs) == 0:
            return []
        if len(seqs) > 5000:
            raise Exception(
                'You should never handle more than 5000 berts at the same time!'
            )

        # Map each sequence hash to every index where that sequence occurs,
        # so duplicate inputs are embedded only once.
        hashed_seq_to_indices: Dict[str, List[int]] = {}
        for i, seq in enumerate(seqs):
            hashed_seq_to_indices.setdefault(hasher(seq), []).append(i)

        result: List[Optional[np.ndarray]] = [None] * len(seqs)
        # fetch from cache
        with sessionLock:
            for entry in Embedding.query.find(
                    dict(bert=self.bert,
                         seqHash={'$in': list(hashed_seq_to_indices.keys())}),
                    projection=('seqHash', 'embedding')):
                for i in hashed_seq_to_indices[entry.seqHash]:
                    result[i] = pickle.loads(entry.embedding)

        # Collect each unique uncached sequence exactly once, even when it
        # appears multiple times in seqs.
        undone_seqs: List[str] = []
        for indices in hashed_seq_to_indices.values():
            if result[indices[0]] is None:
                undone_seqs.append(seqs[indices[0]])

        self.logger.debug(
            'Using %d of %d embedding matrices fetched from MongoDB.' %
            (len(seqs) - len(undone_seqs), len(seqs)))
        if len(undone_seqs) == 0:
            return result

        self.logger.info('Building %d embedding matrices with TensorFlow...' %
                         (len(undone_seqs)))
        done_seqs = self._build_embedding(undone_seqs)

        with sessionLock:
            for seq, matrix in zip(undone_seqs, done_seqs):
                seqHash = hasher(seq)
                for i in hashed_seq_to_indices[seqHash]:
                    result[i] = matrix
                # Prevent duplicate key errors since another thread might
                # have added this embedding.
                if not Embedding.query.get(bert=self.bert, seqHash=seqHash):
                    # convert npArray to list for storage in MongoDB
                    Embedding(bert=self.bert,
                              seq=seq,
                              seqHash=seqHash,
                              embedding=pickle.dumps(matrix))
            try:
                session.flush()
            except DuplicateKeyError:
                pass
        self.logger.info('Stored %d embedding matrices in MongoDB.' %
                         len(done_seqs))
        return result
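
The hash-keyed dedup and two-tier cache in get_embedding is a reusable pattern. Below is a minimal self-contained sketch of the same idea, with sha256 standing in for hasher and a plain dict standing in for the Embedding collection (both are hypothetical stand-ins, and the "expensive computation" is faked with len()):

from collections import defaultdict
from hashlib import sha256
from typing import Dict, List, Optional


def cached_compute(seqs: List[str], cache: Dict[str, float]) -> List[float]:
    # Group duplicate inputs by hash so each unique value is computed once.
    indices_by_hash: Dict[str, List[int]] = defaultdict(list)
    for i, seq in enumerate(seqs):
        indices_by_hash[sha256(seq.encode()).hexdigest()].append(i)

    result: List[Optional[float]] = [None] * len(seqs)
    # First pass: fill every position that hits the cache.
    for h, indices in indices_by_hash.items():
        if h in cache:
            for i in indices:
                result[i] = cache[h]

    # Second pass: compute each miss once, then backfill the cache and
    # every index sharing the same hash.
    for h, indices in indices_by_hash.items():
        if result[indices[0]] is None:
            value = float(len(seqs[indices[0]]))  # stand-in for real work
            cache[h] = value
            for i in indices:
                result[i] = value
    return [v for v in result if v is not None]  # every slot is filled


cache: Dict[str, float] = {}
print(cached_compute(['a', 'bb', 'a'], cache))  # [1.0, 2.0, 1.0]
print(cached_compute(['a', 'ccc'], cache))      # 'a' now hits the cache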
Example #3
def importData(path: str, text_col: str, label_col: str,
               sharedId_col: str) -> None:
    with open(path, 'r') as csvFile, sessionLock:
        for row in csv.DictReader(csvFile):
            seq = row[text_col]
            seqHash = hasher(seq)

            training_labels: List[Dict[str, float]] = []
            if label_col != '':
                # Parse the list literal without executing arbitrary code.
                training_label_list = ast.literal_eval(row[label_col])
                training_labels = [dict(topic=l) for l in training_label_list]

            sharedId = ''
            if sharedId_col != '':
                sharedId = row[sharedId_col]

            existing: ClassificationSample = ClassificationSample.query.get(
                model=FLAGS.model, seqHash=seqHash)
            if not existing:
                existing = ClassificationSample(
                    model=FLAGS.model,
                    seq=seq,
                    seqHash=seqHash,
                    training_labels=training_labels,
                    sharedId=sharedId)
            else:
                if label_col != '':
                    existing.training_labels = training_labels
                if sharedId_col != '':
                    existing.sharedId = sharedId
            existing.use_for_training = len(training_labels) > 0
        session.flush()
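
For reference, importData expects each label_col cell to hold a Python-style list literal and writes one ClassificationSample per unique text. A hypothetical invocation (file name and column names are illustrative):

# data.csv:
#   text,labels,shared_id
#   "Access to clean water was discussed","['Water', 'Health']",doc-001
importData('data.csv', text_col='text', label_col='labels',
           sharedId_col='shared_id')
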
def delete_samples() -> Any:
    # request.args: &model=upr-info_issues&seq=*
    args = request.args

    # .get() lets a missing parameter reach our explanatory error instead
    # of Flask's generic 400 from MultiDict key access.
    if not args.get('model'):
        raise Exception('You need to pass &model=...')
    if not args.get('seq'):
        raise Exception('You need to pass &seq=...')
    with sessionLock:
        if args['seq'] == '*':
            ClassificationSample.query.remove({'model': args['model']})
        else:
            ClassificationSample.query.remove({
                'model': args['model'],
                'seqHash': hasher(args['seq'])
            })
        session.flush()
    return jsonify({})
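
Assuming the handler above is registered for GET at a /delete_samples route (route, host, and port here are hypothetical), a client call could look like:

import requests

BASE = 'http://localhost:5000'
# Delete one sample by its exact sequence text...
requests.get(BASE + '/delete_samples',
             params={'model': 'upr-info_issues', 'seq': 'hello world'})
# ...or all samples for the model with the '*' wildcard.
requests.get(BASE + '/delete_samples',
             params={'model': 'upr-info_issues', 'seq': '*'})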
Example #5
def importData(path: str, text_col: str, label_col: str,
               sharedId_col: str) -> None:
    with open(path, 'r') as csvFile, sessionLock:
        newly_created: int = 0
        updated: int = 0
        for row in csv.DictReader(csvFile):
            seq = row[text_col]
            seqHash = hasher(seq)

            training_labels: List[Dict[str, float]] = []
            if label_col != '':
                # Parse the list literal without executing arbitrary code.
                training_label_list = ast.literal_eval(row[label_col])
                training_labels = [dict(topic=l) for l in training_label_list]

            sharedId = ''
            if sharedId_col != '':
                sharedId = row[sharedId_col]

            existing: ClassificationSample = ClassificationSample.query.get(
                model=FLAGS.model, seqHash=seqHash)
            if not existing:
                existing = ClassificationSample(
                    model=FLAGS.model,
                    seq=seq,
                    seqHash=seqHash,
                    training_labels=training_labels,
                    sharedId=sharedId)
                newly_created += 1
            else:
                if label_col != '':
                    existing.training_labels = training_labels
                if sharedId_col != '':
                    existing.sharedId = sharedId
                if label_col != '' or sharedId_col != '':
                    updated += 1

            existing.use_for_training = len(training_labels) > 0
        print('CSV Data Import:\nNewly created entries: {}\n'
              'Updated entries: {}'.format(newly_created, updated))
        session.flush()

def add_samples() -> Any:
    # request.args: &model=upr-info_issues
    # request.get_json: {'samples': [{'seq': 'hello world',
    #                                 'sharedId': 'asda12',
    #                                 'training_labels'?: [
    #                                     {'topic': 'Murder'},
    #                                     {'topic': 'Justice'}]},
    #                                ...],
    #                    'refresh_predictions': true }
    # returns {'samples': [{'seq': '',
    #                       'sharedId': 'asda12',
    #                       'predicted_labels': [...]}]}
    data = request.get_json()
    args = request.args

    if not args.get('model'):
        raise Exception('You need to pass &model=...')

    processed: Set[str] = set()
    response = []

    c = ClassifierCache.get(app.config['BASE_CLASSIFIER_DIR'], args['model'])

    refresh_predictions = data.get('refresh_predictions', False)

    seq_hash_to_seq_index: Dict[str, int] = {}
    seqs_to_classify: List[str] = []

    for sample in data['samples']:
        if not sample['seq']:
            continue
        seqHash = hasher(sample['seq'])
        if seqHash in seq_hash_to_seq_index:
            continue
        with sessionLock:
            existing1: ClassificationSample = ClassificationSample.query.get(
                model=args['model'], seqHash=seqHash)
            if (refresh_predictions or not existing1
                    or not existing1.predicted_labels):
                seqs_to_classify.append(sample['seq'])
                seq_hash_to_seq_index[seqHash] = len(seqs_to_classify) - 1

    classified_seqs: List[Dict[str, float]] = []
    if seqs_to_classify:
        classified_seqs = c.classify(seqs_to_classify)

    for sample in data['samples']:
        if not sample['seq']:
            continue
        seqHash = hasher(sample['seq'])
        sharedId = sample.get('sharedId', '')
        sample_labels = sample.get('training_labels', [])

        # Stays None when the sample is neither found nor newly created,
        # which otherwise leaves the name unbound below.
        response_sample: Optional[ClassificationSample] = None
        with sessionLock:
            existing: ClassificationSample = ClassificationSample.query.get(
                model=args['model'], seqHash=seqHash)
            if existing:
                response_sample = existing
                if 'training_labels' in sample:
                    existing.training_labels = sample_labels
                    existing.use_for_training = len(sample_labels) > 0
                if 'sharedId' in sample:
                    existing.sharedId = sharedId
            elif seqHash not in processed:
                response_sample = ClassificationSample(
                    model=args['model'],
                    seq=sample['seq'],
                    seqHash=seqHash,
                    training_labels=sample_labels,
                    sharedId=sharedId,
                    use_for_training=len(sample_labels) > 0)
            session.flush()
        processed.add(seqHash)
        if response_sample:
            if not response_sample.predicted_labels or refresh_predictions:
                predicted_labels = (Classifier.quality_to_predicted_labels(
                    classified_seqs[seq_hash_to_seq_index[seqHash]]))
                with sessionLock:
                    response_sample.predicted_labels = predicted_labels
                    session.flush()

            response.append(
                dict(seq='' if sharedId else sample['seq'],
                     sharedId=sharedId,
                     predicted_labels=response_sample.predicted_labels))
    with sessionLock:
        session.clear()
    return jsonify(dict(samples=response))
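
Assuming the handler above is registered for POST at an /add_samples route (route, host, and port hypothetical), a request matching the shapes documented in the handler's comment might be:

import requests

resp = requests.post(
    'http://localhost:5000/add_samples',
    params={'model': 'upr-info_issues'},
    json={'samples': [{'seq': 'hello world',
                       'sharedId': 'asda12',
                       'training_labels': [{'topic': 'Murder'},
                                           {'topic': 'Justice'}]}],
          'refresh_predictions': True})
print(resp.json())  # {'samples': [{'seq': '', 'sharedId': 'asda12',
                    #               'predicted_labels': [...]}]}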