def get_info(pid, words=None, extra_previews=True):
    # Serialize a PIL image to a base64-encoded JPEG string.
    def b64img(img):
        data = io.BytesIO()
        img.save(data, format='JPEG', quality=85)
        return base64.b64encode(data.getvalue()).decode()

    mh = MediaHaven()
    alto = mh.get_alto(pid)
    result = dict(
        pid=pid,
        words=len(words) if words is not None else 0,
        alto=alto.search_words(words),
        alto_link=alto.url
    )
    result['ocr_text'] = alto.text
    with mh.get_preview(pid) as im:
        result['previewImageUrl'] = im.meta['previewImagePath']
        result['meta'] = im.meta
        if result['words'] > 0 and extra_previews:
            # Cropped and full-page previews with the search words highlighted.
            result['preview_full'] = b64img(im.highlight_words(words, crop=False))
            result['preview'] = b64img(im.highlight_words(words))
        result['props'] = im.meta['mdProperties']
    return result
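# Usage sketch (not from the original source): the pid and search words below
# are hypothetical placeholders; the call assumes the module-level imports of
# io, base64 and MediaHaven that get_info() relies on.
info = get_info('example_pid_19180801_0001', words=['Jan', 'Janssens'])
print(info['alto_link'])   # link to the ALTO OCR document
print(info['words'])       # number of search words passed in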
class Importer:
    def __init__(self):
        self._solr = Solr(Config(section='solr')['url'])
        self._mh = MediaHaven(buffer_size=100)

    def add(self, item):
        self._solr.add([item])

    def process(self, item):
        if item is None:
            raise Exception("Invalid item passed (None)")

        # Accept either a MediaHaven item dict or a plain pid string.
        if type(item) is not str:
            pid = item['externalId']
        else:
            pid = item
            item = self._mh.one('+(externalId:%s)' % pid)
        if not pid:
            raise Exception("No pid for item %s" % (item,))

        language = ''
        try:
            language = item['mdProperties']['language'][0].lower()
        except Exception as e:
            logger.warning('no language found for %s', pid)
            logger.exception(e)

        alto = self._mh.get_alto(item)
        if not alto:
            logger.debug("no alto for pid '%s'" % (pid,))
            text = ''
        else:
            text = Conversions.normalize(alto.text)

        self.add(dict(id=pid, text=text, language=language))
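# Usage sketch (not part of the original source): process() accepts either a
# pid string or an already-fetched MediaHaven item dict; the pid below is a
# made-up placeholder.
importer = Importer()
importer.process('example_pid_19180801_0001')   # pid string: item is looked up via MediaHaven
# importer.process(mh_item)                     # or pass an item dict that has an 'externalId'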
def oai(self):
    mh = MediaHaven(self._config)
    fragment_id = 'EeZNC2b9TeYMKMQRNcVJnumk'
    try:
        mh.oai().GetRecord(identifier='umid:%s' % fragment_id, metadataPrefix='mets')
    except IdDoesNotExist:
        pass
def _stats_pcts(self):
    mh = MediaHaven()
    nl_count = len(Datasources['namenlijst']['func']())
    mh_count = len(mh.search('+(workflow:GMS) +(archiveStatus:on_tape)'))

    # Maps a row label to the SELECT expression to count, optionally paired
    # with the total to compute percentages against.
    data = OrderedDict({
        '': 'COUNT(*)',
        'names from IFFM namenlijst': ('COUNT(DISTINCT nmlid)', nl_count),
        'newspaper pages': ('COUNT(DISTINCT pid)', mh_count),
    })

    for k, v in data.items():
        total = None
        if type(v) is tuple:
            total = v[1]
            v = v[0]
        args = (v, self.table, self.model.SKIP)
        res = self.db.execute('SELECT %s FROM %s WHERE status != %d' % args)
        matches = int(res.scalar())
        res = self.db.execute(
            'SELECT %s FROM %s WHERE status != %d and score > 0' % args)
        matches_with_score = int(res.scalar())
        counts = [
            matches,
            matches_with_score,
            matches_with_score / matches,
        ]
        if total is not None:
            counts.append(total)
            counts.append(matches / total)
            counts.append(matches_with_score / total)
        data[k] = counts

    return data
if not args.profile:
    logging.getLogger('pythonmodules.profiling').setLevel(logging.ERROR)

samples = Samples(GMB())

if args.train:
    # Train a new chunker on the GMB samples and pickle it for later runs.
    with timeit('Creating NamedEntityChunker'):
        chunker = NamedEntityChunker(samples.training())
    pickle.dump(chunker, open(args.pickle, 'wb'))
else:
    with timeit('Pickle load'):
        chunker = pickle.load(open(args.pickle, 'rb'))

if args.test_mediahaven:
    with timeit('NER Tagging'):
        from pythonmodules.mediahaven import MediaHaven
        # from pythonmodules.config import Config
        mh = MediaHaven()
        item = mh.one('+(workflow:GMS) +(archiveStatus:on_tape)')
        print(chunker.parse(pos_tag(word_tokenize(item['description']))))

if args.test:
    with timeit('Testing accuracy'):
        testsamples = samples.test(args.test)
        to_evaluate = (conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
                       for iobs in tqdm(testsamples, total=args.test))
        score = chunker.evaluate(to_evaluate)
    print("Test accuracy = %.2f%% (tested using %d samples)" %
          (score.accuracy() * 100, int(args.test)))
def mediahaven(self):
    mh = MediaHaven(self._config)
    mh.one('+(externalId:f76639mh4g_19180801_0001)')
logger.propagate = True
config = Config()
table_name = config['db']['table_name']

parser = ArgumentParser(description='Use NER to look for entities and save')
parser.add_argument('--debug', action='store_true', help='Show debug info')
parser.add_argument('--test-connection', action='store_true',
                    help='Just do a connection test to MediaHaven')
parser.add_argument('--clear', action='store_true', help='Clear the table before inserting')
parser.add_argument('--continue', action='store_true', help='Continue from last inserted row')
parser.add_argument('--continue-from', help='Continue from row CONTINUE_FROM')
parser.add_argument('--table', help='The table to store the results in, default: %s' % table_name)
args = parser.parse_args()

mh = MediaHaven(config)
clear_db = args.clear

if args.test_connection:
    # Only verify that we can authenticate and fetch a single item, then stop.
    mh.refresh_token()
    print(type(mh.one()) is dict)
    exit()

db = create_engine(config['db']['connection_url'])
db.connect()
meta = MetaData(db, reflect=True)

if args.table:
    table_name = args.table

try:
    table = meta.tables[table_name]
def __init__(self, config=None, force_regen=False):
    self.nl = Namenlijst(config)
    self.mh = MediaHaven(config)
    self.force_regen = force_regen