Example #1
0
def get_info(pid, words=None, extra_previews=True):
    """Gather ALTO/OCR data and preview metadata for ``pid`` from MediaHaven.

    :param pid: identifier handed to ``MediaHaven.get_alto`` and
        ``MediaHaven.get_preview``.
    :param words: optional collection of words to search for in the ALTO
        and to highlight in the preview image.
    :param extra_previews: when true (and ``words`` is non-empty) the result
        also carries two base64-encoded JPEG renderings with the words
        highlighted.
    :return: dict with the keys ``pid``, ``words`` (number of words),
        ``alto``, ``alto_link``, ``ocr_text``, ``previewImageUrl``, ``meta``,
        ``props`` and, optionally, ``preview_full`` and ``preview``.
    """
    def _to_base64_jpeg(image):
        # Serialise the image in memory as JPEG and return it as base64 text.
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=85)
        return base64.b64encode(buffer.getvalue()).decode()

    mh = MediaHaven()
    alto = mh.get_alto(pid)

    word_count = 0 if words is None else len(words)
    result = {
        'pid': pid,
        'words': word_count,
        'alto': alto.search_words(words),
        'alto_link': alto.url,
        'ocr_text': alto.text,
    }

    with mh.get_preview(pid) as im:
        result['previewImageUrl'] = im.meta['previewImagePath']
        result['meta'] = im.meta

        wants_highlights = word_count > 0 and extra_previews
        if wants_highlights:
            full_page = im.highlight_words(words, crop=False)
            result['preview_full'] = _to_base64_jpeg(full_page)
            cropped = im.highlight_words(words)
            result['preview'] = _to_base64_jpeg(cropped)

        result['props'] = im.meta['mdProperties']

    return result
Example #2
0
class Importer:
    """Index the OCR text of MediaHaven items into Solr."""

    def __init__(self):
        # The Solr endpoint comes from the 'solr' config section.
        self._solr = Solr(Config(section='solr')['url'])
        self._mh = MediaHaven(buffer_size=100)

    def add(self, item):
        """Send a single document (a dict) to Solr."""
        self._solr.add([item])

    def process(self, item):
        """Build a Solr document for ``item`` and add it to the index.

        :param item: either a pid string (the item is then looked up in
            MediaHaven by ``externalId``) or an item mapping that has an
            ``externalId`` key.
        :raises Exception: when ``item`` is None or no pid can be determined.
        """
        if item is None:
            raise Exception("Invalid item passed (None)")

        if isinstance(item, str):
            pid = item
            item = self._mh.one('+(externalId:%s)' % pid)
        else:
            pid = item['externalId']

        if not pid:
            # Raising a bare string is a TypeError in Python 3; raise a real
            # exception so the intended message reaches the caller.
            raise Exception("No pid for item %s" % (item, ))

        # The language is optional: a missing value is logged, not fatal.
        language = ''
        try:
            language = item['mdProperties']['language'][0].lower()
        except Exception as e:
            logger.warning('no language found for %s', pid)
            logger.exception(e)

        alto = self._mh.get_alto(item)
        if not alto:
            logger.debug("no alto for pid '%s' ", pid)
            text = ''
        else:
            text = Conversions.normalize(alto.text)
        self.add(dict(id=pid, text=text, language=language))
Example #3
0
 def oai(self):
     """Request one hard-coded record over MediaHaven's OAI interface.

     The record is requested with the ``mets`` metadata prefix and the
     response is discarded; an ``IdDoesNotExist`` error is ignored.
     """
     fragment_id = 'EeZNC2b9TeYMKMQRNcVJnumk'
     request = {
         'identifier': 'umid:%s' % fragment_id,
         'metadataPrefix': 'mets',
     }
     mh = MediaHaven(self._config)
     try:
         mh.oai().GetRecord(**request)
     except IdDoesNotExist:
         # An unknown identifier is acceptable here.
         pass
Example #4
0
    def _stats_pcts(self):
        """Compute match counts and ratios for each statistic.

        For every entry the table ``self.table`` is queried twice (rows whose
        status differs from ``self.model.SKIP``, and the subset of those with
        ``score > 0``) using the entry's SQL aggregate expression.

        :return: OrderedDict mapping each label to a list
            ``[matches, matches_with_score, matches_with_score / matches]``;
            entries that have a reference total additionally get
            ``[total, matches / total, matches_with_score / total]`` appended.
            A ratio whose denominator is 0 is reported as ``0.0``.
        """
        def ratio(part, whole):
            # An empty table or an empty reference set must not crash the
            # statistics with a ZeroDivisionError.
            return part / whole if whole else 0.0

        mh = MediaHaven()

        nl_count = len(Datasources['namenlijst']['func']())
        mh_count = len(mh.search('+(workflow:GMS) +(archiveStatus:on_tape)'))

        # Value is either a bare aggregate expression or a tuple of
        # (aggregate expression, reference total).
        data = OrderedDict({
            '':
            'COUNT(*)',
            'names from IFFM namenlijst': ('COUNT(DISTINCT nmlid)', nl_count),
            'newspaper pages': ('COUNT(DISTINCT pid)', mh_count),
        })

        for k, v in data.items():
            total = None
            if isinstance(v, tuple):
                v, total = v

            # NOTE(review): the query is assembled with string formatting;
            # the expression and SKIP constant are internal, but confirm
            # that self.table can never hold untrusted input.
            args = (v, self.table, self.model.SKIP)
            res = self.db.execute('SELECT %s FROM %s WHERE status != %d' %
                                  args)
            matches = int(res.scalar())

            res = self.db.execute(
                'SELECT %s FROM %s WHERE status != %d and score > 0' % args)
            matches_with_score = int(res.scalar())
            counts = [
                matches,
                matches_with_score,
                ratio(matches_with_score, matches),
            ]

            if total is not None:
                counts.append(total)
                counts.append(ratio(matches, total))
                counts.append(ratio(matches_with_score, total))
            # Only existing keys are reassigned, so mutating while iterating
            # is safe here.
            data[k] = counts

        return data
Example #5
0
    # Silence the profiling logger unless profiling output was requested.
    if not args.profile:
        logging.getLogger('pythonmodules.profiling').setLevel(logging.ERROR)

    samples = Samples(GMB())
    if args.train:
        with timeit('Creating NamedEntityChunker'):
            chunker = NamedEntityChunker(samples.training())
        # Use a context manager so the pickle file is flushed and closed.
        with open(args.pickle, 'wb') as pickle_file:
            pickle.dump(chunker, pickle_file)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point args.pickle at trusted files.
        with timeit('Pickle load'), open(args.pickle, 'rb') as pickle_file:
            chunker = pickle.load(pickle_file)

    if args.test_mediahaven:
        with timeit('NER Tagging'):
            from pythonmodules.mediahaven import MediaHaven

            # from pythonmodules.config import Config
            mh = MediaHaven()
            item = mh.one('+(workflow:GMS) +(archiveStatus:on_tape)')
            print(chunker.parse(pos_tag(word_tokenize(item['description']))))

    if args.test:
        with timeit('Testing accuracy'):
            testsamples = samples.test(args.test)
            # Lazily convert each sample's ((word, tag), iob) pairs into a
            # tree for evaluation.
            to_evaluate = (conlltags2tree([(w, t, iob)
                                           for (w, t), iob in iobs])
                           for iobs in tqdm(testsamples, total=args.test))
            score = chunker.evaluate(to_evaluate)
            print("Test accuracy = %.2f%% (tested using %d samples)" %
                  (score.accuracy() * 100, int(args.test)))
Example #6
0
 def mediahaven(self):
     """Run a single ``MediaHaven.one`` lookup for a hard-coded externalId.

     The result of the lookup is discarded.
     """
     query = '+(externalId:f76639mh4g_19180801_0001)'
     client = MediaHaven(self._config)
     client.one(query)
Example #7
0
logger.propagate = True


config = Config()
# Default target table; may be overridden with --table below.
table_name = config['db']['table_name']
parser = ArgumentParser(description='Use NER to look for entities and save')
parser.add_argument('--debug', action='store_true', help='Show debug info')
parser.add_argument('--test-connection', action='store_true',
                    help='Just do a connection test to MediaHaven')
parser.add_argument('--clear', action='store_true',
                    help='Clear the table before inserting')
# NOTE: 'continue' is a Python keyword, so this option can only be read
# with getattr(args, 'continue').
parser.add_argument('--continue', action='store_true',
                    help='Continue from last inserted row')
parser.add_argument('--continue-from', help='Continue from row CONTINUE_FROM')
parser.add_argument('--table',
                    help='The table to store the results in, default: %s' % table_name)
args = parser.parse_args()


mh = MediaHaven(config)

clear_db = args.clear

if args.test_connection:
    mh.refresh_token()
    print(type(mh.one()) is dict)
    # The bare ``exit()`` helper is injected by the ``site`` module for
    # interactive use and is not guaranteed to exist; raise SystemExit.
    raise SystemExit

db = create_engine(config['db']['connection_url'])
# NOTE(review): the returned connection is discarded — presumably this is
# only a connectivity check; confirm.
db.connect()
# NOTE(review): ``reflect=True`` is deprecated in newer SQLAlchemy releases —
# confirm the installed version before upgrading.
meta = MetaData(db, reflect=True)
if args.table:
    table_name = args.table
try:
    table = meta.tables[table_name]
Example #8
0
 def __init__(self):
     """Create the Solr client and the MediaHaven client.

     The Solr URL is read from the ``url`` entry of the ``solr`` config
     section; MediaHaven is created with ``buffer_size=100``.
     """
     solr_url = Config(section='solr')['url']
     self._solr = Solr(solr_url)
     self._mh = MediaHaven(buffer_size=100)
Example #9
0
 def __init__(self, config=None, force_regen=False):
     """Create the Namenlijst and MediaHaven clients from ``config``.

     :param config: passed unchanged to both ``Namenlijst`` and
         ``MediaHaven``.
     :param force_regen: stored as-is on the instance.
     """
     self.nl, self.mh = Namenlijst(config), MediaHaven(config)
     self.force_regen = force_regen