class Importer:
    """Indexes MediaHaven items (with their OCR/alto text) into a Solr core."""

    def __init__(self):
        # Solr endpoint comes from the [solr] config section; MediaHaven
        # client is buffered for batch-friendly lookups.
        self._solr = Solr(Config(section='solr')['url'])
        self._mh = MediaHaven(buffer_size=100)

    def add(self, item):
        """Add a single document dict to the Solr index."""
        self._solr.add([item])

    def process(self, item):
        """Resolve, extract and index one item.

        Accepts either a pid string or an already-fetched MediaHaven item
        dict (keyed by 'externalId'). Fetches the item's alto text, detects
        its language, and pushes a (id, text, language) doc to Solr.

        Raises:
            ValueError: if item is None or no pid can be determined.
        """
        if item is None:
            raise ValueError("Invalid item passed (None)")

        # Accept either a bare pid string or an item dict.
        if isinstance(item, str):
            pid = item
        else:
            pid = item['externalId']

        # FIX: original did `raise "No pid..." % ...` — raising a plain
        # string is a TypeError in Python 3. Also check BEFORE the fetch,
        # since the fetch query is built from the pid.
        if not pid:
            raise ValueError("No pid for item %s" % (item, ))

        item = self._mh.one('+(externalId:%s)' % pid)

        language = ''
        try:
            language = item['mdProperties']['language'][0].lower()
        except Exception as e:
            # Best-effort: missing/odd metadata just means no language tag.
            logger.warning('no language found for %s', pid)
            logger.exception(e)

        alto = self._mh.get_alto(item)
        if not alto:
            logger.debug("no alto for pid '%s' " % (pid, ))
            text = ''
        else:
            text = Conversions.normalize(alto.text)

        self.add(dict(id=pid, text=text, language=language))
# Silence profiling chatter unless explicitly requested.
if not args.profile:
    logging.getLogger('pythonmodules.profiling').setLevel(logging.ERROR)

samples = Samples(GMB())

if args.train:
    with timeit('Creating NamedEntityChunker'):
        chunker = NamedEntityChunker(samples.training())
    # FIX: use a context manager so the pickle file handle is closed
    # deterministically (original passed a bare open() to pickle.dump).
    with open(args.pickle, 'wb') as fh:
        pickle.dump(chunker, fh)
else:
    with timeit('Pickle load'):
        with open(args.pickle, 'rb') as fh:
            chunker = pickle.load(fh)

if args.test_mediahaven:
    with timeit('NER Tagging'):
        from pythonmodules.mediahaven import MediaHaven
        mh = MediaHaven()
        item = mh.one('+(workflow:GMS) +(archiveStatus:on_tape)')
        print(chunker.parse(pos_tag(word_tokenize(item['description']))))

if args.test:
    with timeit('Testing accuracy'):
        testsamples = samples.test(args.test)
        # Lazily convert IOB-tagged samples to trees so we never hold the
        # whole test set in memory at once.
        to_evaluate = (conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
                       for iobs in tqdm(testsamples, total=args.test))
        score = chunker.evaluate(to_evaluate)
        print("Test accuracy = %.2f%% (tested using %d samples)"
              % (score.accuracy() * 100, int(args.test)))
def mediahaven(self):
    """Smoke-test the MediaHaven connection by fetching one known item."""
    client = MediaHaven(self._config)
    client.one('+(externalId:f76639mh4g_19180801_0001)')
parser.add_argument('--debug', action='store_true', help='Show debug info')
parser.add_argument('--test-connection', action='store_true',
                    help='Just do a connection test to MediaHaven')
parser.add_argument('--clear', action='store_true', help='Clear the table before inserting')
parser.add_argument('--continue', action='store_true', help='Continue from last inserted row')
parser.add_argument('--continue-from', help='Continue from row CONTINUE_FROM')
parser.add_argument('--table', help='The table to store the results in, default: %s' % table_name)
args = parser.parse_args()

mh = MediaHaven(config)
clear_db = args.clear

if args.test_connection:
    mh.refresh_token()
    # isinstance is the idiomatic type check (was: type(...) is dict).
    print(isinstance(mh.one(), dict))
    exit()

db = create_engine(config['db']['connection_url'])
db.connect()
meta = MetaData(db, reflect=True)

if args.table:
    table_name = args.table
try:
    table = meta.tables[table_name]
except KeyError as err:
    # Chain the cause so the original KeyError isn't lost.
    raise FileNotFoundError('Couldn\'t find table "%s"' % table_name) from err

start = 0
# 'continue' is a Python keyword, so it is not reachable as args.continue;
# getattr is clearer than vars(args)['continue'].
if getattr(args, 'continue'):
    # FIX: max() over an empty table yields None — original crashed with
    # TypeError (None + 1); fall back to starting at row 0.
    last = db.execute(func.max(table.c.doc_index)).scalar()
    start = 0 if last is None else last + 1