Ejemplo n.º 1
0
class Importer:
    """Index MediaHaven items into Solr.

    Each processed item is stored as a Solr document holding the item's
    ``externalId`` (as ``id``), its normalized ALTO text and its lower-cased
    language.
    """

    def __init__(self):
        # The Solr endpoint comes from the 'url' key of the 'solr' config
        # section.
        self._solr = Solr(Config(section='solr')['url'])
        self._mh = MediaHaven(buffer_size=100)

    def add(self, item):
        """Send a single document to Solr."""
        self._solr.add([item])

    def process(self, item):
        """Build the Solr document for ``item`` and add it to the index.

        :param item: either a pid string (the item is then looked up in
            MediaHaven by ``externalId``) or an already fetched item that
            exposes ``item['externalId']``.
        :raises Exception: if ``item`` is None.
        :raises ValueError: if no pid could be determined for the item.
        """
        if item is None:
            raise Exception("Invalid item passed (None)")

        if isinstance(item, str):
            pid = item
            item = self._mh.one('+(externalId:%s)' % pid)
        else:
            pid = item['externalId']

        if not pid:
            # Raising a plain string is a TypeError in Python 3; raise a real
            # exception so the message actually reaches the caller.
            raise ValueError("No pid for item %s" % (item, ))

        # The language is best effort: any failure while reading it is logged
        # and the document is indexed with an empty language.
        language = ''
        try:
            language = item['mdProperties']['language'][0].lower()
        except Exception as e:
            logger.warning('no language found for %s', pid)
            logger.exception(e)

        alto = self._mh.get_alto(item)
        if not alto:
            logger.debug("no alto for pid '%s' ", pid)
            text = ''
        else:
            text = Conversions.normalize(alto.text)
        self.add(dict(id=pid, text=text, language=language))
Ejemplo n.º 2
0
    # Unless args.profile is set, raise the profiling module's log level to
    # ERROR so its lower-level output is suppressed.
    if not args.profile:
        logging.getLogger('pythonmodules.profiling').setLevel(logging.ERROR)

    samples = Samples(GMB())
    if args.train:
        with timeit('Creating NamedEntityChunker'):
            chunker = NamedEntityChunker(samples.training())
        # Use a context manager so the pickle file is flushed and closed
        # deterministically instead of whenever the file object is collected.
        with open(args.pickle, 'wb') as pickle_file:
            pickle.dump(chunker, pickle_file)
    else:
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; args.pickle must only point at trusted files.
        with timeit('Pickle load'):
            with open(args.pickle, 'rb') as pickle_file:
                chunker = pickle.load(pickle_file)

    if args.test_mediahaven:
        with timeit('NER Tagging'):
            # Imported here so MediaHaven is only loaded when this option is
            # actually used.
            from pythonmodules.mediahaven import MediaHaven

            mh = MediaHaven()
            item = mh.one('+(workflow:GMS) +(archiveStatus:on_tape)')
            print(chunker.parse(pos_tag(word_tokenize(item['description']))))

    if args.test:
        with timeit('Testing accuracy'):
            testsamples = samples.test(args.test)
            # Lazily rebuild a tree from each sample's ((word, tag), iob)
            # pairs so the chunker can be evaluated against it.
            to_evaluate = (conlltags2tree([(w, t, iob)
                                           for (w, t), iob in iobs])
                           for iobs in tqdm(testsamples, total=args.test))
            score = chunker.evaluate(to_evaluate)
            print("Test accuracy = %.2f%% (tested using %d samples)" %
                  (score.accuracy() * 100, int(args.test)))
Ejemplo n.º 3
0
 def mediahaven(self):
     """Run one fixed ``externalId`` query against MediaHaven.

     A client is built from ``self._config``; the query result is discarded
     and nothing is returned.
     """
     MediaHaven(self._config).one('+(externalId:f76639mh4g_19180801_0001)')
Ejemplo n.º 4
0
parser.add_argument('--debug', action='store_true', help='Show debug info')
parser.add_argument('--test-connection', action='store_true', help='Just do a connection test to MediaHaven')
parser.add_argument('--clear', action='store_true', help='Clear the table before inserting')
parser.add_argument('--continue', action='store_true', help='Continue from last inserted row')
parser.add_argument('--continue-from', help='Continue from row CONTINUE_FROM')
parser.add_argument('--table', help='The table to store the results in, default: %s' % table_name)
args = parser.parse_args()


mh = MediaHaven(config)

clear_db = args.clear

# Connection test mode: refresh the token, print whether a single query
# returned a dict, then stop before touching the database.
if args.test_connection:
    mh.refresh_token()
    print(type(mh.one()) is dict)
    # exit() is a helper injected by the site module for interactive use;
    # raising SystemExit is the equivalent that always works in a script.
    raise SystemExit(0)

db = create_engine(config['db']['connection_url'])
# NOTE(review): the returned connection is not kept; presumably this is a
# fail-fast connectivity check - confirm.
db.connect()
meta = MetaData(db, reflect=True)
if args.table:
    table_name = args.table
try:
    table = meta.tables[table_name]
except KeyError:
    raise FileNotFoundError('Couldnt find table "%s"' % table_name)

start = 0
# 'continue' is a Python keyword, so the option cannot be read as an
# attribute (args.continue is a syntax error); go through vars() instead.
if vars(args)['continue']:
    last_index = db.execute(func.max(table.c.doc_index)).scalar()
    # MAX() over an empty table is NULL, which arrives here as None; in that
    # case there is nothing to continue from, so start at the first row.
    start = 0 if last_index is None else last_index + 1