def connect(self):
    # Lazily create the SQLAlchemy engine and open the connection only once
    if self._connected:
        return
    self._db = create_engine(self._connection_url)
    with timeit('connect db %s' % self._debug_connection_url, 1000):
        self._db.connect()
    self._connected = True
                    type=int, help='Test the tagger with TEST known examples (default 500)')
parser.add_argument('--test-mediahaven', action='store_true',
                    help='Tag MediaHaven newspaper OCR-text')
parser.add_argument('--profile', action='store_true',
                    help='Output run times of some key operations')
parser.add_argument(dest='pickle', help='Filename of pickle file')

args = parser.parse_args()

if not args.profile:
    logging.getLogger('pythonmodules.profiling').setLevel(logging.ERROR)

samples = Samples(GMB())

if args.train:
    with timeit('Creating NamedEntityChunker'):
        chunker = NamedEntityChunker(samples.training())
    pickle.dump(chunker, open(args.pickle, 'wb'))
else:
    with timeit('Pickle load'):
        chunker = pickle.load(open(args.pickle, 'rb'))

if args.test_mediahaven:
    with timeit('NER Tagging'):
        from pythonmodules.mediahaven import MediaHaven
        # from pythonmodules.config import Config
        mh = MediaHaven()
        item = mh.one('+(workflow:GMS) +(archiveStatus:on_tape)')
        print(chunker.parse(pos_tag(word_tokenize(item['description']))))
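# Possible invocations of this script (the filename "train_ner.py" is hypothetical;
# the --train flag is inferred from the args.train check above):
#
#   python train_ner.py --train chunker.pickle            # train the chunker and pickle it
#   python train_ner.py --test-mediahaven chunker.pickle  # load the pickle and tag one MediaHaven OCR text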
                    help='Extra where clause to pass to the select query (eg. "status=1")')
args = parser.parse_args()

logging.basicConfig()
logger = logging.getLogger()
fh = logging.FileHandler(args.log_file)
fh.setLevel(logging.INFO)
logger.addHandler(fh)

_solr = Solr(Config(section='solr')['url'])
config = Config(section='db')
conn = psycopg2.connect(config['connection_url'])
conn2 = psycopg2.connect(config['connection_url'])
cur = conn.cursor()

with timeit('SELECT', 5000):
    where_clause = ('WHERE %s' % args.where) if args.where else ''
    q = 'SELECT pid, nmlid, entity, id, score FROM %s %s ORDER BY pid ASC' % (
        args.table, where_clause)
    if args.limit:
        q += ' LIMIT %d' % int(args.limit)
    if args.start:
        q += ' OFFSET %d' % int(args.start)
    cur.execute(q)


@multithreaded(10, pre_start=True, pass_thread_id=False)
def process(row):
    try:
        res = _solr.search('id:%s' % row[0], rows=1, fl=['text', 'language'])
def process_pid(row):
    # Re-rates a single row and updates its score/meta in the DB only when they changed
    try:
        id_, full_pid, external_id, entity, score, meta = row
        entity = remove_double_spaces.sub(' ', entity)
        with timeit('Rater init', 1e3):
            rater = Rater(full_pid, external_id, entity)
        cur_rating = score
        meta_old = None
        if meta:
            meta_old = meta
        try:
            meta = get_meta(full_pid, external_id, entity, score, meta)
        except KeyError as e:
            logger.warning(e)
            with conn.cursor() as cur2:
                cur2.execute('UPDATE ' + args.table + ' SET status=4 WHERE id=%s', [id_])
            # conn.commit()

        new_score = 0
        try:
            rating = rater.ratings()
            meta['rating_breakdown'] = {k: rating.scores[k].rating for k in rating.scores}
            meta['rating_multiplier'] = rating.total_multiplier
            new_score = rating.total
        except KeyError as e:
            logger.warning(e)

        toset = dict()
        if not isclose(cur_rating, new_score, abs_tol=.0001):
            toset['score'] = new_score
        if 'quality' in meta:
            del meta['quality']
        meta = json.dumps(meta)
        if meta_to_comp_meta(meta) != meta_to_comp_meta(meta_old):
            toset['meta'] = meta
        if not len(toset):
            return

        with timeit('SLOW UPDATE %s' % id_, 1e3), conn.cursor() as cur2:
            keys = ','.join([k + ' = %s' for k in toset.keys()])
            values = [toset[k] for k in toset.keys()]
            values.append(id_)
            cur2.execute('UPDATE ' + args.table + ' SET ' + keys + ' WHERE id=%s', values)
        # conn2.commit()
    except Exception as e:
        try:
            url = 'http://do-tst-mke-01.do.viaa.be/attestation/info/model-%s/%s/%s/%s' % \
                  (model_name, full_pid, external_id, entity.replace(' ', '/'))
        except Exception as e2:
            url = str(e2)
        logger.warning('exception for %s', url)
        logger.exception(e)
def connect(self):
    super().connect()
    # Reflect the existing schema so tables are available through self._meta
    with timeit('reflect db %s' % self._debug_connection_url, 1000):
        self._meta = MetaData(bind=self._db)
        self._meta.reflect()
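# Once reflect() has run, tables can be addressed by name through the standard
# SQLAlchemy MetaData API, e.g. (table name purely illustrative):
#
#   table = self._meta.tables['attestation_link']
#   rows = self._db.execute(table.select().limit(10))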
def test(self, amount=500):
    # known_tags = NER.allowed_tags
    samples = self.corpus.read_entities()
    if amount == 0:
        logger.info("Loading the entire corpus in memory, this may take a while...")
        # preload all samples to have an amount available for progress indicator
        samples = list(samples)
        amount = len(samples)
    totals = [defaultdict(lambda: 0) for i in range(len(self.taggers))]
    # stats = [defaultdict(lambda: defaultdict(lambda: 0)) for i in range(len(self.taggers))]
    total_tags = 0
    progress = tqdm(total=amount * len(self.taggers))
    timer = timeit()
    full_orig_tags = []
    full_predict_tags = [[] for i in range(len(self.taggers))]

    for sample_index, sample in enumerate(samples):
        if amount and sample_index >= amount:
            break
        sample_tags = Tester.filter_tags(sample.entities)
        orig_tags = [tag[1] for tag in sample_tags]
        ntags = len(orig_tags)
        total_tags += ntags
        full_orig_tags.extend(orig_tags)

        for n, tagger in enumerate(self.taggers):
            progress.update()
            cls = type(tagger).__name__
            timer.restart()
            tags = Tester.filter_tags(tagger.tag(sample.phrase))
            elapsed = timer.elapsed()

            # check and do a naive attempt to fix different tag lengths
            if len(tags) != ntags:
                tags = Tester.fix_tags_counts(tags, sample_tags)
            if len(tags) != ntags:
                logger.error("Samples are of different size for %s (%d vs %d): \nTAGGER: %s\nORIGINAL: %s",
                             cls, len(tags), ntags, tags, sample_tags)
                continue

            tag_types = [tag[1] for tag in tags]
            zipped = list(zip(tag_types, orig_tags))
            sames = sum([tag[0] == tag[1] for tag in zipped])

            # for tags in zipped:
            #     if tags[0] in known_tags:
            #         if tags[0] == tags[1]:
            #             stats[n][tags[0]]['tp'] += 1
            #         else:
            #             stats[n][tags[0]]['fp'] += 1
            #     elif tags[1] in known_tags:
            #         stats[n][tags[1]]['fn'] += 1
            #
            #     for tag in known_tags:
            #         if tag != tags[1]:
            #             stats[n][tag]['tn'] += 1

            totals[n]['same'] += sames
            totals[n]['time'] += elapsed
            full_predict_tags[n].extend(tag_types)

    Stats = namedtuple('Stats', ['accuracy', 'time', 'total_checked', 'confusion_matrix'])
    return [Stats(
        t['same'] / total_tags * 100,
        t['time'],
        # stats[i],
        total_tags,
        ConfusionMatrix(actual_vector=full_orig_tags, predict_vector=full_predict_tags[i])
    ) for i, t in enumerate(totals)]
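# Sketch of how the returned Stats tuples might be consumed (assumes a Tester instance
# built around the same `taggers` list; variable names are illustrative):
#
#   results = tester.test(amount=500)
#   for tagger, stats in zip(tester.taggers, results):
#       print(type(tagger).__name__, stats.accuracy, stats.total_checked, stats.time)
#       print(stats.confusion_matrix)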
def clear(self):
    with timeit('Clearing model %s' % self.link._meta.db_table):
        # this doesn't reset identity and is slower...
        # self.link.objects.all().delete()
        self.db.execute('TRUNCATE TABLE %s RESTART IDENTITY' % self.link._meta.db_table)
    'Migrate statuses between attestation_link tables (doesn\'t overwrite if a status already set)')
parser.add_argument('from_table', help='Origin table')
parser.add_argument('to_table', help='Target table')
parser.add_argument('--debug', action='store_true', default=False, help='Show debug log messages')
args = parser.parse_args()

logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

config = Config(section='db')
conn = psycopg2.connect(config['connection_url'])
cur = conn.cursor()
cur2 = conn.cursor()

with timeit('SELECT', 5000):
    cur.execute(
        'select status, kind, extras, pid, nmlid from %s where status != 0 AND status != 4' % args.from_table)

processed = 0
total = cur.rowcount
q = 'UPDATE ' + args.to_table + ' SET status=%s, kind=%s, extras=%s WHERE pid=%s AND nmlid=%s AND status=0'

for idx, row in enumerate(tqdm(cur, total=total)):
    res = cur2.execute(q, row)
    processed += cur2.rowcount
    logger.debug('%d changed for: %s', cur2.rowcount, row)
    if idx % 10 == 0:
        with timeit('commit', 250):
            conn.commit()
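# Example run (the script filename is hypothetical):
#
#   python migrate_statuses.py attestation_link_v1 attestation_link_v2 --debug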