Example #1
    def connect(self):
        if self._connected:
            return
        self._db = create_engine(self._connection_url)
        with timeit('connect db %s' % self._debug_connection_url, 1000):
            self._db.connect()

        self._connected = True
        type=int,
        help='Test the tagger with TEST known examples (default 500)')
    parser.add_argument('--test-mediahaven',
                        action='store_true',
                        help='Tag MediaHaven newspaper OCR-text')
    parser.add_argument('--profile',
                        action='store_true',
                        help='Output run times of some key operations')
    parser.add_argument(dest='pickle', help='Filename of pickle file')
    args = parser.parse_args()
    if not args.profile:
        logging.getLogger('pythonmodules.profiling').setLevel(logging.ERROR)

    samples = Samples(GMB())
    if args.train:
        with timeit('Creating NamedEntityChunker'):
            chunker = NamedEntityChunker(samples.training())
        pickle.dump(chunker, open(args.pickle, 'wb'))
    else:
        with timeit('Pickle load'):
            chunker = pickle.load(open(args.pickle, 'rb'))

    if args.test_mediahaven:
        with timeit('NER Tagging'):
            from pythonmodules.mediahaven import MediaHaven

            # from pythonmodules.config import Config
            mh = MediaHaven()
            item = mh.one('+(workflow:GMS) +(archiveStatus:on_tape)')
            print(chunker.parse(pos_tag(word_tokenize(item['description']))))
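
Every snippet here wraps slow operations in the timeit helper from pythonmodules.profiling, usually passing a label and a millisecond threshold (1000, 5000, 1e3) below which nothing should be reported. The helper itself is not part of these listings; the following is only a minimal sketch of a stand-in that supports both the context-manager usage above and the restart()/elapsed() timer usage in Example #6, and its logging behaviour is an assumption:

import logging
import time


class timeit:
    # Hypothetical stand-in for pythonmodules.profiling.timeit:
    # as a context manager it logs the elapsed wall-clock time of the
    # block when it exceeds `threshold_ms`; as a plain object it is a
    # simple timer with restart() and elapsed().
    def __init__(self, label=None, threshold_ms=0):
        self.label = label
        self.threshold_ms = threshold_ms
        self.restart()

    def restart(self):
        self._start = time.perf_counter()

    def elapsed(self):
        # seconds since the last restart()
        return time.perf_counter() - self._start

    def __enter__(self):
        self.restart()
        return self

    def __exit__(self, exc_type, exc, tb):
        ms = self.elapsed() * 1000
        if ms >= self.threshold_ms:
            logging.getLogger('pythonmodules.profiling').info(
                '%s took %.1f ms', self.label, ms)
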
    help='Extra where clause to pass to the select query (eg. "status=1")')
args = parser.parse_args()

logging.basicConfig()
logger = logging.getLogger()
fh = logging.FileHandler(args.log_file)
fh.setLevel(logging.INFO)
logger.addHandler(fh)

_solr = Solr(Config(section='solr')['url'])

config = Config(section='db')
conn = psycopg2.connect(config['connection_url'])
conn2 = psycopg2.connect(config['connection_url'])
cur = conn.cursor()
with timeit('SELECT', 5000):
    where_clause = ('WHERE %s' % args.where) if args.where else ''
    q = 'SELECT pid, nmlid, entity, id, score FROM %s %s ORDER BY pid ASC' % (
        args.table, where_clause)
    if args.limit:
        q += ' LIMIT %d' % int(args.limit)
    if args.start:
        q += ' OFFSET %d' % int(args.start)
    cur.execute(q)


@multithreaded(10, pre_start=True, pass_thread_id=False)
def process(row):
    try:
        res = _solr.search('id:%s' % row[0], rows=1, fl=['text', 'language'])
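
The multithreaded decorator used above comes from the same project and is not shown here; judging from its arguments it spreads calls to process over a pool of ten worker threads. A minimal sketch of such a decorator, ignoring the pre_start and pass_thread_id options, might look like this:

from concurrent.futures import ThreadPoolExecutor
from functools import wraps


def multithreaded(workers, pre_start=True, pass_thread_id=False):
    # Hypothetical sketch: turn f(item) into f(iterable_of_items) that
    # maps the items over a thread pool of `workers` threads.
    def decorator(f):
        @wraps(f)
        def wrapper(items):
            with ThreadPoolExecutor(max_workers=workers) as pool:
                return list(pool.map(f, items))
        return wrapper
    return decorator
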
Example #4
def process_pid(row):
    try:
        id_, full_pid, external_id, entity, score, meta = row
        entity = remove_double_spaces.sub(' ', entity)
        with timeit('Rater init', 1e3):
            rater = Rater(full_pid, external_id, entity)
        cur_rating = score
        meta_old = None
        if meta:
            meta_old = meta

        try:
            meta = get_meta(full_pid, external_id, entity, score, meta)
        except KeyError as e:
            logger.warning(e)
            with conn.cursor() as cur2:
                cur2.execute(
                    'UPDATE ' + args.table + ' SET status=4 WHERE id=%s',
                    [id_])
                # conn.commit()

        new_score = 0
        try:
            rating = rater.ratings()
            meta['rating_breakdown'] = {
                k: rating.scores[k].rating
                for k in rating.scores
            }
            meta['rating_multiplier'] = rating.total_multiplier
            new_score = rating.total
        except KeyError as e:
            logger.warning(e)

        toset = dict()
        if not isclose(cur_rating, new_score, abs_tol=.0001):
            toset['score'] = new_score

        if 'quality' in meta:
            del meta['quality']
        meta = json.dumps(meta)

        if meta_to_comp_meta(meta) != meta_to_comp_meta(meta_old):
            toset['meta'] = meta

        if not len(toset):
            return

        with timeit('SLOW UPDATE %s' % id_, 1e3), conn.cursor() as cur2:
            keys = ','.join([k + ' = %s' for k in toset.keys()])
            values = [toset[k] for k in toset.keys()]
            values.append(id_)
            cur2.execute(
                'UPDATE ' + args.table + ' SET ' + keys + ' WHERE id=%s',
                values)
            # conn2.commit()
    except Exception as e:
        try:
            url = 'http://do-tst-mke-01.do.viaa.be/attestation/info/model-%s/%s/%s/%s' % \
                  (model_name, full_pid, external_id, entity.replace(' ', '/'))
        except Exception as e2:
            url = str(e2)
        logger.warning('exception for %s', url)
        logger.exception(e)
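
The UPDATE statements in this example splice args.table and the column names straight into the SQL string, which is fine as long as those values are trusted. If they are not, psycopg2's sql module can compose the same statement with properly quoted identifiers; a sketch using the same toset and id_ variables as above:

from psycopg2 import sql


def build_update(table, toset):
    # UPDATE <table> SET col1 = %s, col2 = %s, ... WHERE id = %s,
    # with the table and column names quoted by psycopg2 rather than
    # concatenated into the query string.
    assignments = sql.SQL(', ').join(
        sql.SQL('{} = %s').format(sql.Identifier(col)) for col in toset)
    return sql.SQL('UPDATE {} SET {} WHERE id = %s').format(
        sql.Identifier(table), assignments)

# e.g. cur2.execute(build_update(args.table, toset), [*toset.values(), id_])
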
Example #5
    def connect(self):
        super().connect()

        with timeit('reflect db %s' % self._debug_connection_url, 1000):
            self._meta = MetaData(bind=self._db)
            self._meta.reflect()
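
Once reflect() has run, self._meta.tables maps every table name in the database to a SQLAlchemy Table object, so queries can be built without declaring models. A self-contained sketch in the same 1.x style as the example (the connection URL and table name are illustrative only):

from sqlalchemy import MetaData, create_engine, select

engine = create_engine('postgresql:///attestation')  # hypothetical URL
meta = MetaData(bind=engine)
meta.reflect()

links = meta.tables['attestation_link']  # illustrative table name
with engine.connect() as conn:
    rows = conn.execute(
        select([links.c.pid, links.c.nmlid]).where(links.c.status == 0)
    ).fetchall()
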
Example #6
    def test(self, amount=500):
        # known_tags = NER.allowed_tags
        samples = self.corpus.read_entities()

        if amount == 0:
            logger.info("Loading the entire corpus in memory, this may take a while...")
            # preload all samples to have an amount available for progress indicator
            samples = list(samples)
            amount = len(samples)

        totals = [defaultdict(lambda: 0) for i in range(len(self.taggers))]
        # stats = [defaultdict(lambda: defaultdict(lambda: 0)) for i in range(len(self.taggers))]

        total_tags = 0
        progress = tqdm(total=amount*len(self.taggers))
        timer = timeit()
        full_orig_tags = []
        full_predict_tags = [[] for i in range(len(self.taggers))]

        for sample_index, sample in enumerate(samples):
            if amount and sample_index >= amount:
                break
            sample_tags = Tester.filter_tags(sample.entities)
            orig_tags = [tag[1] for tag in sample_tags]
            ntags = len(orig_tags)
            total_tags += ntags
            full_orig_tags.extend(orig_tags)
            for n, tagger in enumerate(self.taggers):
                progress.update()
                cls = type(tagger).__name__
                timer.restart()
                tags = Tester.filter_tags(tagger.tag(sample.phrase))
                elapsed = timer.elapsed()

                # naive attempt to reconcile mismatched tag counts
                if len(tags) != ntags:
                    tags = Tester.fix_tags_counts(tags, sample_tags)
                    if len(tags) != ntags:
                        logger.error("Samples are of different size for %s (%d vs %d): \nTAGGER:   %s\nORIGINAL: %s",
                                     cls, len(tags), ntags, tags, sample_tags)
                        continue

                tag_types = [tag[1] for tag in tags]
                zipped = list(zip(tag_types, orig_tags))
                sames = sum([tag[0] == tag[1] for tag in zipped])

                # for tags in zipped:
                #     if tags[0] in known_tags:
                #         if tags[0] == tags[1]:
                #             stats[n][tags[0]]['tp'] += 1
                #         else:
                #             stats[n][tags[0]]['fp'] += 1
                #     elif tags[1] in known_tags:
                #         stats[n][tags[1]]['fn'] += 1
                #
                #     for tag in known_tags:
                #         if tag != tags[1]:
                #             stats[n][tag]['tn'] += 1

                totals[n]['same'] += sames
                totals[n]['time'] += elapsed
                full_predict_tags[n].extend(tag_types)

        Stats = namedtuple('Stats', ['accuracy', 'time', 'total_checked', 'confusion_matrix'])
        return [Stats(
                    t['same'] / total_tags * 100,
                    t['time'],
                    # stats[i],
                    total_tags,
                    ConfusionMatrix(actual_vector=full_orig_tags, predict_vector=full_predict_tags[i])
                ) for i, t in enumerate(totals)]
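
The ConfusionMatrix built from full_orig_tags and the per-tagger predictions looks like pycm's; a short sketch of how a caller could report the returned Stats per tagger (the tester variable and the sample count are assumptions):

# hypothetical caller
results = tester.test(amount=500)
for tagger, stats in zip(tester.taggers, results):
    print('%s: %.2f%% of %d tags correct (time: %.3f)' % (
        type(tagger).__name__, stats.accuracy, stats.total_checked, stats.time))
    stats.confusion_matrix.print_matrix()  # per-tag breakdown (pycm)
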
Example #7
    def clear(self):
        with timeit('Clearing model %s' % self.link._meta.db_table):
            # this doesn't reset identity and is slower...
            # self.link.objects.all().delete()
            self.db.execute('TRUNCATE TABLE %s RESTART IDENTITY' %
                            self.link._meta.db_table)
    'Migrate statuses between attestation_link tables (doesn\'t overwrite if a status is already set)'
)
parser.add_argument('from_table', help='Origin table')
parser.add_argument('to_table', help='Target table')
parser.add_argument('--debug',
                    action='store_true',
                    default=False,
                    help='Show debug log messages')
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

config = Config(section='db')
conn = psycopg2.connect(config['connection_url'])
cur = conn.cursor()
cur2 = conn.cursor()
with timeit('SELECT', 5000):
    cur.execute(
        'select status, kind, extras, pid, nmlid from %s where status != 0 AND status != 4'
        % args.from_table)

processed = 0
total = cur.rowcount
q = 'UPDATE ' + args.to_table + ' SET status=%s, kind=%s, extras=%s WHERE pid=%s AND nmlid=%s AND status=0'
for idx, row in enumerate(tqdm(cur, total=total)):
    res = cur2.execute(q, row)
    processed += cur2.rowcount
    logger.debug('%d changed for: %s', cur2.rowcount, row)
    if idx % 10 == 0:
        with timeit('commit', 250):
            conn.commit()
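
The loop above sends one UPDATE per source row and commits every ten rows, which keeps the per-row rowcount logging but costs a round trip per statement. If that logging is not needed, psycopg2's execute_batch could push the same updates through in larger batches, roughly:

from psycopg2.extras import execute_batch

# cur still holds the SELECT result; each row supplies the parameters for q
execute_batch(cur2, q, cur, page_size=100)
conn.commit()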