コード例 #1
0
def main(args):
    """Import JSON-lines documents from stdin into the selected project.

    Each non-blank stdin line is parsed as a JSON object. The loop reads its
    ``text`` and ``labels`` keys plus an optional ``meta`` key (defaulting to
    an empty dict). Every document is added with ``priority=1000`` and one
    shared ``updated`` timestamp taken once before the loop.

    When ``args.nodups`` is truthy, a document is skipped if its text is
    among ``client.get_doc_texts()`` or was already added earlier in this
    same run.

    Args:
        args: Parsed CLI namespace; ``project_id`` and ``nodups`` are read.
    """
    init_django()

    client = Client(get_project_by_id(id=args.project_id))

    # Blank lines (e.g. a trailing newline) are not valid JSON; skip them
    # rather than crash inside json.loads.
    docs = [json.loads(line) for line in sys.stdin if line.strip()]

    # None means duplicate checking is disabled.
    seen_texts = set(client.get_doc_texts()) if args.nodups else None

    now = datetime.datetime.utcnow()

    added = 0
    for doc in tqdm(docs):
        text = doc['text']
        if seen_texts is not None and text in seen_texts:
            # skip the document, it was already added
            continue
        client.add_doc(text,
                       doc.get('meta', {}),
                       doc['labels'],
                       priority=1000,
                       updated=now)
        if seen_texts is not None:
            # Remember the text so a repeat later in this batch is skipped.
            seen_texts.add(text)
        added += 1
    print("Added:", added)
コード例 #2
0
ファイル: dump.py プロジェクト: buriy/active_ner
def main(args):
    """Print each document from ``client.get_approved_docs()`` as a JSON line.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Args:
        args: Parsed CLI namespace; ``project_id`` is read.
    """
    init_django()

    client = Client(get_project_by_id(id=args.project_id))
    for approved in client.get_approved_docs():
        line = json.dumps(approved, ensure_ascii=False)
        print(line)
コード例 #3
0
ファイル: learn.py プロジェクト: entn-at/active_ner
class Learner:
    """Train on the project's documents and queue scored texts via a Client."""

    def __init__(self, project):
        """Create the ``Client`` for *project* used by all later calls."""
        self.client = Client(project)

    def run(self, texts, max_add=None):
        """Train a model, score *texts*, and hand the ranking to the client.

        The predictions are ordered by their ``'unsure'`` value, highest
        first. ``del_unapproved(max_add)`` is then called, followed by
        ``add_docs`` with the ranked predictions.

        Args:
            texts: Inputs forwarded to ``get_predictions``.
            max_add: Forwarded to both ``del_unapproved`` and ``add_docs``.
        """
        client = self.client
        label_set = client.get_labels()
        training_docs = client.get_docs()
        print("Docs to train:", len(training_docs))

        model = train_model(label_set, training_docs)

        # Most uncertain predictions come first.
        ranked = list(get_predictions(model, texts))
        ranked.sort(key=lambda item: item['unsure'], reverse=True)

        client.del_unapproved(max_add)
        client.add_docs(ranked, max_add=max_add)
コード例 #4
0
ファイル: update.py プロジェクト: buriy/active_ner
def main(args):
    """Retrain, re-score unapproved documents, and write the results back.

    Each pass trains a model, fetches up to ``args.max_update`` unapproved
    documents, predicts on them, and updates each one with its predicted
    labels, a priority, and an ``updated`` timestamp. Without ``args.watch``
    the function returns after one pass. Otherwise it polls the unapproved
    document count once per second and starts a new pass when it changes.

    Args:
        args: Parsed CLI namespace; ``project_id``, ``max_update`` and
            ``watch`` are read.
    """
    init_django()
    project = get_project_by_id(id=args.project_id)
    while True:
        client = Client(project)
        model = train(client)

        pending = client.get_unapproved_docs(with_anno=False,
                                             limit=args.max_update)
        predictions = sorted(get_predictions(model, pending),
                             key=lambda pred: pred['unsure'],
                             reverse=True)

        now = datetime.datetime.utcnow()

        for pred in tqdm(predictions, desc="Updating DB"):
            doc = pred['document']
            # Higher 'unsure' gives a smaller priority number; the original
            # note states that 0 is the most urgent value.
            priority = int(1000 - pred['unsure'] * 1000)
            # The timestamp is shifted into the past by `priority` seconds.
            updated = now - datetime.timedelta(seconds=priority)
            doc_id = doc['id']
            status = client.update_doc(id=doc_id,
                                       labels=pred['labels'],
                                       priority=priority,
                                       updated=updated)
            if status:
                continue
            print("Document", doc_id,
                  "was already marked as annotated. Skipping.")

        if not args.watch:
            return

        # Poll once per second until the unapproved count differs from the
        # value observed right after this pass.
        baseline = client.get_unapproved_doc_count()
        time.sleep(1)
        current = client.get_unapproved_doc_count()
        while current == baseline:
            time.sleep(1)
            current = client.get_unapproved_doc_count()
        print("Unapproved documents count changed, now we have:",
              current)
コード例 #5
0
ファイル: learn.py プロジェクト: entn-at/active_ner
 def __init__(self, project):
     """Create a ``Client`` for *project* and store it as ``self.client``."""
     self.client = Client(project)
コード例 #6
0
ファイル: fix_unapproved.py プロジェクト: buriy/active_ner
def main(args):
    """Call ``fix_unapproved()`` on a client for the selected project.

    Args:
        args: Parsed CLI namespace; ``project_id`` is read.
    """
    init_django()

    project = get_project_by_id(id=args.project_id)
    Client(project).fix_unapproved()
コード例 #7
0
ファイル: del.py プロジェクト: buriy/active_ner
def main(args):
    """Call ``del_unapproved(args.delete_count)`` and print what it returns.

    Args:
        args: Parsed CLI namespace; ``project_id`` and ``delete_count``
            are read.
    """
    init_django()
    project = get_project_by_id(id=args.project_id)
    client = Client(project)
    print("Deleted:", client.del_unapproved(args.delete_count))