def post_mturk_upload():
    """Store an uploaded text as a new document and return it as JSON.

    The JSON request body must contain 'text' and 'doc_type'. An optional
    'turker_id' is saved on the current user (``g.user``); an optional
    'source_url' is used as the document source instead of 'mturk'. The text
    is split into sentences with NLTK and one ``Sent`` is saved per sentence.

    Returns:
        A JSON string holding the document id, seq, title, creation time
        (ISO format) and the dumped sentences.
    """
    payload = request.get_json()
    text = payload['text']
    doc_type = payload['doc_type']

    if 'turker_id' in payload:
        g.user.turker_id = payload['turker_id']
        g.user.save()

    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(text)

    doc = Doc(
        title='',
        text=text,
        source=payload.get('source_url', 'mturk'),
        type=doc_type,
    )
    doc.save()

    response = {
        'doc_id': str(doc.id),
        'sents': [],
        'seq': doc.seq,
        'title': doc.title,
        'created_at': doc.created_at.isoformat(),
    }
    # Sentence indices follow the tokenizer's output order, starting at 0.
    for position, sentence_text in enumerate(sentences):
        saved = Sent(index=position, text=sentence_text, doc=doc).save()
        response['sents'].append(saved.dump())

    return json.dumps(response)
def insert_doc(title, text, source):
    """Create a 'v2' document and its sentences from pre-numbered text.

    Nothing is inserted when a document with the same title already exists.
    Otherwise the document is saved with ``seq`` set to the current document
    count plus one, and every line of ``text`` that carries a ``(SentN)``
    marker (N has 1-4 digits) is saved as a ``Sent`` whose index is N and
    whose text is the line with that marker removed.

    Args:
        title: Document title; also used for the duplicate check.
        text: Newline-separated sentences, each containing a ``(SentN)`` marker.
        source: Value stored in the document's ``source`` field.

    Returns:
        None.
    """
    import re

    try:
        Doc.objects.get(title=title)
    except Doc.DoesNotExist:
        pass
    else:
        print('already exist -> pass')
        return

    doc = Doc(title=title, text=text, source=source, type='v2')
    doc.seq = Doc.objects.count() + 1
    doc.save()

    # The capture group yields the numeric index directly.
    marker_re = re.compile(r'\(Sent(\d{1,4})\)')
    for line in text.split('\n'):
        # Blank and whitespace-only lines carry no sentence.
        if not line.strip():
            continue
        match = marker_re.search(line)
        if match is None:
            # Previously this raised IndexError after the Doc had already
            # been saved; report the line and keep importing the rest.
            print('no sentence marker -> skip: {!r}'.format(line))
            continue
        sent_text = line.replace(match.group(0), '').strip()
        Sent(index=int(match.group(1)), text=sent_text, doc=doc).save()
def submit(request):
    """Save a new ``Doc`` from the POSTed form and redirect to '/'.

    Reads 'subject', 'content' and 'writer' from ``request.POST``; the
    record is stored with flag "none" and the current time as ``reg_date``.
    """
    form = request.POST
    Doc(
        dsubject=form['subject'],
        contents=form['content'],
        writer=form['writer'],
        flag="none",
        reg_date=timezone.now(),
    ).save()
    return HttpResponseRedirect('/')
def testDocumentModel(self):
    """Saving an invalid ``Doc`` must raise the expected error and message."""
    # (expected exception, expected message, Doc constructor kwargs)
    cases = (
        (
            ValidationError,
            '(Document:None) (Field is required: [\'name\'])',
            {'text': SAMPLE_TEXT},
        ),
        (
            ValidationError,
            '(Document:None) (StringField only accepts string values: [\'name\'])',
            {'name': 1},
        ),
        (
            InvalidId,
            '\'piec\' is not a valid ObjectId, it must be a 12-byte input of type \'str\' or a 24-character hex string',
            {'id': 'piec', 'name': 'DOC_NAME'},
        ),
    )
    for expected_error, expected_message, fields in cases:
        with self.assertRaisesMessage(expected_error, expected_message):
            Doc(**fields).save()
def generate_encrypted_file(seq_id):
    """Dump one document to JSON, XOR it with the configured key and write it.

    The document with ``seq == seq_id`` and its sentences (ordered by index)
    are serialised to JSON. Each character is XOR-ed with the repeating
    ``config.Config.ENCRYPTION_KEY`` and the resulting integers are written,
    comma-separated, to ``../data/encrypted/#<seq_id>_<title>`` relative to
    this module. Returns None silently when the lookup raises.
    """
    from itertools import cycle

    def str_xor(plain, key):
        # XOR character codes pairwise, repeating the key as needed, and
        # render the results as comma-separated decimal numbers.
        return ",".join(
            str(ord(p) ^ ord(k)) for p, k in zip(plain, cycle(key)))

    try:
        doc = Doc.objects().get(seq=seq_id)
        sents = Sent.objects(doc=doc).order_by('index')
    except Exception:
        return

    payload = json.dumps({
        'doc_id': str(doc.id),
        'title': doc.title,
        'seq': doc.seq,
        'sents': [sent.dump() for sent in sents],
    })
    encrypted = str_xor(payload, config.Config.ENCRYPTION_KEY)

    relative_target = '/../data/encrypted/#{}_{}'.format(seq_id, doc.title)
    file_path = os.path.abspath(os.path.dirname(__file__) + relative_target)
    with open(file_path, 'w') as out:
        out.write(encrypted)
def index_v2_page(doc_type):
    """Render the paginated document index for one document type.

    The page number comes from the ``p`` query parameter (default 1). Each
    listed document is dumped and extended with 'sent_total' (its sentence
    count) and 'progress' (the number of 'sentence' annotations the current
    user has made on it).

    Args:
        doc_type: Value of the ``type`` field used to filter documents.

    Returns:
        The rendered 'index.html' template.
    """
    item_per_page = 50
    # type=int makes a missing or non-numeric ``p`` fall back to the default
    # instead of raising ValueError.
    page = request.args.get('p', 1, type=int)

    total = Doc.objects.filter(type=doc_type).count()
    # Integer ceiling division: same result as math.ceil under true
    # division, without depending on how ``/`` behaves for ints.
    total_page = (total + item_per_page - 1) // item_per_page

    # Use the same page size that total_page was computed from.
    paginator = Pagination(
        Doc.objects(type=doc_type).order_by('seq'), page, item_per_page)

    docs_data = []
    for doc in paginator.items:
        item = doc.dump()
        item['sent_total'] = Sent.objects(doc=doc).count()
        item['progress'] = Annotation.objects(
            doc=doc, user=g.user, type='sentence').count()
        docs_data.append(item)

    pagination = {
        'page': page,
        'total_page': total_page,
        # Window of page links: up to five either side of the current page.
        'left': max(1, page - 5),
        'right': min(page + 5, total_page),
    }
    return render_template(
        'index.html', type=doc_type, docs=docs_data, g=g,
        pagination=pagination)
def testRemoveText(self):
    """Check remove_text against a sample document and a two-character one.

    After removing 'Uzytkownik' at position 0 the stored ``last_change``
    must be later than the original and the text must start with a space.
    Removing 'o' then 'k' from "ok" must leave an empty text.
    """
    original = Doc(
        name=DOC_NAME,
        last_change=datetime.datetime.now(),
        text=SAMPLE_TEXT,
    ).save()
    creation_date = original.last_change
    remove_text(original, 'Uzytkownik', 0)

    reloaded = Doc.objects(name=DOC_NAME)[0]
    logger.debug(
        'TestDBManager::testRemoveText compare ' + str(creation_date)
        + ' and ' + str(reloaded.last_change))
    self.assertTrue(creation_date < reloaded.last_change)
    self.assertTrue(reloaded.text[0] == ' ')

    # Start over with a document that can be emptied completely.
    Doc.objects.delete()
    short_doc = Doc(name=DOC_NAME, text="ok").save()
    for character in ('o', 'k'):
        remove_text(short_doc, character, 0)
    self.assertTrue(Doc.objects(name=DOC_NAME)[0].text == '')
def testPolishCharacters(self):
    """remove_text must handle a word containing a non-ASCII character.

    After removing u'Użytkownik' at position 0 the stored text must start
    with a space.
    """
    saved = Doc(
        name=DOC_NAME,
        last_change=datetime.datetime.now(),
        text=SAMPLE_TEXT,
    ).save()
    remove_text(saved, u'Użytkownik', 0)

    reloaded = Doc.objects(name=DOC_NAME)[0]
    self.assertTrue(reloaded.text[0] == ' ')
def testGetDocumentOr404(self):
    """get_document_or_404 returns a stored document and raises Http404 otherwise."""
    Doc(name=DOC_NAME, text=SAMPLE_TEXT).save()

    found = get_document_or_404(Doc, name=DOC_NAME)
    self.assertTrue(found.name == DOC_NAME)

    expected_message = 'No Document matches the given query.'
    with self.assertRaisesMessage(Http404, expected_message):
        get_document_or_404(Doc, name='not_exisitng')
def testInsertText(self):
    """Check insert_text against a sample document and an empty one.

    After inserting 'A' at position 3 the stored ``last_change`` must be
    later than the original and character 3 must be 'A'. Inserting 'k' then
    'o' at position 0 of an empty document must give "ok".
    """
    original = Doc(
        name=DOC_NAME,
        last_change=datetime.datetime.now(),
        text=SAMPLE_TEXT,
    ).save()
    creation_date = original.last_change
    insert_text(original, 'A', 3)

    reloaded = Doc.objects(name=DOC_NAME)[0]
    logger.debug(
        'TestDBManager::testInsertText compare ' + str(creation_date)
        + ' and ' + str(reloaded.last_change))
    self.assertTrue(creation_date < reloaded.last_change)
    self.assertTrue(reloaded.text[3] == 'A')

    # Start over with an empty document and build its text by prepending.
    Doc.objects.delete()
    empty_doc = Doc(name=DOC_NAME, text="").save()
    for character in ('k', 'o'):
        insert_text(empty_doc, character, 0)
    self.assertTrue(Doc.objects(name=DOC_NAME)[0].text == 'ok')
def testHandleList(self):
    """handle_list must report one entry in message['files'] per stored document."""
    message = {}
    request = MockRequest()

    def listed_files():
        # Refresh the message in place and return the reported file list.
        handle_list(message, request)
        return message['files']

    self.assertEqual(len(listed_files()), 0)

    Doc(name=DOC_NAME, last_change=datetime.datetime.now(),
        text=SAMPLE_TEXT).save()
    files = listed_files()
    self.assertEqual(len(files), 1)
    self.assertTrue(files[0]['name'] == DOC_NAME)

    Doc(name=DOC_NAME + '1', last_change=datetime.datetime.now(),
        text=SAMPLE_TEXT).save()
    self.assertEqual(len(listed_files()), 2)
def doc_migration():
    """Set ``type`` on every document: 'mturk' when ``doc.mturk`` is truthy, else 'v1'."""
    for doc in tqdm(Doc.objects().all()):
        doc.type = 'mturk' if doc.mturk else 'v1'
        doc.save()
def delete_doc(doc_id):
    """Delete a document together with its annotations and sentences.

    Args:
        doc_id: Id of the ``Doc`` to remove; ``Doc.objects().get`` raises if
            no such document exists.
    """
    doc = Doc.objects().get(id=doc_id)

    # The original looped over Sent.objects(doc=doc) a second time here, so
    # annotations were never removed. They are deleted first because they
    # reference sentences (Annotation is built with a ``sent`` field).
    for annotation in Annotation.objects(doc=doc):
        annotation.delete()

    for sent in Sent.objects(doc=doc):
        sent.delete()

    doc.delete()
def testHandleMsg(self):
    """Exercise handle_msg with an insert, a remove and an unknown document id.

    An 'i' message with text 'ala' at position 0 must make the stored text
    start with 'ala'. An 'r' message removing u'Użytkownik' at position 0
    must leave a text starting with a space. An id that matches no document
    must raise Http404.
    """
    doc = Doc(name=DOC_NAME, last_change=datetime.datetime.now(),
              text=SAMPLE_TEXT).save()
    handle_msg({'type': 'i', 'pos': 0, 'text': 'ala'}, doc['id'])
    doc = Doc.objects(name=DOC_NAME)[0]
    self.assertTrue(doc.text[:3] == 'ala')

    Doc.objects.delete()
    doc = Doc(name=DOC_NAME, last_change=datetime.datetime.now(),
              text=SAMPLE_TEXT).save()
    handle_msg({'type': 'r', 'pos': 0, 'text': u'Użytkownik'}, doc['id'])
    doc = Doc.objects(name=DOC_NAME)[0]
    # The parenthesised single-argument form is valid both as a Python 2
    # print statement and as a Python 3 print() call; the bare statement
    # form was a SyntaxError on Python 3.
    print(doc.text[0])
    self.assertTrue(doc.text[0] == ' ')

    with self.assertRaisesMessage(Http404,
                                  'No Document matches the given query.'):
        handle_msg({}, 'not_exisitng')
def target_migration():
    """Replace '<<TARGET>>' with '(TARGET)' in the text of every Doc and Sent."""

    def rewrite_all(records):
        # Rewrite and save each record's text, showing tqdm progress.
        for record in tqdm(records):
            record.text = record.text.replace('<<TARGET>>', '(TARGET)')
            record.save()

    rewrite_all(Doc.objects().all())
    rewrite_all(Sent.objects())
def post_annotation():
    """Create or update the current user's annotation on one sentence.

    The JSON request body must contain 'doc' (document id), 'target_text',
    'index' (sentence index), 'anchor_offset', 'focus_offset', 'type' and
    'basket'. An existing annotation is reused when one matches; otherwise a
    new one is created. The annotation also records the full sentence text
    and the client's remote address.

    Returns:
        A JSON string of the form ``{"annotation": <dumped annotation>}``.
    """
    data = request.get_json()
    doc_id = data['doc']
    target_text = data['target_text']
    index = data['index']
    anchor_offset = data['anchor_offset']
    focus_offset = data['focus_offset']
    annotation_type = data['type']  # avoid shadowing the ``type`` builtin
    basket = data['basket']

    doc = Doc.objects().get(id=doc_id)
    # Fetched once; the original issued this identical query twice.
    sent = Sent.objects().get(doc=doc, index=index)
    user = g.user

    # 'sentence' annotations are matched per (doc, sent, index, user, type);
    # every other type is additionally matched by its anchor offset.
    lookup = {
        'doc': doc,
        'sent': sent,
        'index': index,
        'user': user,
        'type': annotation_type,
    }
    if annotation_type != 'sentence':
        lookup['anchor_offset'] = anchor_offset

    # first() returns None when nothing matches, replacing count() + [0].
    annotation = Annotation.objects.filter(**lookup).first()
    if annotation is None:
        annotation = Annotation(doc=doc, sent=sent, user=user, index=index,
                                type=annotation_type,
                                anchor_offset=anchor_offset)

    annotation.anchor_offset = anchor_offset
    annotation.focus_offset = focus_offset
    annotation.entire_text = sent.text
    annotation.target_text = target_text
    annotation.basket = basket
    annotation.ip = request.remote_addr
    annotation.save()

    return json.dumps({
        'annotation': annotation.dump(),
    })
def get_annotation(doc_id):
    """Return the current user's annotations on a document as JSON.

    Responds with a 404 'not found' ``Response`` when looking up the
    document (or building the annotation query) raises.
    """
    try:
        doc = Doc.objects().get(id=doc_id)
        annotations = Annotation.objects(doc=doc, user=g.user)
    except Exception:
        return Response('not found', status=404)

    dumped = [annotation.dump() for annotation in annotations]
    return json.dumps({
        'annotations': dumped,
    })
def testLoginWindow(self):
    """Clicking through the GDrive integration must open a Google sign-in window.

    The second browser window's title must equal either the Polish or the
    English Google sign-in title.
    """
    Doc(name=DOC_NAME, text=LOREM_IPSUM).save()
    browser = self.driver

    time.sleep(5)
    browser.get(self.base_url + "/")
    time.sleep(5)
    browser.find_element_by_css_selector("td").click()
    time.sleep(5)

    for element_id in ("gDriveIntegration", "authorizeGDriveLink"):
        browser.find_element_by_id(element_id).click()
    time.sleep(1)

    browser.switch_to_window(browser.window_handles[1])
    accepted_titles = (
        u"Logowanie – Konta Google",
        u'Sign in - Google Accounts',
    )
    self.assertTrue(
        any(title == browser.title for title in accepted_titles))
def testNewDocument(self):
    """Creating a document through the UI must store it with the empty-document text."""
    Doc(name=DOC_NAME + '1', text=LOREM_IPSUM).save()
    browser = self.driver

    time.sleep(1)
    browser.get(self.base_url)
    time.sleep(1)

    # Click the table cell, then the "new document" control, pausing after each.
    clicks = (
        (browser.find_element_by_css_selector, "td"),
        (browser.find_element_by_id, "newDocument"),
    )
    for find, locator in clicks:
        find(locator).click()
        time.sleep(1)

    browser.find_element_by_id("documentName").clear()
    browser.find_element_by_id("documentName").send_keys(DOC_NAME)
    browser.find_element_by_id("saveDocumentButton").click()

    stored = Doc.objects(name=DOC_NAME)[0]
    self.assertTrue(stored['text'] == EMPTY_DOC_STRING)
def testSaveAs(self):
    """Saving the opened document under a new name must keep its text."""
    Doc(name=DOC_NAME + '1', text=LOREM_IPSUM).save()
    logger.debug('TestUI::testSaveAs documents: ' + str(Doc.objects()))
    browser = self.driver

    time.sleep(1)
    browser.get(self.base_url)
    time.sleep(1)

    # Click the table cell, then the "save document" control, pausing after each.
    clicks = (
        (browser.find_element_by_css_selector, "td"),
        (browser.find_element_by_id, "saveDocument"),
    )
    for find, locator in clicks:
        find(locator).click()
        time.sleep(1)

    browser.find_element_by_id("documentName").clear()
    browser.find_element_by_id("documentName").send_keys(DOC_NAME)
    browser.find_element_by_id("saveDocumentButton").click()

    stored = Doc.objects(name=DOC_NAME)[0]
    self.assertTrue(stored['text'] == LOREM_IPSUM)
def testRead(self):
    """Opening the only stored document must show "Lorem ipsum." in the editor.

    A mismatch is recorded in ``self.verificationErrors`` instead of failing
    the test immediately.
    """
    Doc.objects.delete()
    Doc(name=DOC_NAME, text=LOREM_IPSUM).save()
    browser = self.driver

    time.sleep(1)
    browser.get(self.base_url)
    time.sleep(1)
    browser.find_element_by_css_selector("td").click()
    time.sleep(1)

    browser.switch_to_frame("editorContent")
    time.sleep(5)
    editor_body = browser.find_element_by_css_selector("#editorBody")

    try:
        self.assertEqual("Lorem ipsum.", editor_body.text)
    except AssertionError as mismatch:
        self.verificationErrors.append(str(mismatch))
def duplicate_doc(from_type='v2', to_type='v3'):
    """Copy every document of ``from_type``, and its sentences, as ``to_type``.

    In each copy's title 'TARGET_ONLY' is replaced by ``to_type`` and ``seq``
    is set to the document count at that moment plus one.
    """
    for source_doc in tqdm(Doc.objects(type=from_type).all()):
        clone = Doc(
            title=source_doc.title.replace('TARGET_ONLY', to_type),
            text=source_doc.text,
            source=source_doc.source,
            type=to_type,
        )
        clone.seq = Doc.objects.count() + 1
        clone.save()

        for original_sent in Sent.objects(doc=source_doc).all():
            Sent(
                index=original_sent.index,
                text=original_sent.text,
                doc=clone,
            ).save()
def generate_encrypted_files():
    """Call generate_encrypted_file for every document of type 'v1', 'v2' or 'v3'."""
    exported_types = ('v1', 'v2', 'v3')
    for doc in tqdm(Doc.objects().all()):
        if doc.type in exported_types:
            generate_encrypted_file(seq_id=doc.seq)
def addinfo(request):
    """List documents on GET; validate and publish a new document on POST.

    GET renders 'admin/addinfo.html' with all documents under 'lists'.
    POST reads 'title', 'summary', 'source', 'author', 'calendar' and
    'content'. When title, source, author or content is missing or empty,
    'admin/failure.html' is rendered and nothing is saved. Otherwise the
    document is saved and 'admin/success.html' is rendered. An empty summary
    defaults to the first 30 characters of the content.

    Returns:
        The rendered response, or None for any other HTTP method.
    """
    if request.method == 'GET':
        docs = Doc.objects.all()
        return render_to_response('admin/addinfo.html', {'lists': docs})

    if request.method == 'POST':
        form = request.POST
        title = form.get('title')
        summary = form.get('summary')
        source = form.get('source')
        author = form.get('author')
        published = form.get('calendar')
        content = form.get('content')

        # User-facing messages: "published successfully" / "publishing
        # failed, please complete all fields and publish again".
        success = '成功发布!'
        failure = '发布不成功,请确认信息填充完整后重新发布!'

        # Validate before touching the values: a missing field arrives as
        # None, which the original sliced before checking it.
        if not (title and source and author and content):
            return render_to_response('admin/failure.html',
                                      {'failure': failure})

        doc = Doc()
        doc.title = title
        doc.source = source
        doc.time = published
        doc.author = author
        doc.content = content
        # Decide on the submitted summary; the original tested the new
        # model's own ``summary`` attribute instead.
        doc.summary = summary if summary else content[0:30]
        doc.save()
        return render_to_response('admin/success.html', {'success': success})
def delete_doc_type(doc_type='v3'):
    """Call delete_doc for every document whose ``type`` equals ``doc_type``."""
    matching_docs = Doc.objects(type=doc_type).all()
    for matching in tqdm(matching_docs):
        delete_doc(matching.id)