def insert_doc(title, text, source):
    """Insert a 'v2' document together with its pre-indexed sentences.

    ``text`` holds one sentence per line. Every non-blank line must contain a
    ``(SentN)`` marker, where N is 1-4 digits; the marker is removed from the
    stored sentence text and N becomes the sentence index.

    If a document with the same title already exists, a message is printed
    and nothing is inserted.

    Args:
        title: Title used for the duplicate lookup and stored on the document.
        text: Newline-separated sentences, each carrying a ``(SentN)`` marker.
        source: Stored as the document's ``source``.

    Raises:
        IndexError: If a non-blank line has no ``(SentN)`` marker. All lines
            are validated before anything is saved, so a malformed input
            leaves no partially inserted document behind.
    """
    import re

    try:
        Doc.objects.get(title=title)
    except Doc.DoesNotExist:
        pass
    else:
        print('already exist -> pass')
        return

    # Parse every line up front so that bad input fails before any write.
    marker_re = re.compile(r'\(Sent(\d{1,4})\)')
    parsed = []
    for line_no, line in enumerate(text.split('\n'), start=1):
        # Skip blank lines, including whitespace-only ones such as a stray
        # '\r' left over from CRLF input.
        if not line.strip():
            continue
        match = marker_re.search(line)
        if match is None:
            # IndexError is kept on purpose: it is what the unguarded
            # ``findall(...)[0]`` lookup used to raise for such a line.
            raise IndexError(
                'line %d has no (SentN) marker: %r' % (line_no, line)
            )
        sentence = line.replace(match.group(0), '').strip()
        parsed.append((int(match.group(1)), sentence))

    doc = Doc(title=title, text=text, source=source, type='v2')
    # NOTE(review): count() + 1 is not atomic; if this can run concurrently,
    # two documents may receive the same seq -- confirm against callers.
    doc.seq = Doc.objects.count() + 1
    doc.save()

    for index, sentence in parsed:
        Sent(index=index, text=sentence, doc=doc).save()
def post_mturk_upload():
    """Store an uploaded text as a Doc plus one Sent per tokenized sentence.

    Reads the JSON request body. ``text`` and ``doc_type`` are required.
    When ``turker_id`` is present it is stored on ``g.user`` and the user is
    saved; when ``source_url`` is present it replaces the default ``'mturk'``
    source of the new document.

    Returns:
        A JSON string with the keys ``doc_id``, ``sents``, ``seq``, ``title``
        and ``created_at``; ``sents`` holds each saved sentence's ``dump()``
        in tokenization order.
    """
    payload = request.get_json()
    body_text = payload['text']
    kind = payload['doc_type']

    if 'turker_id' in payload:
        g.user.turker_id = payload['turker_id']
        g.user.save()

    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(body_text)

    doc = Doc(title='', text=body_text, source='mturk', type=kind)
    if 'source_url' in payload:
        doc.source = payload['source_url']
    doc.save()

    # The response skeleton is built before the sentences are saved, so the
    # document's fields are read first; ``dumped`` is filled in afterwards.
    dumped = []
    response = dict(
        doc_id=str(doc.id),
        sents=dumped,
        seq=doc.seq,
        title=doc.title,
        created_at=doc.created_at.isoformat(),
    )

    for position, sentence in enumerate(sentences):
        saved = Sent(index=position, text=sentence, doc=doc).save()
        dumped.append(saved.dump())

    return json.dumps(response)
def duplicate_doc(from_type='v2', to_type='v3'):
    """Copy every document of ``from_type``, with its sentences, as ``to_type``.

    Each copy keeps the original text and source. Its title is the original
    title with every ``'TARGET_ONLY'`` substring replaced by ``to_type``, and
    its ``seq`` is the document count at copy time plus one.

    Args:
        from_type: ``type`` value of the documents to copy.
        to_type: ``type`` value given to the copies.
    """
    for original in tqdm(Doc.objects(type=from_type).all()):
        clone = Doc(
            title=original.title.replace('TARGET_ONLY', to_type),
            text=original.text,
            source=original.source,
            type=to_type,
        )
        clone.seq = Doc.objects.count() + 1
        clone.save()

        # Re-create each sentence of the original, pointing at the clone.
        for sentence in Sent.objects(doc=original).all():
            Sent(index=sentence.index, text=sentence.text, doc=clone).save()