Example #1
0
def insert_doc(title, text, source):
    """Create a ``v2`` ``Doc`` from pre-numbered text and store its sentences.

    ``text`` is split on newlines. Each non-blank line is expected to contain
    a marker of the form ``(Sent<N>)`` (1 to 4 digits). ``N`` becomes the
    ``Sent.index`` and the marker is removed from the stored sentence text.

    Nothing is written when a ``Doc`` with the same ``title`` already exists.
    Whitespace-only lines (for example the stray carriage returns left by
    Windows line endings) and lines without a marker are skipped rather than
    aborting the import after the ``Doc`` has already been saved.

    Args:
        title: Title of the document; also used for the duplicate check.
        text: Full document text, one marked sentence per line.
        source: Value stored in the ``source`` field of the ``Doc``.

    Returns:
        None.
    """
    import re

    # Keep the try body to the single lookup that can raise DoesNotExist.
    try:
        Doc.objects.get(title=title)
    except Doc.DoesNotExist:
        pass
    else:
        print('already exist -> pass')
        return

    doc = Doc(title=title, text=text, source=source, type='v2')
    # NOTE(review): count() + 1 is not atomic; two concurrent inserts could
    # presumably end up with the same seq -- confirm whether that matters.
    doc.seq = Doc.objects.count() + 1
    doc.save()

    # The capture group yields the sentence number without string surgery.
    marker = re.compile(r'\(Sent(\d{1,4})\)')

    for line in text.split('\n'):
        if not line.strip():
            continue

        found = marker.search(line)
        if found is None:
            # Previously this raised IndexError half-way through the import.
            print('no sentence marker -> skip: %r' % line)
            continue

        index = int(found.group(1))
        # Remove every occurrence of the first marker found on the line.
        sent_text = line.replace(found.group(0), '').strip()

        Sent(index=index, text=sent_text, doc=doc).save()
Example #2
0
def post_mturk_upload():
    """Store an uploaded text as a ``Doc`` plus one ``Sent`` per sentence.

    Reads the JSON body of the current request (``request`` and ``g`` are
    presumably the Flask globals -- confirm against the file's imports).
    Required keys are ``text`` and ``doc_type``. When ``turker_id`` is
    present it is saved on ``g.user``; when ``source_url`` is present it
    replaces the default ``'mturk'`` source of the new document.

    Returns:
        A JSON string with the keys ``doc_id``, ``sents``, ``seq``,
        ``title`` and ``created_at``, where ``sents`` holds the ``dump()``
        of every saved sentence in order.
    """
    payload = request.get_json()
    body = payload['text']
    kind = payload['doc_type']

    # Remember the worker id on the current user when the client sends one.
    if 'turker_id' in payload:
        g.user.turker_id = payload['turker_id']
        g.user.save()

    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(body)

    doc = Doc(title='', text=body, source='mturk', type=kind)
    if 'source_url' in payload:
        doc.source = payload['source_url']
    doc.save()

    response = dict(
        doc_id=str(doc.id),
        sents=[],
        seq=doc.seq,
        title=doc.title,
        created_at=doc.created_at.isoformat(),
    )
    # Sentence indices are zero-based positions in the tokenizer output.
    for position, sentence in enumerate(sentences):
        saved = Sent(index=position, text=sentence, doc=doc).save()
        response['sents'].append(saved.dump())

    return json.dumps(response)
Example #3
0
def duplicate_doc(from_type='v2', to_type='v3'):
    """Copy every ``Doc`` of ``from_type``, with its sentences, as ``to_type``.

    Each copy keeps the original text and source. Its title is the original
    title with every ``'TARGET_ONLY'`` substring replaced by ``to_type``, and
    its ``seq`` is the current number of stored documents plus one.

    Args:
        from_type: ``type`` value of the documents to copy.
        to_type: ``type`` value assigned to the copies.
    """
    for original in tqdm(Doc.objects(type=from_type).all()):
        clone = Doc(
            title=original.title.replace('TARGET_ONLY', to_type),
            text=original.text,
            source=original.source,
            type=to_type,
        )
        # Recounted per document so consecutive copies get increasing values.
        clone.seq = Doc.objects.count() + 1
        clone.save()

        # Re-create each sentence of the original, attached to the copy.
        for sentence in Sent.objects(doc=original).all():
            Sent(index=sentence.index, text=sentence.text, doc=clone).save()