def get_documents():
    """Count documents lacking section-name attributes and pre-compute them in batches.

    Stores the count returned by ``query_doc_size`` for the query
    ``-section_name_attrs:*`` in the module-level ``doc_size``, then calls
    ``pre_compute`` with offsets 0, ``batch_size``, 2 * ``batch_size``, ...
    for as long as the offset is below ``doc_size + 20``.
    """
    global doc_size
    doc_size = query_doc_size(
        "-section_name_attrs:*",
        solr_url=solr_url,
        mapper_inst=util.report_mapper_inst,
        mapper_key=util.report_mapper_key,
        mapper_url=util.report_mapper_url,
    )

    # NOTE(review): the bound overshoots the counted size by 20; the reason is
    # not visible here -- presumably slack for late-arriving documents. Confirm.
    offset = 0
    while offset < (doc_size + 20):
        print("on batch %d" % offset)
        pre_compute(offset)
        offset += batch_size
Ejemplo n.º 2
0
def get_documents(q):
    """Run ``do_something`` over every batch of documents matching ``q``.

    Stores the count returned by ``query_doc_size`` for ``q`` in the
    module-level ``doc_size``, then calls ``do_something(q, offset)`` with
    offsets 0, ``batch_size``, 2 * ``batch_size``, ... while the offset is
    below ``doc_size``.

    Args:
        q: Query string, passed unchanged to ``query_doc_size`` and to
            ``do_something``.
    """
    global doc_size
    doc_size = query_doc_size(
        q,
        solr_url=solr_url,
        mapper_inst=util.report_mapper_inst,
        mapper_key=util.report_mapper_key,
        mapper_url=util.report_mapper_url,
    )

    offset = 0
    while offset < doc_size:
        print("on batch %d" % offset)
        do_something(q, offset)
        offset += batch_size
Ejemplo n.º 3
0
def get_documents():
    """Count all documents and pre-compute them in batches.

    Stores the count returned by ``query_doc_size`` for the match-all query
    ``*:*`` in the module-level ``doc_size``, then calls ``pre_compute`` with
    offsets 0, ``batch_size``, 2 * ``batch_size``, ... while the offset is
    below ``doc_size``.
    """
    global doc_size
    doc_size = query_doc_size(
        "*:*",
        solr_url=solr_url,
        mapper_inst=util.report_mapper_inst,
        mapper_key=util.report_mapper_key,
        mapper_url=util.report_mapper_url,
    )

    offset = 0
    while offset < doc_size:
        print("on batch %d" % offset)
        pre_compute(offset)
        offset += batch_size
Ejemplo n.º 4
0
def pre_compute(n):
    """Fill in missing sentence and section fields for one batch of documents.

    Fetches up to ``batch_size`` documents matching ``-sentence_attrs:*``
    starting at offset ``n`` (sorted ``source DESC``), adds ``sentences_key``
    and the section name/text keys to documents that lack them, and POSTs the
    changed documents as JSON to ``url``. A non-200 response is printed and the
    batch is handed to ``retry``.

    Args:
        n: Start offset passed to ``query``. Progress statistics are printed
            only when ``n % 10 == 0``.

    Returns:
        True if the batch ran without raising (including the case where the
        POST returned a non-200 status and ``retry`` was called); False if any
        exception was caught.
    """
    try:
        docs = query("-sentence_attrs:*", solr_url=solr_url, mapper_inst=util.report_mapper_inst,
                     mapper_key=util.report_mapper_key, sort="source DESC",
                     mapper_url=util.report_mapper_url, start=n, rows=batch_size)
        updated_docs = []
        ids = []
        for doc in docs:
            txt = document_text(doc, clean=True)
            updates = False
            if sentences_key not in doc:
                doc[sentences_key] = document_sentences(txt)
                updates = True

            if section_names_key not in doc:
                # Default to one unknown section holding the whole text, used
                # when section tagging raises.
                section_headers, section_texts = [UNKNOWN], [txt]
                try:
                    section_headers, section_texts = sec_tag_process(txt)
                except Exception as e:
                    # Deliberate best effort: report and keep the fallback.
                    print(e)
                # NOTE(review): the fallback header is UNKNOWN itself; if it has
                # no ``concept`` attribute this line raises and the whole batch
                # is abandoned by the outer handler -- confirm UNKNOWN's type.
                doc[section_names_key] = [x.concept for x in section_headers]
                doc[section_text_key] = section_texts
                updates = True

            if updates:
                ids.append(doc[util.solr_report_id_field])
                updated_docs.append(doc)

        print('updating the following docs: ', ids)
        if n % 10 == 0:
            print("******************************")
            done_doc_size = query_doc_size("sentence_attrs:*", solr_url=solr_url, mapper_inst=util.report_mapper_inst,
                                           mapper_key=util.report_mapper_key, mapper_url=util.report_mapper_url)

            # A zero total must not abort the batch: the division used to raise
            # ZeroDivisionError here, before the documents were ever posted.
            pct = (float(done_doc_size) / float(doc_size)) * 100.0 if doc_size else 0.0
            print("updated overall: %d/%d (%f pct)" % (done_doc_size, doc_size, pct))
            print("******************************")

        data = json.dumps(updated_docs)
        response2 = requests.post(url, headers=headers, data=data)

        if response2.status_code == 200:
            print('success!!!')
        else:
            print('fail: ', response2.reason)
            print(response2.content)
            retry(updated_docs)
    except Exception as ex:
        # Surface the cause; the fixed message alone made failures undiagnosable.
        print('exception updating docs: ', repr(ex))
        return False
    return True