def get_documents():
    """Run ``pre_compute`` over batch offsets for ``-section_name_attrs:*``.

    The value returned by ``query_doc_size`` for that query is stored in the
    module-level ``doc_size``. ``pre_compute`` is then called with offsets
    0, batch_size, 2 * batch_size, ... while the offset is below
    ``doc_size + 20``.
    """
    global doc_size

    # Connection / mapper settings forwarded unchanged to query_doc_size.
    mapper_settings = {
        "solr_url": solr_url,
        "mapper_inst": util.report_mapper_inst,
        "mapper_key": util.report_mapper_key,
        "mapper_url": util.report_mapper_url,
    }
    doc_size = query_doc_size("-section_name_attrs:*", **mapper_settings)

    offset = 0
    # NOTE(review): the extra 20 past doc_size is presumably deliberate
    # padding; the reason is not visible in this block -- confirm.
    while offset < (doc_size + 20):
        print("on batch %d" % offset)
        pre_compute(offset)
        offset += batch_size
def get_documents(q):
    """Run ``do_something`` over batch offsets for the query ``q``.

    The value returned by ``query_doc_size`` for ``q`` is stored in the
    module-level ``doc_size``. ``do_something(q, offset)`` is then called
    with offsets 0, batch_size, 2 * batch_size, ... while the offset is
    below ``doc_size``.

    Args:
        q: Query passed as-is to ``query_doc_size`` and ``do_something``.
    """
    global doc_size

    # Connection / mapper settings forwarded unchanged to query_doc_size.
    mapper_settings = {
        "solr_url": solr_url,
        "mapper_inst": util.report_mapper_inst,
        "mapper_key": util.report_mapper_key,
        "mapper_url": util.report_mapper_url,
    }
    doc_size = query_doc_size(q, **mapper_settings)

    offset = 0
    while offset < doc_size:
        print("on batch %d" % offset)
        do_something(q, offset)
        offset += batch_size
def get_documents():
    """Run ``pre_compute`` over batch offsets for the query ``*:*``.

    The value returned by ``query_doc_size`` for that query is stored in the
    module-level ``doc_size``. ``pre_compute`` is then called with offsets
    0, batch_size, 2 * batch_size, ... while the offset is below ``doc_size``.
    """
    global doc_size

    # Connection / mapper settings forwarded unchanged to query_doc_size.
    mapper_settings = {
        "solr_url": solr_url,
        "mapper_inst": util.report_mapper_inst,
        "mapper_key": util.report_mapper_key,
        "mapper_url": util.report_mapper_url,
    }
    doc_size = query_doc_size("*:*", **mapper_settings)

    offset = 0
    while offset < doc_size:
        print("on batch %d" % offset)
        pre_compute(offset)
        offset += batch_size
def _annotate_doc(doc):
    """Add any missing sentence / section fields to ``doc`` in place.

    Returns:
        True when at least one field was added to ``doc``, False otherwise.
    """
    txt = document_text(doc, clean=True)
    changed = False

    if sentences_key not in doc:
        doc[sentences_key] = document_sentences(txt)
        changed = True

    if section_names_key not in doc:
        # Fallback used when section tagging raises: a single section that
        # holds the whole text under the UNKNOWN header.
        section_headers, section_texts = [UNKNOWN], [txt]
        try:
            section_headers, section_texts = sec_tag_process(txt)
        except Exception as e:
            # Best effort: report the tagging failure and keep the fallback.
            print(e)
        # UNKNOWN's type is not visible here; if a header has no ``concept``
        # attribute, use the header itself so that one tagging failure does
        # not raise AttributeError and abort the whole batch.
        doc[section_names_key] = [getattr(x, "concept", x) for x in section_headers]
        doc[section_text_key] = section_texts
        changed = True

    return changed


def _print_progress():
    """Print how many documents match ``sentence_attrs:*`` versus ``doc_size``."""
    print("******************************")
    done_doc_size = query_doc_size("sentence_attrs:*", solr_url=solr_url,
                                   mapper_inst=util.report_mapper_inst,
                                   mapper_key=util.report_mapper_key,
                                   mapper_url=util.report_mapper_url)
    total = float(doc_size)
    # Guard against doc_size == 0: the division used to raise, which aborted
    # the batch before anything was posted.
    pct = (float(done_doc_size) / total) * 100.0 if total else 0.0
    print("updated overall: %d/%d (%f pct)" % (done_doc_size, doc_size, pct))
    print("******************************")


def pre_compute(n):
    """Fill in missing sentence / section fields for one batch and post it.

    Queries ``-sentence_attrs:*`` with ``start=n`` and ``rows=batch_size``,
    adds the missing fields to each returned document, and POSTs the documents
    that changed to ``url`` as JSON. On a non-200 response the response is
    printed and ``retry`` is called with the same documents.

    Args:
        n: Start offset passed to ``query``; a progress summary is printed
            when ``n % 10 == 0``.

    Returns:
        False if any exception was raised while processing the batch,
        True otherwise (including when the POST returned a non-200 status
        and ``retry`` was invoked).
    """
    try:
        # NOTE(review): this pages by offset over documents that lack
        # sentence_attrs. If a successful update removes documents from that
        # result set, advancing ``start`` would skip documents -- confirm
        # against how sentences_key is indexed.
        docs = query("-sentence_attrs:*", solr_url=solr_url,
                     mapper_inst=util.report_mapper_inst,
                     mapper_key=util.report_mapper_key, sort="source DESC",
                     mapper_url=util.report_mapper_url, start=n,
                     rows=batch_size)

        updated_docs = [doc for doc in docs if _annotate_doc(doc)]
        ids = [doc[util.solr_report_id_field] for doc in updated_docs]
        print('updating the following docs: ', ids)

        if n % 10 == 0:
            _print_progress()

        response2 = requests.post(url, headers=headers,
                                  data=json.dumps(updated_docs))
        if response2.status_code == 200:
            print('success!!!')
        else:
            print('fail: ', response2.reason)
            print(response2.content)
            retry(updated_docs)
    except Exception as ex:
        print('exception updating docs')
        # Report what actually failed instead of discarding the exception.
        print(repr(ex))
        return False
    return True