def get_all_domain_details():
    '''
    Fetch every domain document from the admin core and map each
    domain id to its list of indexed paths.

    Returns an empty dict when the admin core cannot be created/reached.
    '''
    details = {}
    if create_core(ADMIN_CORE):
        query_url = '{0}{1}/select?q=*&wt=json'.format(SOLR_URL, ADMIN_CORE)
        raw = requests.get(query_url, headers=headers)
        # NOTE: yaml.safe_load is used here to parse the JSON body,
        # matching the rest of this module.
        docs = yaml.safe_load(raw.text)['response']['docs']
        for doc in docs:
            details[doc["id"]] = doc[ADMIN_F_IDX_LIST]
    return details
def get_idx_field_csv(domain, index_path):
    '''
    Returns field_csv from original index to be shown on popups
    '''
    if create_core(ADMIN_CORE):
        doc = _get_domain_admin(domain)['response']['docs'][0]
        indexed_paths = doc[ADMIN_F_IDX_LIST]
        if index_path in indexed_paths:
            slot = indexed_paths.index(index_path)
            return doc[ADMIN_F_IDX_FIELD_LIST][slot]
    # NOTE(review): the miss path returns a (0, 0) tuple while the hit
    # path returns a single csv value -- callers must handle both shapes;
    # preserved as-is to avoid breaking existing callers.
    return 0, 0
def get_index_core(domain, index_path): # TODO strip trailing / if create_core(ADMIN_CORE): response = _get_domain_admin(domain) num_found = response['response']['numFound'] if (num_found == 0): # # No record found for this domain. count = 1 # # Initialize a new one else: # Check if this index exist for this domain all_idx = response['response']['docs'][0][ADMIN_F_IDX_LIST] if (index_path in all_idx): index_arr = all_idx.index(index_path) core_name = response['response']['docs'][0][ADMIN_F_CORE_LIST][ index_arr] return core_name # if not create a new count for this index count = len(all_idx) + 1 # get unique core name core_name = "{0}_{1}".format(domain, count) payload = { "add": { "doc": { "id": "{0}".format(domain), ADMIN_F_IDX_LIST: { "add": "{0}".format(index_path) }, ADMIN_F_CORE_LIST: { "add": core_name }, ADMIN_F_PNT_LEN_LIST: { "add": 0 }, ADMIN_F_IDX_SIZE_LIST: { "add": 0 } } } } r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE), data=str(payload), params=params, headers=headers) print r.text if (not r.ok): raise "Can't create core with core name {0}".format(core_name) return core_name
def get_idx_details(domain, index_path):
    '''
    Return size of original index and number of points found till now
    '''
    if create_core(ADMIN_CORE):
        doc = _get_domain_admin(domain)['response']['docs'][0]
        known_paths = doc[ADMIN_F_IDX_LIST]
        if index_path in known_paths:
            slot = known_paths.index(index_path)
            idx_size = doc[ADMIN_F_IDX_SIZE_LIST][slot]
            pnt_len = doc[ADMIN_F_PNT_LEN_LIST][slot]
            return idx_size, pnt_len
    # Unknown index (or admin core unavailable): report zero size/points.
    return 0, 0
def delete_index_core(domain, index_path):
    '''
    Remove the admin-core bookkeeping for (domain, index_path).

    Drops the index path and its entry in each of the parallel per-index
    lists (core name, point count, index size, field csv, user, password)
    from the domain document, then re-posts the trimmed document to the
    admin core. Returns an error message string when the domain or index
    is unknown; otherwise returns None and reports success/failure via
    print. NOTE(review): the backing Solr core itself is not deleted here,
    only the admin record -- confirm whether that is intended.
    '''
    # TODO strip trailing /
    if create_core(ADMIN_CORE):
        response = _get_domain_admin(domain)
        num_found = response['response']['numFound']
        if (num_found == 0):
            # # No record found for this domain.
            return "No domain added with name - " + domain
        else:
            # Check if this index exist for this domain
            all_idx = response['response']['docs'][0][ADMIN_F_IDX_LIST]
            if (index_path not in all_idx):
                return "No index added with name {0} for domain {1} ".format(
                    index_path, domain)
            index_in_arr = all_idx.index(index_path)
            new_doc = response['response']['docs'][0]
            print "Data now -", new_doc
            # The per-index fields are parallel arrays keyed by position,
            # so the same slot must be removed from every list.
            del (new_doc[ADMIN_F_IDX_LIST][index_in_arr])
            del (new_doc[ADMIN_F_CORE_LIST][index_in_arr])
            del (new_doc[ADMIN_F_PNT_LEN_LIST][index_in_arr])
            del (new_doc[ADMIN_F_IDX_SIZE_LIST][index_in_arr])
            del (new_doc[ADMIN_F_IDX_FIELD_LIST][index_in_arr])
            del (new_doc[ADMIN_F_USER_LIST][index_in_arr])
            del (new_doc[ADMIN_F_PASSWD_LIST][index_in_arr])
            print "Updated data - ", new_doc
            # Solr "add" with an existing id replaces the whole document,
            # which effects the deletion of the per-index entries.
            payload = {"add": {"doc": new_doc}}
            r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE),
                              data=str(payload), params=params,
                              headers=headers)
            print r.text
            if (not r.ok):
                print "Can't delete index with name {0} for domain {1} ".format(
                    index_path, domain)
            else:
                print "Deleted index with name {0} for domain {1} ".format(
                    index_path, domain)
def update_idx_details(domain, index_path, idx_size, pnt_size): ''' Updates size of original index and number of points found till now ''' if create_core(ADMIN_CORE): response = _get_domain_admin(domain)['response']['docs'][0] all_idx = response[ADMIN_F_IDX_LIST] if (index_path in all_idx): index_arr = all_idx.index(index_path) response[ADMIN_F_PNT_LEN_LIST][index_arr] = pnt_size response[ADMIN_F_IDX_SIZE_LIST][index_arr] = idx_size payload = { "add": { "doc": { "id": "{0}".format(domain), ADMIN_F_PNT_LEN_LIST: { "set": response[ADMIN_F_PNT_LEN_LIST] }, ADMIN_F_IDX_SIZE_LIST: { "set": response[ADMIN_F_IDX_SIZE_LIST] } } } } r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE), data=str(payload), params=params, headers=headers) print r.text if (not r.ok): print payload raise "Can't update idx details with core name {0} - {1}".format( domain, index_path) return True return False
def update_idx_field_csv(domain, index_path, idx_field_csv): ''' Updates field_csv from original index to be shown on popups ''' if create_core(ADMIN_CORE): response = _get_domain_admin(domain)['response']['docs'] if len(response) == 1: response = response[0] all_idx = response[ADMIN_F_IDX_LIST] if (index_path in all_idx): index_arr = all_idx.index(index_path) response[ADMIN_F_IDX_FIELD_LIST][index_arr] = "{0}".format( idx_field_csv) payload = { "add": { "doc": { "id": "{0}".format(domain), ADMIN_F_IDX_FIELD_LIST: { "set": response[ADMIN_F_IDX_FIELD_LIST] } } } } r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE), data=str(payload), params=params, headers=headers) print r.text if (not r.ok): print payload raise "Can't update idx details with core name {0} - {1}".format( domain, index_path) return True # return False
def query_crawled_index(request, domain_name, indexed_path, username, passwd):
    '''
    To query crawled data that has been indexed into Solr or Elastichsearch
    and return location names.

    Pages through the remote Solr index QUERY_RANGE docs at a time, runs
    each doc's text through the Tika GeotopicParser, and indexes the
    extracted (name, lat, lon) points into a local per-domain Solr core,
    periodically regenerating Khooshe tiles. Returns an HttpResponse;
    NOTE(review): if create_core(core_name) fails on the solr branch the
    function falls through and returns None -- confirm intended.
    '''
    if "solr" in indexed_path.lower():
        '''
        Query admin core to get core information for domain_name,
        indexed_path combination
        '''
        core_name = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2 Run GeotopicParser on each doc one at a time
            # 3 keep appending results
            # 4 Save it in local solr instance
            rows_processed = 0
            try:
                # Resume from however many rows were already geotagged.
                rows_processed = GetIndexSize(core_name)
            except:
                # NOTE(review): bare except silently falls back to 0
                # (restart from scratch) on any failure.
                pass
            try:
                # Probe query: fetch one row just to read numFound.
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url, headers=headers,
                                 auth=HTTPBasicAuth(username, passwd))
                if r.status_code != 200:
                    return HttpResponse(status=r.status_code,
                                        content=r.reason)
                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(
                    numFound)
                #gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                # Next row count at which Khooshe tiles get regenerated.
                khooshe_gen_freq_l = rows_processed
                for row in range(rows_processed, int(numFound), QUERY_RANGE):
                    # loop solr query
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name,
                                                 indexed_path, numFound)
                        if (khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ):
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1} ".format(
                            row, khooshe_gen_freq_l)
                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(
                        indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url, headers=headers,
                                     auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:
                        # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                # Flatten every field value of the doc into
                                # one ascii string for the geotopic parser.
                                for v in t.values():
                                    if (hasattr(v, '__iter__')):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif (isinstance(v, unicode)):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())
                            # simplify text
                            text_content = ' '.join(text_content.split())
                            parsed = callServer(
                                'put', TIKA_SERVER, '/rmeta', text_content, {
                                    'Accept': 'application/json',
                                    'Content-Type': 'application/geotopic'
                                }, False)
                            # NOTE(review): eval() on the Tika server
                            # response executes arbitrary code if the
                            # server is compromised -- should be json.loads.
                            location_names = parse_lat_lon(eval(parsed[1])[0])
                            for key, values in location_names.iteritems():
                                try:
                                    # # TODO - ADD META DATA
                                    points.append({
                                        'loc_name': smart_str(key),
                                        'position': {
                                            'x': smart_str(values[0]),
                                            'y': smart_str(values[1])
                                        }
                                    })
                                except Exception as e:
                                    print "Error while transforming points "
                                    print e
                                    pass
                            print "Found {0} coordinates..".format(len(points))
                            # print docs
                        except Exception as e:
                            print traceback.format_exc()
                            pass
                        # assumes each doc has a 'doi' field usable as the
                        # local index key -- TODO confirm schema
                        docs[str(t['doi'])] = points
                        # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                    # loop solr query ends
                # Final tile regeneration after the last page.
                gen_khooshe_update_admin(core_name, domain_name, indexed_path,
                                         numFound)
                return HttpResponse(
                    status=200,
                    content=("Crawled data geo parsed successfully."))
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(
                    status=500,
                    content=("Error while geo parsing crawled data."))
    else:
        return HttpResponse(status=500,
                            content=("Only solr indexes supported for now"))
def get_index_core(domain, index_path, user="******", passwd="pass"): # TODO strip trailing / if create_core(ADMIN_CORE): response = _get_domain_admin(domain) num_found = response['response']['numFound'] if (num_found == 0): # # No record found for this domain. count = 1 # # Initialize a new one else: # Check if this index exist for this domain all_idx = response['response']['docs'][0][ADMIN_F_IDX_LIST] count = response['response']['docs'][0][ADMIN_F_COUNT][0] if (index_path in all_idx): index_arr = all_idx.index(index_path) core_name = response['response']['docs'][0][ADMIN_F_CORE_LIST][ index_arr] # todo encrypt it stored_user = response['response']['docs'][0][ ADMIN_F_USER_LIST][index_arr] stored_passwd = response['response']['docs'][0][ ADMIN_F_PASSWD_LIST][index_arr] # return existing core name with user name and passwprd return core_name, stored_user, crypter.decrypt(stored_passwd) # if not create a new count for this index print "No existing core found for ", domain, index_path count = count + 1 # get unique core name core_name = "{0}_{1}".format(domain, count) payload = { "add": { "doc": { "id": "{0}".format(domain), ADMIN_F_IDX_LIST: { "add": "{0}".format(index_path) }, ADMIN_F_CORE_LIST: { "add": core_name }, ADMIN_F_PNT_LEN_LIST: { "add": 0 }, ADMIN_F_IDX_SIZE_LIST: { "add": 0 }, ADMIN_F_IDX_FIELD_LIST: { "add": DEFAULT_IDX_FIELD }, ADMIN_F_USER_LIST: { "add": "{0}".format(user) }, ADMIN_F_PASSWD_LIST: { "add": "{0}".format(crypter.encrypt(passwd)) }, ADMIN_F_COUNT: { "set": count } } } } r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE), data=str(payload), params=params, headers=headers) print r.text if (not r.ok): raise "Can't create core with core name {0}".format(core_name) return # return newly created core return core_name, user, passwd