def refresh_khooshe_tiles(request, domain_name, indexed_path):
    '''
    Re-queues khooshe tile generation for an already indexed path
    '''
    core_name, _, _ = get_index_core(domain_name, indexed_path)
    numFound = GetIndexSize(core_name)
    is_in_queue = gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
    if is_in_queue:
        return HttpResponse(status=200, content="[{'msg':'Queued Khooshe generation'}]")
    else:
        return HttpResponse(status=200, content="[{'msg':'Can't queue another Khooshe generation'}]")
def add_crawled_index(request, domain_name, indexed_path, username, passwd):
    '''
    Adds a new index to the admin core, storing the username and password for future use
    '''
    core_name, _, _ = get_index_core(domain_name, indexed_path, username, passwd)
    print "Created core ", core_name
    if core_name:
        return HttpResponse(status=200, content="[{'msg':'success'}]")
    else:
        return HttpResponse(status=200, content="[{'msg':'failed'}]")
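# How these views are routed is project configuration and is not shown in this
# file. For orientation only, a hypothetical Django 1.x urls.py sketch; the
# pattern regexes and route names below are assumptions, not this project's
# actual routing:
#
#   from django.conf.urls import url
#   import views
#
#   urlpatterns = [
#       url(r'^add_crawled_index/(?P<domain_name>[^/]+)/(?P<indexed_path>.+)/(?P<username>[^/]+)/(?P<passwd>[^/]+)/$',
#           views.add_crawled_index),
#       url(r'^refresh_khooshe_tiles/(?P<domain_name>[^/]+)/(?P<indexed_path>.+)/$',
#           views.refresh_khooshe_tiles),
#   ]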
def return_points_khooshe(request, indexed_path, domain_name):
    '''
    Returns geo points for the given indexed path using khooshe
    '''
    core_name, _, _ = get_index_core(domain_name, indexed_path)
    total_docs, points_count = get_idx_details(domain_name, indexed_path)
    # Khooshe tile folders are named after the core with punctuation stripped
    exclude = set(string.punctuation)
    file_name = ''.join(ch for ch in core_name if ch not in exclude)
    results = create_khooshe_result(GetIndexSize(core_name), total_docs, points_count,
                                    get_idx_field_csv(domain_name, indexed_path),
                                    "static/tiles/{0}".format(file_name))
    if results["rows_processed"]:
        return HttpResponse(status=200, content="[{0}]".format(results))
    else:
        return HttpResponse(status=400, content="Cannot find latitude and longitude (return_points_khooshe).")
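# create_khooshe_result is defined elsewhere in the project. Judging only by its
# arguments and by the 'rows_processed' key checked above, the payload serialized
# into the response presumably looks something like this (shape assumed, not
# verified against the real helper):
#
#   [{'rows_processed': 1200, 'total_docs': 5000, 'points_count': 873,
#     'fields': 'id,lat,lon', 'khooshe_tile': 'static/tiles/corename'}]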
def search_crawled_index(request, indexed_path, domain_name, keyword):
    '''
    Searches for 'keyword' in 'indexed_path', using the stored credentials
    '''
    print "Searching for {0} in {1}".format(keyword, indexed_path)
    # Fetch the stored core name and credentials for this domain name and
    # index path from the admin core
    core_name, username, passwd = get_index_core(domain_name, indexed_path)
    keyword = urllib.quote_plus(keyword)
    url = "{0}/select?q=*{1}*&wt=json&rows=1".format(indexed_path, keyword)
    r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
    if r.status_code != 200:
        return HttpResponse(status=r.status_code, content=r.reason)
    response = r.json()
    numFound = response['response']['numFound']
    list_id = []
    print "Total number of records found {0}".format(numFound)
    # Limit the number of records searched to MAX_SEARCH_RESULT
    if numFound > MAX_SEARCH_RESULT:
        numFound = MAX_SEARCH_RESULT
        print "Processing only {0} records".format(numFound)
    for row in range(0, int(numFound), QUERY_RANGE):  # loop solr query
        url = "{0}/select?q=*{1}*&start={2}&rows={3}&wt=json&fl=id".format(
            indexed_path, keyword, row, QUERY_RANGE)
        print "solr query - {0}".format(url)
        r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
        response = r.json()
        docs = response['response']['docs']
        list_id += [doc["id"] for doc in docs]
    khooshe_tile_folder_name, points_count = SearchLocalSolrIndex(core_name, list_id, keyword)
    result = create_khooshe_result(len(list_id), GetIndexSize(core_name), points_count,
                                   get_idx_field_csv(domain_name, indexed_path),
                                   khooshe_tile_folder_name)
    if khooshe_tile_folder_name:
        return HttpResponse(status=200, content="[{0}]".format(str(result)))
    else:
        return HttpResponse(status=404, content="No points found for given search")
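# The id collection loop above is the usual offset-based Solr pagination pattern.
# Factored out, it would look roughly like the helper below; the function name and
# parameters are illustrative, not part of the project.
def _fetch_matching_ids(base_url, keyword, num_found, page_size, auth):
    '''Page through a remote Solr core, collecting doc ids that match a keyword.'''
    ids = []
    for start in range(0, int(num_found), page_size):
        url = "{0}/select?q=*{1}*&start={2}&rows={3}&wt=json&fl=id".format(
            base_url, keyword, start, page_size)
        r = requests.get(url, headers=headers, auth=auth)
        ids += [doc["id"] for doc in r.json()['response']['docs']]
    return ids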
def query_crawled_index(request, domain_name, indexed_path):
    '''
    Queries crawled data that has been indexed into Solr or Elasticsearch
    and returns location names
    '''
    if "solr" in indexed_path.lower():
        # Query the admin core to get core information for the
        # (domain_name, indexed_path) combination
        core_name, username, passwd = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2 run GeotopicParser on each doc one at a time
            # 3 keep appending results
            # 4 save them in the local solr instance
            rows_processed = 0
            try:
                rows_processed = GetIndexSize(core_name)
            except:
                pass
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
                if r.status_code != 200:
                    return HttpResponse(status=r.status_code, content=r.reason)
                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(numFound)
                khooshe_gen_freq_l = rows_processed
                for row in range(rows_processed, int(numFound), QUERY_RANGE):  # loop solr query
                    # Regenerate khooshe tiles on a back-off schedule so partial
                    # results become visible while geotagging is still running
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                        if khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ:
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1}".format(row, khooshe_gen_freq_l)
                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:  # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                # Flatten every field value of the doc into one string
                                for v in t.values():
                                    if hasattr(v, '__iter__'):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif isinstance(v, unicode):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += ' ' + a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())
                            # simplify text
                            text_content = ' '.join(text_content.split())
                            parsed = callServer('put', TIKA_SERVER, '/rmeta', text_content,
                                                {'Accept': 'application/json',
                                                 'Content-Type': 'application/geotopic'}, False)
                            # json.loads instead of eval: /rmeta returns a JSON array
                            location_names = parse_lat_lon(json.loads(parsed[1])[0])
                            for key, values in location_names.iteritems():
                                try:
                                    # TODO - ADD META DATA
                                    points.append({
                                        'loc_name': smart_str(key),
                                        'position': {
                                            'x': smart_str(values[0]),
                                            'y': smart_str(values[1])
                                        }
                                    })
                                except Exception as e:
                                    print "Error while transforming points"
                                    print e
                            print "Found {0} coordinates..".format(len(points))
                            # print docs
                        except Exception as e:
                            print traceback.format_exc()
                        docs[str(t['id'])] = points
                    # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                # loop solr query ends
                gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                return HttpResponse(status=200, content="Crawled data geo parsed successfully.")
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(status=500, content="Error while geo parsing crawled data.")
    else:
        return HttpResponse(status=500, content="Only solr indexes supported for now")
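# The khooshe_gen_freq_l bookkeeping in query_crawled_index throttles tile
# regeneration. With illustrative values QUERY_RANGE = 100 and
# KHOOSHE_GEN_FREQ = 1000 (the real constants are defined elsewhere), tiles get
# rebuilt at rows 0, 100, 300, 700, 1500, and then every 1000 rows: an
# exponential back-off that settles into a fixed interval once
# khooshe_gen_freq_l reaches KHOOSHE_GEN_FREQ.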
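# parse_lat_lon is defined elsewhere in the project. For orientation, a minimal
# sketch of the kind of extraction it presumably performs, assuming Tika
# GeoTopicParser's documented /rmeta metadata keys ('Geographic_NAME',
# 'Geographic_LATITUDE', 'Geographic_LONGITUDE', plus 'Optional_NAME1' etc. for
# secondary matches). This is an illustration, not the project's implementation.
def _parse_lat_lon_sketch(metadata):
    '''Illustrative only: map GeoTopicParser metadata to {name: (lat, lon)}.'''
    locations = {}
    if 'Geographic_NAME' in metadata:
        locations[metadata['Geographic_NAME']] = (
            metadata['Geographic_LATITUDE'], metadata['Geographic_LONGITUDE'])
    i = 1
    while 'Optional_NAME{0}'.format(i) in metadata:
        locations[metadata['Optional_NAME{0}'.format(i)]] = (
            metadata['Optional_LATITUDE{0}'.format(i)],
            metadata['Optional_LONGITUDE{0}'.format(i)])
        i += 1
    return locations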