Example #1
def refresh_khooshe_tiles(request, domain_name, indexed_path):
    '''
        Queues khooshe tile regeneration for the given domain and index
    '''
    core_name, _, _ = get_index_core(domain_name, indexed_path)
    numFound = GetIndexSize(core_name)
    is_in_queue = gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
    if is_in_queue:
        return HttpResponse(status=200, content="[{'msg':'Queued Khooshe generation'}]")
    else:
        return HttpResponse(status=200, content="[{'msg':'Can't queue another Khooshe generation'}]")
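Note: response bodies like "[{'msg':'Queued Khooshe generation'}]" use Python-style single quotes, which most JSON parsers reject. A minimal sketch of a helper that emits the same payload as valid JSON (the helper name is hypothetical, not from the original project):

import json

from django.http import HttpResponse

def json_msg(msg, status=200):
    # hypothetical helper: serialize with json.dumps so clients
    # receive valid JSON instead of a single-quoted Python repr
    return HttpResponse(status=status,
                        content=json.dumps([{'msg': msg}]),
                        content_type='application/json')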
Example #2
def add_crawled_index(request, domain_name, indexed_path, username, passwd):
    '''
        Adds a new index to the admin core, storing the username and password for future use
    '''
    core_name,_,_ = get_index_core(domain_name, indexed_path, username, passwd)
    print "Created core ", core_name
    
    if core_name:
        return HttpResponse(status=200, content="[{'msg':'success'}]")
    else:
        return HttpResponse(status=200, content="[{'msg':'failed'}]")
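Neither view is shown with its routing. A minimal sketch of how they might be wired into a Django 1.x URLconf, matching the Python 2 era of these snippets (the URL patterns and the views module path are assumptions, not taken from the original project):

# urls.py -- hypothetical wiring for the views in these examples
from django.conf.urls import url

from views import refresh_khooshe_tiles, add_crawled_index

urlpatterns = [
    url(r'^refresh_khooshe_tiles/(?P<domain_name>[^/]+)/(?P<indexed_path>.+)$',
        refresh_khooshe_tiles),
    url(r'^add_crawled_index/(?P<domain_name>[^/]+)/(?P<indexed_path>[^/]+)/'
        r'(?P<username>[^/]+)/(?P<passwd>[^/]+)$',
        add_crawled_index),
]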
Example #3
def refresh_khooshe_tiles(request, domain_name, indexed_path):
    core_name = get_index_core(domain_name, indexed_path)
    numFound = GetIndexSize(core_name)
    is_in_queue = gen_khooshe_update_admin(core_name, domain_name,
                                           indexed_path, numFound)
    if is_in_queue:
        return HttpResponse(status=200,
                            content="[{'msg':'Queued Khooshe generation'}]")
    else:
        return HttpResponse(
            status=200,
            content="[{'msg':'Can't queue another Khooshe generation'}]")
Example #4
def return_points_khooshe(request, indexed_path, domain_name):
    '''
        Returns geo points for the given file using khooshe
    '''
    core_name,_,_ = get_index_core(domain_name, indexed_path)

    total_docs, points_count = get_idx_details(domain_name, indexed_path)

    # strip punctuation so the core name is a safe folder name
    exclude = set(string.punctuation)
    file_name = ''.join(ch for ch in core_name if ch not in exclude)

    results = create_khooshe_result(GetIndexSize(core_name), total_docs, points_count,
                                    get_idx_field_csv(domain_name, indexed_path),
                                    "static/tiles/{0}".format(file_name))
    
    if results["rows_processed"]:
        return HttpResponse(status=200, content="[{0}]".format(results))
    else:
        return HttpResponse(status=400, content="Cannot find latitude and longitude (return_points_khooshe).")
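This version delegates to create_khooshe_result, which is not shown; Examples #6 and #7 below build the equivalent dict inline. A sketch of what the helper plausibly looks like (the 'fields_csv' key name is an assumption, since the real helper is not in these examples):

def create_khooshe_result(rows_processed, total_docs, points_count,
                          fields_csv, khooshe_tile):
    # mirrors the dict that Examples #6 and #7 build inline
    return {
        'rows_processed': rows_processed,
        'total_docs': total_docs,
        'points_count': points_count,
        'fields_csv': fields_csv,   # assumed key name
        'khooshe_tile': khooshe_tile,
    }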
Example #5
def search_crawled_index(request, indexed_path, domain_name, keyword):
    '''
    Searches for 'keyword' in 'indexed_path', using the stored 'username' and 'passwd'
    '''
    print "Searching for {0} in {1}".format(keyword, indexed_path)

    # Fetch the stored core name and credentials for this domain name and index path from the admin core
    core_name, username, passwd = get_index_core(domain_name, indexed_path)
    
    keyword = urllib.quote_plus(keyword)

    url = "{0}/select?q=*{1}*&wt=json&rows=1".format(indexed_path, keyword)
    r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))

    if r.status_code != 200:
        return HttpResponse(status=r.status_code, content=r.reason)

    response = r.json()
    numFound = response['response']['numFound']
    list_id = []
    print "Total number of records found {0}".format(numFound)

    # limiting search count to MAX_SEARCH_RESULT 
    if numFound > MAX_SEARCH_RESULT:
        numFound = MAX_SEARCH_RESULT
        print "Processing only {0} records".format(numFound)

    for row in range(0, int(numFound), QUERY_RANGE):  # loop solr query
        url = "{0}/select?q=*{1}*&start={2}&rows={3}&wt=json&fl=id".format(indexed_path, keyword, row, QUERY_RANGE)
        print "solr query - {0}".format(url)
        r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
        response = r.json()
        docs = response['response']['docs']
        list_id += [doc["id"] for doc in docs]
    
    khooshe_tile_folder_name, points_count = SearchLocalSolrIndex(core_name, list_id, keyword)

    result = create_khooshe_result(len(list_id), GetIndexSize(core_name), points_count,
                                   get_idx_field_csv(domain_name, indexed_path), khooshe_tile_folder_name)

    if khooshe_tile_folder_name:
        return HttpResponse(status=200, content="[{0}]".format(str(result)))
    else:
        return HttpResponse(status=404, content="No points found for given search")
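The id-collection loop above pages through the remote Solr core QUERY_RANGE rows at a time. The same pattern pulled out as a standalone generator, a sketch only (the QUERY_RANGE value and the function name are assumptions; the real constant is defined elsewhere in the project):

import urllib

import requests
from requests.auth import HTTPBasicAuth

QUERY_RANGE = 100  # assumed page size

def iter_solr_ids(indexed_path, keyword, num_found, username, passwd, headers=None):
    # page through the remote core QUERY_RANGE rows at a time and
    # yield document ids -- the loop Example #5 runs inline
    keyword = urllib.quote_plus(keyword)
    for start in range(0, int(num_found), QUERY_RANGE):
        url = "{0}/select?q=*{1}*&start={2}&rows={3}&wt=json&fl=id".format(
            indexed_path, keyword, start, QUERY_RANGE)
        r = requests.get(url, headers=headers or {},
                         auth=HTTPBasicAuth(username, passwd))
        for doc in r.json()['response']['docs']:
            yield doc['id']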
Example #6
def return_points_khooshe(request, indexed_path, domain_name):
    '''
        Returns geo points for the given file using khooshe
    '''
    
    core_name = get_index_core(domain_name, indexed_path)
    results = {}
    
    results["rows_processed"] = GetIndexSize(core_name)
    results["total_docs"], results["points_count"] = get_idx_details(domain_name, indexed_path)
    
    exclude = set(string.punctuation)
    file_name = ''.join(ch for ch in core_name if ch not in exclude)
    results["khooshe_tile"] = "static/tiles/{0}".format(file_name)
    if results["rows_processed"]:
        return HttpResponse(status=200, content="[{0}]".format(results))
    else:
        return HttpResponse(status=400, content="Cannot find latitude and longitude (return_points_khooshe).")
Example #7
def return_points_khooshe(request, indexed_path, domain_name):
    '''
        Returns geo points for the given file using khooshe
    '''

    core_name = get_index_core(domain_name, indexed_path)
    results = {}

    results["rows_processed"] = GetIndexSize(core_name)
    results["total_docs"], results["points_count"] = get_idx_details(
        domain_name, indexed_path)

    exclude = set(string.punctuation)
    file_name = ''.join(ch for ch in core_name if ch not in exclude)
    results["khooshe_tile"] = "static/tiles/{0}".format(file_name)
    if results["rows_processed"]:
        return HttpResponse(status=200, content="[{0}]".format(results))
    else:
        return HttpResponse(
            status=400,
            content="Cannot find latitude and longitude (return_points_khooshe)."
        )
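The punctuation stripping in Examples #6 and #7 turns a core name into a folder name that is safe to embed in a static path. For instance (the core name here is hypothetical):

import string

exclude = set(string.punctuation)
core_name = 'weather.usc.edu:8983/solr'  # hypothetical core name
file_name = ''.join(ch for ch in core_name if ch not in exclude)
print file_name  # weatheruscedu8983solr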
Example #8
def query_crawled_index(request, domain_name, indexed_path):
    '''
        To query crawled data that has been indexed into
        Solr or Elasticsearch and return location names
    '''
    if "solr" in indexed_path.lower():
        # Query the admin core to get core information for the
        # domain_name, indexed_path combination
        core_name, username, passwd = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2     Run GeotopicParser on each doc one at a time
            # 3     keep appending results 
            # 4 Save it in local solr instance
            rows_processed = 0
            try:
                rows_processed = GetIndexSize(core_name)
            except Exception:
                # default to 0 if the index size cannot be read
                pass
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
                
                if r.status_code != 200:
                    return HttpResponse(status=r.status_code, content=r.reason)

                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(numFound)
                #gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                khooshe_gen_freq_l = rows_processed 
                for row in range(rows_processed, int(numFound), QUERY_RANGE):  # loop solr query
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                        if khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ:
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1}".format(row, khooshe_gen_freq_l)

                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:  # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                for v in t.values():
                                    if hasattr(v, '__iter__'):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif isinstance(v, unicode):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += ' ' + a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())

                            # simplify text
                            text_content = ' '.join(text_content.split())
                            parsed = callServer('put', TIKA_SERVER, '/rmeta', text_content, {'Accept': 'application/json', 'Content-Type' : 'application/geotopic'}, False)
                            location_names = parse_lat_lon(eval(parsed[1])[0])

                            for key, values in location_names.iteritems():
                                try:
                                    # # TODO - ADD META DATA
                                    points.append({
                                        'loc_name': smart_str(key),
                                        'position': {
                                            'x': smart_str(values[0]),
                                            'y': smart_str(values[1])
                                        }
                                    })
                                except Exception as e:
                                    print "Error while transforming points"
                                    print e
                            print "Found {0} coordinates..".format(len(points))  
                            # print docs
                        except Exception as e:
                            print traceback.format_exc()

                        docs[str(t['id'])] = points
                        # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                    # loop solr query ends
                gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                return HttpResponse(status=200, content="Crawled data geo parsed successfully.")
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(status=500, content="Error while geo parsing crawled data.")
        # create_core() failed: return an explicit error instead of None
        return HttpResponse(status=500, content="Failed to create local solr core.")

    else:
        return HttpResponse(status=500, content="Only solr indexes supported for now")
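The khooshe_gen_freq_l bookkeeping decides how often tiles are regenerated while geotagging runs: tiles are rebuilt whenever the current page window contains the next checkpoint; early on the checkpoint doubles past the window, later it advances by a fixed KHOOSHE_GEN_FREQ stride. The schedule in isolation, as a sketch (both constant values are assumptions; the real ones are defined elsewhere in the project):

KHOOSHE_GEN_FREQ = 10000  # assumed value
QUERY_RANGE = 100         # assumed value

def khooshe_checkpoints(rows_processed, num_found):
    # replay the tile-regeneration schedule from Examples #8 and #9,
    # returning the rows at which tiles would be rebuilt
    checkpoint = rows_processed
    rebuild_rows = []
    for row in range(rows_processed, int(num_found), QUERY_RANGE):
        if row <= checkpoint <= row + QUERY_RANGE:
            rebuild_rows.append(row)
            if checkpoint >= KHOOSHE_GEN_FREQ:
                checkpoint += KHOOSHE_GEN_FREQ
            else:
                checkpoint = (row + QUERY_RANGE) * 2
    return rebuild_rows

With these constants and a fresh core, the rebuild rows come out as 0, 100, 300, 700, 1500, ... roughly doubling until the fixed stride takes over.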
Example #9
def query_crawled_index(request, domain_name, indexed_path, username, passwd):
    '''
        To query crawled data that has been indexed into
        Solr or Elasticsearch and return location names
    '''
    if "solr" in indexed_path.lower():
        # Query the admin core to get core information for the
        # domain_name, indexed_path combination
        core_name = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2     Run GeotopicParser on each doc one at a time
            # 3     keep appending results
            # 4 Save it in local solr instance
            rows_processed = 0
            try:
                rows_processed = GetIndexSize(core_name)
            except Exception:
                # default to 0 if the index size cannot be read
                pass
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url,
                                 headers=headers,
                                 auth=HTTPBasicAuth(username, passwd))

                if r.status_code != 200:
                    return HttpResponse(status=r.status_code, content=r.reason)

                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(
                    numFound)
                #gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                khooshe_gen_freq_l = rows_processed
                for row in range(rows_processed, int(numFound),
                                 QUERY_RANGE):  # loop solr query
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name,
                                                 indexed_path, numFound)
                        if khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ:
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1} ".format(
                            row, khooshe_gen_freq_l)

                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(
                        indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url,
                                     headers=headers,
                                     auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:  # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                for v in t.values():
                                    if hasattr(v, '__iter__'):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif isinstance(v, unicode):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += ' ' + a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())

                            # simplify text
                            text_content = ' '.join(text_content.split())

                            parsed = callServer(
                                'put', TIKA_SERVER, '/rmeta', text_content, {
                                    'Accept': 'application/json',
                                    'Content-Type': 'application/geotopic'
                                }, False)
                            location_names = parse_lat_lon(eval(parsed[1])[0])

                            for key, values in location_names.iteritems():
                                try:
                                    # # TODO - ADD META DATA
                                    points.append({
                                        'loc_name': smart_str(key),
                                        'position': {
                                            'x': smart_str(values[0]),
                                            'y': smart_str(values[1])
                                        }
                                    })
                                except Exception as e:
                                    print "Error while transforming points"
                                    print e
                            print "Found {0} coordinates..".format(len(points))
                            # print docs
                        except Exception as e:
                            print traceback.format_exc()

                        docs[str(t['doi'])] = points
                        # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                    # loop solr query ends
                gen_khooshe_update_admin(core_name, domain_name, indexed_path,
                                         numFound)
                return HttpResponse(
                    status=200,
                    content="Crawled data geo parsed successfully.")
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(
                    status=500,
                    content="Error while geo parsing crawled data.")
        # create_core() failed: return an explicit error instead of None
        return HttpResponse(status=500,
                            content="Failed to create local solr core.")

    else:
        return HttpResponse(status=500,
                            content="Only solr indexes supported for now")
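Both versions of query_crawled_index flatten every field of a Solr document into one string before sending it to the Tika geotopic endpoint. The same logic as a standalone helper, a sketch in the Python 2 style of the originals (the function name is an assumption):

def flatten_doc_text(doc):
    # collapse every field value of a Solr doc into a single
    # whitespace-normalized ASCII string, mirroring the inline loop
    # in Examples #8 and #9
    parts = []
    for v in doc.values():
        if hasattr(v, '__iter__'):
            a = u' '.join(unicode(e) for e in v)
        elif isinstance(v, unicode):
            a = v
        else:
            a = str(v)
        if isinstance(a, unicode):
            a = a.encode('ascii', 'ignore')
        parts.append(a)
    return ' '.join(' '.join(parts).split())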