Beispiel #1
0
def get_all_domain_details():
    """Return the index list of every domain document in the admin core.

    Runs a match-all select against ``ADMIN_CORE`` and maps each returned
    document's ``id`` to its ``ADMIN_F_IDX_LIST`` value.

    Returns:
        dict: ``{document id: index list}``. An empty dict is returned when
        ``create_core(ADMIN_CORE)`` is falsy, so callers can iterate the
        result unconditionally (this path used to return ``None``).
    """
    resp = {}
    if not create_core(ADMIN_CORE):
        return resp

    # NOTE(review): no ``rows`` parameter is sent, so the number of documents
    # returned is whatever the select handler defaults to -- confirm that
    # this really covers all domains.
    url = '{0}{1}/select?q=*&wt=json'.format(SOLR_URL, ADMIN_CORE)
    response = requests.get(url, headers=headers)
    # The JSON body is parsed with yaml.safe_load, as in the original code.
    docs = yaml.safe_load(response.text)['response']['docs']

    for doc in docs:
        resp[doc["id"]] = doc[ADMIN_F_IDX_LIST]

    return resp
def get_idx_field_csv(domain, index_path):
    """Return the field CSV stored for ``index_path`` of ``domain``.

    The value is used for the fields shown on popups. It is read from the
    domain's admin record: the entry of ``ADMIN_F_IDX_FIELD_LIST`` at the
    position ``index_path`` has in ``ADMIN_F_IDX_LIST``.

    Returns:
        The stored field CSV entry when the index is registered. Otherwise
        (admin core unavailable, no admin record for the domain, or index
        not registered) the tuple ``(0, 0)``.

        NOTE(review): the ``(0, 0)`` fallback looks copied from
        ``get_idx_details``; it is kept unchanged so existing callers are
        not affected.
    """
    if create_core(ADMIN_CORE):
        docs = _get_domain_admin(domain)['response']['docs']
        # A domain without an admin record yields no docs; fall through to
        # the fallback instead of raising IndexError on docs[0].
        if docs:
            record = docs[0]
            all_idx = record[ADMIN_F_IDX_LIST]
            if index_path in all_idx:
                index_arr = all_idx.index(index_path)
                return record[ADMIN_F_IDX_FIELD_LIST][index_arr]

    return 0, 0
Beispiel #3
0
def get_index_core(domain, index_path):
    """Return the core name registered for ``index_path`` of ``domain``.

    Looks up the domain's record in the admin core. If ``index_path`` is
    already listed in ``ADMIN_F_IDX_LIST`` the matching entry of
    ``ADMIN_F_CORE_LIST`` is returned. Otherwise a new core name
    ``"<domain>_<count>"`` is generated and appended to the domain's record
    (together with zeroed point-length and index-size entries) through a
    POST to the admin core's ``/update`` endpoint.

    Returns:
        str: The existing or newly registered core name.
        None: Implicitly, when ``create_core(ADMIN_CORE)`` is falsy.

    Raises:
        RuntimeError: If the update request does not succeed. (The original
            ``raise "<string>"`` is not a valid exception and produced a
            TypeError that hid the message.)
    """
    # TODO strip trailing /

    if create_core(ADMIN_CORE):
        response = _get_domain_admin(domain)

        if response['response']['numFound'] == 0:
            # No record found for this domain: initialize a new one.
            count = 1
        else:
            # Check if this index already exists for this domain.
            record = response['response']['docs'][0]
            all_idx = record[ADMIN_F_IDX_LIST]
            if index_path in all_idx:
                index_arr = all_idx.index(index_path)
                return record[ADMIN_F_CORE_LIST][index_arr]
            # If not, create a new count for this index.
            count = len(all_idx) + 1

        # Get unique core name.
        core_name = "{0}_{1}".format(domain, count)
        payload = {
            "add": {
                "doc": {
                    "id": "{0}".format(domain),
                    ADMIN_F_IDX_LIST: {
                        "add": "{0}".format(index_path)
                    },
                    ADMIN_F_CORE_LIST: {
                        "add": core_name
                    },
                    ADMIN_F_PNT_LEN_LIST: {
                        "add": 0
                    },
                    ADMIN_F_IDX_SIZE_LIST: {
                        "add": 0
                    }
                }
            }
        }

        # NOTE(review): the body is the Python repr of the dict, not
        # json.dumps output -- confirm the update handler accepts it.
        r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE),
                          data=str(payload),
                          params=params,
                          headers=headers)

        print(r.text)
        if not r.ok:
            raise RuntimeError(
                "Can't create core with core name {0}".format(core_name))

        return core_name
Beispiel #4
0
def get_idx_details(domain, index_path):
    """Return the size of the original index and the points found so far.

    Both values are read from the domain's admin record at the position
    ``index_path`` has in ``ADMIN_F_IDX_LIST``.

    Returns:
        tuple: ``(index size, number of points)`` taken from
        ``ADMIN_F_IDX_SIZE_LIST`` and ``ADMIN_F_PNT_LEN_LIST``. ``(0, 0)``
        when the admin core is unavailable, the domain has no admin record,
        or the index is not registered.
    """
    if create_core(ADMIN_CORE):
        docs = _get_domain_admin(domain)['response']['docs']
        # A domain without an admin record yields no docs; use the (0, 0)
        # fallback instead of raising IndexError on docs[0].
        if docs:
            record = docs[0]
            all_idx = record[ADMIN_F_IDX_LIST]
            if index_path in all_idx:
                index_arr = all_idx.index(index_path)
                return (record[ADMIN_F_IDX_SIZE_LIST][index_arr],
                        record[ADMIN_F_PNT_LEN_LIST][index_arr])

    return 0, 0
def delete_index_core(domain, index_path):
    """Remove ``index_path`` from the admin record of ``domain``.

    The entry at the index's position is deleted from every per-index list
    of the domain's admin document, and the modified document is posted
    back to the admin core's ``/update`` endpoint. The outcome of that
    request is only printed.

    Returns:
        str: A message when the domain has no admin record or the index is
        not registered for it.
        None: In every other case, including when
        ``create_core(ADMIN_CORE)`` is falsy.
    """
    # TODO strip trailing /

    if not create_core(ADMIN_CORE):
        return None

    admin_reply = _get_domain_admin(domain)

    # No record found for this domain.
    if admin_reply['response']['numFound'] == 0:
        return "No domain added with name - " + domain

    # Check that this index exists for this domain.
    new_doc = admin_reply['response']['docs'][0]
    registered = new_doc[ADMIN_F_IDX_LIST]
    if index_path not in registered:
        return "No index added with name {0} for domain {1} ".format(
            index_path, domain)

    position = registered.index(index_path)

    print(" ".join(["Data now -", str(new_doc)]))
    # Drop the same position from every per-index list, in this order.
    per_index_fields = (
        ADMIN_F_IDX_LIST,
        ADMIN_F_CORE_LIST,
        ADMIN_F_PNT_LEN_LIST,
        ADMIN_F_IDX_SIZE_LIST,
        ADMIN_F_IDX_FIELD_LIST,
        ADMIN_F_USER_LIST,
        ADMIN_F_PASSWD_LIST,
    )
    for field in per_index_fields:
        del new_doc[field][position]

    print(" ".join(["Updated data - ", str(new_doc)]))

    update_url = "{0}{1}/update".format(SOLR_URL, ADMIN_CORE)
    body = str({"add": {"doc": new_doc}})
    r = requests.post(update_url,
                      data=body,
                      params=params,
                      headers=headers)

    print(r.text)
    if r.ok:
        template = "Deleted index with name {0} for domain {1} "
    else:
        template = "Can't delete index with name {0} for domain {1} "
    print(template.format(index_path, domain))
Beispiel #6
0
def update_idx_details(domain, index_path, idx_size, pnt_size):
    """Update the original index size and the number of points found so far.

    Writes ``idx_size`` and ``pnt_size`` at the position ``index_path`` has
    in the domain's ``ADMIN_F_IDX_LIST`` and posts the full
    ``ADMIN_F_PNT_LEN_LIST`` / ``ADMIN_F_IDX_SIZE_LIST`` back to the admin
    core with ``"set"`` modifiers. When ``index_path`` is not registered the
    lists are posted back unchanged.

    Returns:
        bool: True when the update request succeeded; False when the admin
        core is unavailable or the domain has no admin record.

    Raises:
        RuntimeError: If the update request does not succeed. (The original
            ``raise "<string>"`` is not a valid exception and produced a
            TypeError that hid the message.)
    """
    if not create_core(ADMIN_CORE):
        return False

    docs = _get_domain_admin(domain)['response']['docs']
    if not docs:
        # No admin record for this domain: nothing to update (this used to
        # raise IndexError on docs[0]).
        return False

    record = docs[0]
    all_idx = record[ADMIN_F_IDX_LIST]

    if index_path in all_idx:
        index_arr = all_idx.index(index_path)
        record[ADMIN_F_PNT_LEN_LIST][index_arr] = pnt_size
        record[ADMIN_F_IDX_SIZE_LIST][index_arr] = idx_size

    payload = {
        "add": {
            "doc": {
                "id": "{0}".format(domain),
                ADMIN_F_PNT_LEN_LIST: {
                    "set": record[ADMIN_F_PNT_LEN_LIST]
                },
                ADMIN_F_IDX_SIZE_LIST: {
                    "set": record[ADMIN_F_IDX_SIZE_LIST]
                }
            }
        }
    }
    r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE),
                      data=str(payload),
                      params=params,
                      headers=headers)

    print(r.text)
    if not r.ok:
        print(payload)
        raise RuntimeError(
            "Can't update idx details with core name {0} - {1}".format(
                domain, index_path))

    return True
def update_idx_field_csv(domain, index_path, idx_field_csv):
    """Update the field CSV of the original index that is shown on popups.

    Stores ``idx_field_csv`` (formatted as a string) at the position
    ``index_path`` has in the domain's ``ADMIN_F_IDX_LIST`` and posts the
    full ``ADMIN_F_IDX_FIELD_LIST`` back to the admin core with a ``"set"``
    modifier. When ``index_path`` is not registered the list is posted back
    unchanged.

    Returns:
        bool: True when the update request succeeded; False when the admin
        core is unavailable or the domain lookup did not return exactly one
        document.

    Raises:
        RuntimeError: If the update request does not succeed. (The original
            ``raise "<string>"`` is not a valid exception and produced a
            TypeError that hid the message.)
    """
    if not create_core(ADMIN_CORE):
        return False

    docs = _get_domain_admin(domain)['response']['docs']
    if len(docs) != 1:
        return False

    record = docs[0]
    all_idx = record[ADMIN_F_IDX_LIST]

    if index_path in all_idx:
        index_arr = all_idx.index(index_path)
        record[ADMIN_F_IDX_FIELD_LIST][index_arr] = "{0}".format(
            idx_field_csv)

    payload = {
        "add": {
            "doc": {
                "id": "{0}".format(domain),
                ADMIN_F_IDX_FIELD_LIST: {
                    "set": record[ADMIN_F_IDX_FIELD_LIST]
                }
            }
        }
    }
    r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE),
                      data=str(payload),
                      params=params,
                      headers=headers)

    print(r.text)
    if not r.ok:
        print(payload)
        raise RuntimeError(
            "Can't update idx details with core name {0} - {1}".format(
                domain, index_path))

    return True
Beispiel #8
0
def query_crawled_index(request, domain_name, indexed_path, username, passwd):
    """Geo-parse crawled data that has been indexed into Solr.

    Pages through the index at ``indexed_path`` ``QUERY_RANGE`` rows at a
    time, sends the flattened text of every document to the Tika server
    (``TIKA_SERVER``, ``/rmeta`` with the geotopic content type), converts
    the locations returned by ``parse_lat_lon`` into point dicts and passes
    them, keyed by the document's ``doi``, to ``IndexCrawledPoints`` for the
    core registered for the ``domain_name`` / ``indexed_path`` pair.
    ``gen_khooshe_update_admin`` is called at scheduled rows and once more
    after the last page.

    Args:
        request: Not used in the body; kept for the existing signature.
        domain_name: Domain whose admin record holds the core mapping.
        indexed_path: Base URL of the crawled index; must contain "solr"
            (case-insensitive).
        username: HTTP basic-auth user for the crawled index.
        passwd: HTTP basic-auth password for the crawled index.

    Returns:
        HttpResponse: 200 on success; the remote status code when the probe
        query to the crawled index is not 200; 500 for non-Solr paths, when
        the local core cannot be created, or on any processing error.
    """
    if "solr" not in indexed_path.lower():
        return HttpResponse(status=500,
                            content=("Only solr indexes supported for now"))

    # Query admin core to get core information for the
    # domain_name, indexed_path combination.
    core_name = get_index_core(domain_name, indexed_path)
    print(core_name)
    if not create_core(core_name):
        # This path used to fall off the end of the function and return None.
        return HttpResponse(
            status=500,
            content=("Error while creating core for geo parsed data."))

    # 1 query solr QUERY_RANGE records at a time
    # 2     Run GeotopicParser on each doc one at a time
    # 3     keep appending results
    # 4 Save it in local solr instance
    rows_processed = 0
    try:
        rows_processed = GetIndexSize(core_name)
    except Exception:
        # Best effort: start from row 0 when the size cannot be read.
        pass
    try:
        url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
        r = requests.get(url,
                         headers=headers,
                         auth=HTTPBasicAuth(username, passwd))

        if r.status_code != 200:
            return HttpResponse(status=r.status_code, content=r.reason)

        response = r.json()
        numFound = response['response']['numFound']
        print("Total number of records to be geotagged {0}".format(numFound))
        #gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
        # Row at which khooshe tiles are generated next.
        khooshe_gen_freq_l = rows_processed
        for row in range(rows_processed, int(numFound),
                         QUERY_RANGE):  # loop solr query
            if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                print("Generating khooshe tiles..")
                gen_khooshe_update_admin(core_name, domain_name,
                                         indexed_path, numFound)
                if (khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ):
                    khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                else:
                    khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
            else:
                print("Skip generating khooshe tiles.. row - {0}, next scheduled - {1} ".format(
                    row, khooshe_gen_freq_l))

            docs = {}
            url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(
                indexed_path, row, QUERY_RANGE)
            print("solr query - {0}".format(url))
            r = requests.get(url,
                             headers=headers,
                             auth=HTTPBasicAuth(username, passwd))
            response = r.json()
            text = response['response']['docs']
            docCount = 0
            for t in text:  # loop tika server starts
                points = []
                try:
                    docCount += 1
                    text_content = ''
                    try:
                        # Flatten every field value of the document into
                        # one ASCII string.
                        for v in t.values():
                            if (hasattr(v, '__iter__')):
                                a = u' '.join(unicode(item) for item in v)
                            elif (isinstance(v, unicode)):
                                a = v.encode('ascii', 'ignore')
                            else:
                                a = str(v)
                            text_content += a.encode('ascii', 'ignore')
                    except Exception:
                        print(traceback.format_exc())
                        text_content = str(t.values())

                    # simplify text
                    text_content = ' '.join(text_content.split())

                    parsed = callServer(
                        'put', TIKA_SERVER, '/rmeta', text_content, {
                            'Accept': 'application/json',
                            'Content-Type': 'application/geotopic'
                        }, False)
                    # SECURITY(review): eval() runs on the body returned by
                    # the Tika server; a JSON parser would be safer, but the
                    # response format is not visible here -- confirm before
                    # replacing.
                    location_names = parse_lat_lon(eval(parsed[1])[0])

                    for key, values in location_names.iteritems():
                        try:
                            # # TODO - ADD META DATA
                            points.append({
                                'loc_name': smart_str(key),
                                'position': {
                                    'x': smart_str(values[0]),
                                    'y': smart_str(values[1])
                                }
                            })
                        except Exception as e:
                            print("Error while transforming points ")
                            print(e)
                    print("Found {0} coordinates..".format(len(points)))
                    # print docs
                except Exception:
                    # A failure on one document must not stop the batch; the
                    # document is stored with the points collected so far.
                    print(traceback.format_exc())

                docs[str(t['doi'])] = points
                # loop tika server ends
            status = IndexCrawledPoints(core_name, docs)
            print(status)
            # loop solr query ends
        gen_khooshe_update_admin(core_name, domain_name, indexed_path,
                                 numFound)
        return HttpResponse(
            status=200,
            content=("Crawled data geo parsed successfully."))
    except Exception as e:
        print(traceback.format_exc())
        print(e)
        return HttpResponse(
            status=500,
            content=("Error while geo parsing crawled data."))
def get_index_core(domain, index_path, user="******", passwd="pass"):
    """Return core name and credentials registered for ``index_path``.

    Looks up the domain's record in the admin core. If ``index_path`` is
    already listed in ``ADMIN_F_IDX_LIST``, the stored core name, stored
    user and decrypted stored password are returned. Otherwise a new core
    name ``"<domain>_<count>"`` is generated and appended to the domain's
    record together with zeroed size entries, ``DEFAULT_IDX_FIELD``, the
    given ``user`` and the encrypted ``passwd``; ``ADMIN_F_COUNT`` is set to
    the new count.

    Args:
        domain: Domain whose admin record is read and updated.
        index_path: Index to look up or register.
        user: User name stored for a newly registered index.
        passwd: Password stored (via ``crypter.encrypt``) for a newly
            registered index.

    Returns:
        tuple: ``(core_name, user, passwd)`` -- the stored values for an
        existing index, or the new core name with the given credentials.
        None: Implicitly, when ``create_core(ADMIN_CORE)`` is falsy.

    Raises:
        RuntimeError: If the update request does not succeed. (The original
            ``raise "<string>"`` is not a valid exception and produced a
            TypeError that hid the message.)
    """
    # TODO strip trailing /

    if create_core(ADMIN_CORE):
        response = _get_domain_admin(domain)

        if response['response']['numFound'] == 0:
            # No record found for this domain: initialize a new one.
            count = 1
        else:
            # Check if this index already exists for this domain.
            record = response['response']['docs'][0]
            all_idx = record[ADMIN_F_IDX_LIST]
            count = record[ADMIN_F_COUNT][0]
            if index_path in all_idx:
                index_arr = all_idx.index(index_path)
                core_name = record[ADMIN_F_CORE_LIST][index_arr]
                # todo encrypt it
                stored_user = record[ADMIN_F_USER_LIST][index_arr]
                stored_passwd = record[ADMIN_F_PASSWD_LIST][index_arr]
                # Return existing core name with user name and password.
                return core_name, stored_user, crypter.decrypt(stored_passwd)
            # If not, create a new count for this index.
            print("No existing core found for  {0} {1}".format(
                domain, index_path))
            count = count + 1

        # Get unique core name.
        core_name = "{0}_{1}".format(domain, count)
        payload = {
            "add": {
                "doc": {
                    "id": "{0}".format(domain),
                    ADMIN_F_IDX_LIST: {
                        "add": "{0}".format(index_path)
                    },
                    ADMIN_F_CORE_LIST: {
                        "add": core_name
                    },
                    ADMIN_F_PNT_LEN_LIST: {
                        "add": 0
                    },
                    ADMIN_F_IDX_SIZE_LIST: {
                        "add": 0
                    },
                    ADMIN_F_IDX_FIELD_LIST: {
                        "add": DEFAULT_IDX_FIELD
                    },
                    ADMIN_F_USER_LIST: {
                        "add": "{0}".format(user)
                    },
                    ADMIN_F_PASSWD_LIST: {
                        "add": "{0}".format(crypter.encrypt(passwd))
                    },
                    ADMIN_F_COUNT: {
                        "set": count
                    }
                }
            }
        }

        # NOTE(review): the body is the Python repr of the dict, not
        # json.dumps output -- confirm the update handler accepts it.
        r = requests.post("{0}{1}/update".format(SOLR_URL, ADMIN_CORE),
                          data=str(payload),
                          params=params,
                          headers=headers)

        print(r.text)
        if not r.ok:
            raise RuntimeError(
                "Can't create core with core name {0}".format(core_name))

        # Return newly created core.
        return core_name, user, passwd