def tikaGeoExtract(esHit, geoField):
    # Pull the text to geotag out of the Elasticsearch hit.
    text = getField(geoField, esHit['_source'])
    if text is None:
        return None
    # Send the text to Tika's /rmeta endpoint as application/geotopic
    # so the GeoTopicParser handles it.
    res = callServer('put', ServerEndpoint, '/rmeta', text,
                     {'Accept': 'application/json',
                      'Content-Type': 'application/geotopic'}, False)
    if res[0] != 200:
        return None
    return json.loads(res[1])[0]
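A hedged usage sketch for the function above; the Elasticsearch client es, the index name, and the description field are illustrative assumptions, not part of the original source:

# Hypothetical caller: geotag one field of each search hit.
# 'es', 'docs', and 'description' are made-up names for illustration.
hits = es.search(index='docs', body={'query': {'match_all': {}}})['hits']['hits']
for hit in hits:
    geo = tikaGeoExtract(hit, 'description')
    if geo is not None:
        # The returned dict carries GeoTopicParser metadata keys, e.g. the
        # Optional_NAMEn / Optional_LATITUDEn / Optional_LONGITUDEn family
        # parsed in the later examples.
        print(sorted(geo.keys()))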
Example #2
    def parse(self,
              option: str,
              url_or_path: str,
              server_endpoint: str = None,
              verbose: int = 0,
              tika_server_jar: str = None,
              response_mime_type: str = 'application/json',
              services: dict = None,
              raw_response: bool = False,
              extra_headers: Dict[str, str] = None) -> Dict:
        """
        Called from parse_file_on_server to parse a file by calling Tika
        running as a server.
        :param option: which service to call: 'meta', 'text' or 'all' (the default)
        :param url_or_path: local path (or URL) of the file being parsed
        :param server_endpoint: Tika server's URL
        :param verbose: make Tika produce a verbose log
        :param tika_server_jar: path to Tika's JAR file
        :param response_mime_type: response format; application/json returns plain text + metadata as JSON
        :param services: mapping of option names to Tika endpoints; defaults to {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
        :param raw_response: get the raw response from Tika (text + metadata + warnings), False by default
        :param extra_headers: extra request headers
        :return: dictionary with "content" (text) and "metadata" (another dictionary) keys
        """

        services = services if services else \
            {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
        tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
        server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

        path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
        service = services.get(option, services['all'])
        if service == '/tika':
            response_mime_type = 'text/plain'
        content_path = self.make_content_disposition_header(path)

        headers = {
            'Accept': response_mime_type,
            'Content-Disposition': content_path
        }
        if extra_headers:
            headers = {**headers, **extra_headers}

        # Use a context manager so the file handle is always closed,
        # even if callServer raises.
        with open(path, 'rb') as file_obj:
            status, response = callServer('put',
                                          server_endpoint,
                                          service,
                                          file_obj,
                                          headers,
                                          verbose,
                                          tika_server_jar,
                                          rawResponse=raw_response)

        if file_type == 'remote':
            os.unlink(path)
        return _parse((status, response))
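A hedged usage sketch; the wrapper class name TikaParser and the sample path are assumptions, since the listing only shows the method itself:

# Hypothetical caller -- class name and path are illustrative only.
parser = TikaParser()
result = parser.parse(option='all', url_or_path='/tmp/report.pdf')
print(result['metadata'])       # dict of Tika metadata
print(result['content'][:200])  # start of the extracted plain text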
Example #3
    def __call(self, params={}):
        # Open in binary mode so Tika receives the raw bytes.
        (status, response) = callServer('put', TIKA_SERVER, '/rmeta',
                                        open(self.file, 'rb'), params)

        if status != 200:
            # Raising a bare string is invalid in modern Python;
            # raise a real exception instead.
            raise Exception("Tika Parse Exception")

        d = self.modules["json"].loads(response)[0]

        if 'X-TIKA:content' in d:
            content = d.pop('X-TIKA:content')
        else:
            content = ''

        return {'metadata': d, 'content': content}
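For reference, the /rmeta endpoint used throughout these examples returns a JSON array with one metadata object per parsed document, which is why the snippets index [0] and pop X-TIKA:content for the extracted text. A runnable illustration with an invented payload:

import json

# Illustrative /rmeta response body (values invented).
sample = '[{"Content-Type": "application/pdf", "X-TIKA:content": "The extracted plain text..."}]'
d = json.loads(sample)[0]
content = d.pop('X-TIKA:content', '')
print({'metadata': d, 'content': content})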
Example #5
def find_location(request, file_name):
    '''
        Find location names in extracted text using Tika's GeoTopicParser.
    '''
    if "none" in IndexStatus("locations", file_name):
        text_content = QueryText(file_name)
        if text_content:
            parsed = callServer('put', TIKA_SERVER, '/rmeta', text_content,
                                {'Accept': 'application/json',
                                 'Content-Type': 'application/geotopic'}, False)
            # json.loads is safer than eval for parsing the JSON response.
            points = parse_lat_lon(json.loads(parsed[1])[0])

            status = IndexLocationName(file_name, points)
            if status[0]:
                return HttpResponse(status=200, content="Location/s found and index to Solr.")
            else:
                return HttpResponse(status=400, content=status[1])
        else:
            return HttpResponse(status=400, content="Cannot find location.")
    else:
        return HttpResponse(status=200, content="Loading...")
Example #8
def query_crawled_index(request, domain_name, indexed_path):
    '''
        Query crawled data that has been indexed into
        Solr or Elasticsearch and return location names.
    '''
    if "solr" in indexed_path.lower():
        '''
        Query admin core to get core information for domain_name, indexed_path combination
        '''
        core_name, username, passwd = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2     Run GeotopicParser on each doc one at a time
            # 3     keep appending results 
            # 4 Save it in local solr instance
            rows_processed = 0
            try:
                rows_processed = GetIndexSize(core_name)
            except Exception:
                # Fall back to 0 if the local core's size cannot be read yet.
                pass
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
                
                if r.status_code != 200:
                    return HttpResponse(status=r.status_code, content=r.reason)

                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(numFound)
                #gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                khooshe_gen_freq_l = rows_processed 
                for row in range(rows_processed, int(numFound), QUERY_RANGE):  # loop solr query
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                        if (khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ):
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1} ".format(row,khooshe_gen_freq_l)

                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url, headers=headers, auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:  # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                for v in t.values():
                                    if(hasattr(v, '__iter__')):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif(isinstance(v, unicode)):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += ' ' + a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())

                            # simplify text
                            text_content = ' '.join(text_content.split())
                            parsed = callServer('put', TIKA_SERVER, '/rmeta', text_content, {'Accept': 'application/json', 'Content-Type': 'application/geotopic'}, False)
                            # json.loads is safer than eval for the JSON response.
                            location_names = parse_lat_lon(json.loads(parsed[1])[0])

                            for key, values in location_names.iteritems():
                                try:
                                    # # TODO - ADD META DATA
                                    points.append(
                                        {'loc_name': smart_str(key),
                                        'position':{
                                            'x': smart_str(values[0]),
                                            'y': smart_str(values[1])
                                        }
                                        }
                                    )
                                except Exception as e:
                                    print "Error while transforming points"
                                    print e
                            print "Found {0} coordinates..".format(len(points))
                        except Exception as e:
                            print traceback.format_exc()

                        docs[str(t['id'])] = points
                        # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                    # loop solr query ends
                gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                return HttpResponse(status=200, content= ("Crawled data geo parsed successfully."))
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(status=500, content="Error while geo parsing crawled data.")

    else:
        return HttpResponse(status=500, content="Only Solr indexes are supported for now.")
Example #9
def extract_geo_from_doc(doc, tika):
    """
    Uses Tika to extract geo data and returns an updated copy of the document.

    This script utilizes GeoTopicParser: https://wiki.apache.org/tika/GeoTopicParser
    Make sure to follow the set up steps at the link above to set up location-ner-model and geotopic-mime.

    This script queries documents from the Solr source core and extracts geo data from the document's 'content' field.
    GeoTopicParser returns a set of location names and coordinates. This information is added to each document in the
    fields 'location_names' and 'location_coordinates' respectively, and the new document is indexed to the destination
    core.

    Make sure you have:
      - a lucene-geo-gazetteer server running.
      - a Tika server running with location-ner-model and geotopic-mime in the classpath. Set the Tika server url below.
      - a Solr server running. Set the source core url (the core of documents you want to process)
          and the destination core url (where the enriched documents will go).

    This script assumes Tika is set up with GeoTopicParser. If any of the above is not satisfied, documents will still be
    indexed into the destination core, but will not have any new data. If you do not configure your own Tika server,
    tika-python will attempt to start its own server, which will not have the necessary items in the classpath.
    Additionally, the gazetteer must be running for the Geo extractions to be performed.

    :param doc: A document from solr, as a dict
    :return: Updated document, as a dict, or None if error
    """
    try:
        names = []
        coords = []
        if 'content' in doc:
            content = doc['content']
            if isinstance(content, list):
                for c in content:
                    res = callServer('put', tika, '/rmeta', c, {'Accept' : 'application/json', 'Content-Type' : 'application/geotopic'}, False)
                    if res[0] == 200:
                        parsed = res[1]
                        parsed_json = json.loads(parsed)
                        for item in parsed_json:
                            # Group long/lat/name by index
                            geo_groups = {}
                            for key in item:
                                reg = re.findall(r'Optional_([a-zA-Z]+)(\d+)', key)
                                if reg:
                                    attr = reg[0][0].lower()
                                    n = reg[0][1]
                                    if n not in geo_groups:
                                        geo_groups[n] = {}
                                    geo_groups[n][attr] = item[key]

                            # items() works on both Python 2 and 3 (iteritems is Python 2 only).
                            for key, value in geo_groups.items():
                                geokeys = value.keys()
                                if 'name' in geokeys:
                                    names.append(value['name'])
                                lat = ""
                                longd = ""
                                if 'latitude' in geokeys:
                                    lat = str(value['latitude'])
                                if 'longitude' in geokeys:
                                    longd = str(value['longitude'])
                                coords.append(lat + ',' + longd)

        # now we have all names grouped, all coordinates grouped
        enriched = copy.deepcopy(doc)
        enriched['location_name'] = names
        enriched['location_coordinates'] = coords

        return enriched
    except Exception as e:
        # e.message is not available in Python 3; print the exception itself.
        print(e)
        return None
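A quick illustration of the contract above, with invented values; running it for real requires the Tika server and gazetteer described in the docstring, and the localhost URL is an assumption:

# Hypothetical input document and enriched output (values invented).
doc = {'id': 'doc-1', 'content': ['Flooding reported near Pasadena and Houston.']}
enriched = extract_geo_from_doc(doc, 'http://localhost:9998')
# With a working GeoTopicParser setup, 'enriched' would gain entries such as:
#   enriched['location_name']        -> ['Pasadena', 'Houston']
#   enriched['location_coordinates'] -> ['34.147,-118.144', '29.763,-95.363']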
Example #10
def query_crawled_index(request, domain_name, indexed_path, username, passwd):
    '''
        Query crawled data that has been indexed into
        Solr or Elasticsearch and return location names.
    '''
    if "solr" in indexed_path.lower():
        '''
        Query admin core to get core information for domain_name, indexed_path combination
        '''
        core_name = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2     Run GeotopicParser on each doc one at a time
            # 3     keep appending results
            # 4 Save it in local solr instance
            rows_processed = 0
            try:
                rows_processed = GetIndexSize(core_name)
            except Exception:
                # Fall back to 0 if the local core's size cannot be read yet.
                pass
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url,
                                 headers=headers,
                                 auth=HTTPBasicAuth(username, passwd))

                if r.status_code != 200:
                    return HttpResponse(status=r.status_code, content=r.reason)

                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(
                    numFound)
                #gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                khooshe_gen_freq_l = rows_processed
                for row in range(rows_processed, int(numFound),
                                 QUERY_RANGE):  # loop solr query
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name,
                                                 indexed_path, numFound)
                        if (khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ):
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1} ".format(
                            row, khooshe_gen_freq_l)

                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(
                        indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url,
                                     headers=headers,
                                     auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:  # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                for v in t.values():
                                    if (hasattr(v, '__iter__')):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif (isinstance(v, unicode)):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())

                            # simplify text
                            text_content = ' '.join(text_content.split())

                            parsed = callServer(
                                'put', TIKA_SERVER, '/rmeta', text_content, {
                                    'Accept': 'application/json',
                                    'Content-Type': 'application/geotopic'
                                }, False)
                            # json.loads is safer than eval for the JSON response.
                            location_names = parse_lat_lon(json.loads(parsed[1])[0])

                            for key, values in location_names.iteritems():
                                try:
                                    # # TODO - ADD META DATA
                                    points.append({
                                        'loc_name': smart_str(key),
                                        'position': {
                                            'x': smart_str(values[0]),
                                            'y': smart_str(values[1])
                                        }
                                    })
                                except Exception as e:
                                    print "Error while transforming points"
                                    print e
                            print "Found {0} coordinates..".format(len(points))
                        except Exception as e:
                            print traceback.format_exc()

                        docs[str(t['doi'])] = points
                        # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                    # loop solr query ends
                gen_khooshe_update_admin(core_name, domain_name, indexed_path,
                                         numFound)
                return HttpResponse(
                    status=200,
                    content="Crawled data geo parsed successfully.")
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(
                    status=500,
                    content="Error while geo parsing crawled data.")

    else:
        return HttpResponse(status=500,
                            content="Only Solr indexes are supported for now.")