class SearchifyIndex():
    """Manager for the Searchify index.

    Wraps an ``ApiClient`` built from ``settings.SEARCHIFY`` and exposes
    the handful of index operations used by the application.
    """

    def __init__(self):
        searchify_conf = settings.SEARCHIFY
        self.api_client = ApiClient(searchify_conf['api_url'][0])
        self.index = self.api_client.get_index(searchify_conf['index'])

    def indexit(self, docs):
        """Add ``docs`` to the index.

        Returns ``False`` (after logging an error) when ``docs`` is empty,
        otherwise whatever ``index.add_documents`` returns.
        """
        log = logging.getLogger(__name__)
        if docs:
            log.info('Add to index %s' % str(docs))
            return self.index.add_documents(docs)
        log.error("Ningun documento para indexar")
        return False

    def count(self):
        """Return the size reported by the index."""
        return self.index.get_size()

    def delete_documents(self, docs):
        """Stubbed bulk delete: ``None`` for empty ``docs``, else ``True``.

        Returns True temporarily until Haystack is implemented.  The real
        call to ``index.delete_documents`` (which collected the responses
        whose 'deleted' flag was false and reported success only when that
        list was empty) was commented out because of an IndexTank failure
        when deleting from the index.
        TODO: the single-document ``delete_document`` method could be used
        for a second attempt on the failed documents.
        """
        return True if docs else None

    def delete_document(self, doc):
        """Stubbed single delete.

        Returns True temporarily until Haystack is implemented.
        """
        return True
class RoutineBackend(object):
    """CherryPy backend answering JSONP requests for business search and routing.

    Every handler returns JavaScript source: the JSON payload wrapped in a
    call to the client-supplied callback name (``cback``).

    NOTE(review): ``cback``, the search terms and the fetched field values
    are interpolated into JavaScript/HTML/query strings without escaping.
    These values come from the HTTP request and the index, so this deserves
    a security review; the output format is left unchanged here.
    """

    def __init__(self):
        self.api = ApiClient('http://indxtank-api-key')
        # Stored under a private name on purpose: an instance attribute
        # called ``index`` would shadow the exposed ``index`` handler below
        # and make it unreachable.
        self._search_index = self.api.get_index('index-name')
        self.sierpinski = Sierpinski('co.txt')

    @cherrypy.expose
    def index(self, _):
        """Return a JavaScript snippet calling ``show_search``.

        ``_`` is accepted and ignored (presumably a cache-busting request
        parameter -- confirm against the client).
        """
        return "show_search('Hello World!')"

    @cherrypy.expose
    def sort(self, lathome, lnghome, stops, cback, _=''):
        """Order the requested stops into a route starting from home.

        Args:
            lathome, lnghome: home coordinates, parsed with ``float``.
            stops: comma-separated values, each searched as ``uuid:<value>``.
            cback: name of the JavaScript callback wrapping the response.
            _: ignored.

        Returns:
            ``cback({"bss": <shortest_route() result>});`` as a string.

        Raises:
            ValueError: if a coordinate cannot be parsed as a float.
            IndexError: if a stop yields no search results.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        home = {
            'lat': float(lathome),
            'lng': float(lnghome),
            'uid': 0,
            'bubble': 'home',
        }
        bubble_template = """<h3>%(name)s</h3> <p>%(address)s, %(city)s, %(state)s</p>"""
        self.sierpinski.reset()
        self.sierpinski.add_point(home)
        for stop in stops.split(','):
            # Only the first hit for each uuid is used.
            hit = self._search_index.search(
                'uuid:' + stop,
                fetch_fields=[
                    'name', 'address', 'city', 'state', 'telephone',
                    'website', 'latitude', 'longitude', 'docid'
                ])['results'][0]
            # Single quotes are dropped from the name before it is placed
            # in the bubble markup.
            hit['name'] = hit['name'].replace("'", "")
            bubble = bubble_template % hit
            if hit['telephone'] != '':
                bubble += '<p>Telephone: ' + hit['telephone'] + '</p>'
            if hit['website'] != '':
                bubble += ('<p>Website: <a href="' + hit['website'] +
                           '" target="_blank">' + hit['website'] + '</a></p>')
            self.sierpinski.add_point({
                'bubble': bubble,
                'lat': float(hit['latitude']),
                'lng': float(hit['longitude']),
                'uid': hit['docid'],
            })
        data = {'bss': self.sierpinski.shortest_route()}
        return cback + "(" + json.dumps(data) + ");"

    @cherrypy.expose
    def search(self, nameortype, address, lat, lng, cback, _=''):
        """Search businesses by name/category and, optionally, address/city.

        Args:
            nameortype: term matched against the ``name`` and ``category`` fields.
            address: term matched against ``address`` and ``city``; may be empty.
            lat, lng: coordinates passed to the index as query variables 0 and 1.
            cback: name of the JavaScript callback wrapping the response.
            _: ignored.

        Returns:
            ``cback({"bss": [...]});`` where each entry carries ``bubble``
            (HTML), ``lat``, ``lng`` and ``uid``.  An empty list is returned
            without querying when both search terms are empty.

        Raises:
            ValueError: if ``lat`` or ``lng`` cannot be parsed as a float.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        if nameortype == '' and address == '':
            return cback + '(' + json.dumps({'bss': []}) + ');'
        query = 'name:"' + nameortype + '" OR category:"' + nameortype + '"'
        if address:
            query = ('( ' + query + ' ) AND ( address:"' + address +
                     '" OR city:"' + address + '" )')
        # Scoring function 1 is presumably registered on the index
        # beforehand; a disabled call here used to define it as
        # '-miles(query.var[0], query.var[1], doc.var[0], doc.var[1])'
        # -- confirm it exists on the target index.
        results = self._search_index.search(
            query,
            fetch_fields=[
                'name', 'address', 'city', 'state', 'latitude', 'longitude',
                'docid'
            ],
            variables={0: float(lat), 1: float(lng)},
            scoring_function=1)
        binfo_template = """<h3>%(name)s</h3> <p>%(address)s, %(city)s, %(state)s</p>"""
        # The bubble shows the business info plus an "Add stop" link whose
        # JavaScript argument repeats the same info markup.
        bubble_template = binfo_template
        bubble_template += """<p class='align-right'> (<a href='javascript:add_stop("%(docid)s", " """ + binfo_template + """ ");'>Add stop</a>) </p>"""
        resp = []
        for bs in results['results']:
            bs['name'] = bs['name'].replace("'", "")
            resp.append({
                'bubble': bubble_template % bs,
                'lat': bs['latitude'],
                'lng': bs['longitude'],
                'uid': bs['docid'],
            })
        return cback + '(' + json.dumps({'bss': resp}) + ');'
#! /usr/bin/python
# For every company named in companies.txt, fetch its CrunchBase record and
# write the company name, homepage URL and category code to intermediate.txt
# (one space-separated "key:value" run per company).
import string
import re
import urllib
from indextank.client import ApiClient

indexapi = ApiClient('http://:[email protected]')
index = indexapi.get_index('techcrunch')

# Compiled once rather than on every input line.
name_splitter = re.compile(r'{"name":')
homepage_pattern = re.compile('("homepage_url"):(.*)')
category_pattern = re.compile('("category_code"):(.*)')

# NOTE(review): i, index, homepage, jobs and category are not used by the
# visible part of this script; they are kept in case later code relies on
# them -- confirm.
i = 0

# ``with`` guarantees both files are closed, also when a request fails.
with open("companies.txt") as f, open("intermediate.txt", 'w') as fw:
    for line in f:
        parts = name_splitter.split(line.rstrip('"\n'))
        if len(parts) <= 1:
            # Line carries no {"name": ...} entry.
            continue

        companyid = parts[1].rstrip('",').lstrip(' "')
        url = "http://api.crunchbase.com/v/1/company/" + companyid + ".js"
        print(url)
        print("Company:" + companyid)
        fw.write("Company:" + companyid + " ")

        page = urllib.urlopen(url)
        try:
            buf = page.read()
        finally:
            page.close()

        # Any response mentioning "error" ends this company's record.
        if re.search("error", buf):
            fw.write("\n")
            continue

        # As before, a response without the expected key raises
        # AttributeError on the missing match.
        match = homepage_pattern.search(buf)
        homepage = match.group(2).strip(' "|",') + "\n"
        fw.write(match.group(1).strip('"') + ":" +
                 match.group(2).strip(' "|",') + " ")
        jobs = match.group(2).rstrip(',"') + "/jobs"

        match = category_pattern.search(buf)
        category = match.group(2).strip(' "|",')
        fw.write(match.group(1).strip('"') + ":" + category + " ")
class SearchIndexer:
    """Send site content (HTML articles/pages and static PDFs) to a Searchify index.

    Documents are stored under their path relative to ``source_root`` with
    the fields ``text``, ``title`` and ``timestamp``.
    """

    def __init__(self, index_name, source_root, api_url):
        """Open the index called ``index_name``, creating it when lookup fails.

        Exits the process with status 1 if no index object could be obtained.
        """
        self.index_name = index_name
        self.source_root = source_root
        self.root_len = len(source_root)
        self.api = ApiClient(api_url)
        index = None
        try:
            index = self.api.get_index(index_name)
            logger.debug('Found searchify index %s' % index_name)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            public_search_enabled = True
            logger.debug('Creating searchify index %s' % index_name)
            index = self.api.create_index(index_name, public_search_enabled)
            # Poll until the freshly created index reports it has started.
            while not index.has_started():
                time.sleep(0.5)
            logger.debug('Searchify index %s started' % index_name)
        if index is None:
            logger.fatal('Could not create or get index %s' % index_name)
            sys.exit(1)
        self.index = index

    def _add_document(self, content, path, text):
        """Store ``text`` under docid ``path`` with the content's title and timestamp."""
        # Available metadata: 'author', 'basename_raw', 'date',
        # 'email', 'exported_type', 'modified', 'relative_url',
        # 'slug', 'source_id', 'source_type', 'summary', 'template',
        # 'sorted_title', 'sort_priority', 'title', 'version'
        title = content.metadata['title']
        # NOTE(review): time.mktime interprets the time tuple as local time,
        # while the original comment here said "Works with UTC datetimes"
        # -- confirm the timezone of metadata['modified'].
        timestamp = int(time.mktime(content.metadata['modified'].timetuple()))
        # TODO: variables = { 0: rating, 1: reputation, 2: visits }
        self.index.add_document(path, {
            'text': text,
            'title': title,
            'timestamp': timestamp
        })

    def _index_html(self, content, path):
        """Index the visible text of an HTML content object."""
        if content.content is None:
            logger.debug('skipping html index for %s - no content' % path)
            return
        # Remove all script and style elements before extracting the text.
        soup = BeautifulSoup(content.content)
        for script in soup(['script', 'style']):
            script.extract()
        self._add_document(content, path, soup.get_text())

    def _index_pdf(self, content, path):
        """Index the text of every page of the PDF found at ``path``."""
        fpath = os.path.join(self.source_root, path)
        if not os.path.exists(fpath):
            logger.error('Indexer: Cannot read pdf at %s' % fpath)
            return
        # PDFs are binary data, so the file is opened in 'rb' mode.
        with open(fpath, 'rb') as f:
            pages = list(slate.PDF(f))
        if not pages:
            logger.debug('skipping pdf index for %s - no pages' % path)
            return
        # All pages go into one document. Writing each page separately
        # under the same docid left only the last page's text indexed.
        self._add_document(content, path, '\n'.join(pages))

    def index_content(self, content):
        """Dispatch ``content`` to the matching indexer based on its class name.

        'Article' and 'Page' objects are indexed as HTML; 'Static' objects
        are indexed only when their extension is '.pdf'.  Absolute source
        paths outside ``source_root`` are skipped; those inside it are made
        relative by cutting off the root prefix.
        """
        content_type = content.__class__.__name__
        source_path = content.source_path
        if source_path[:1] == '/':
            if not source_path.startswith(self.source_root):
                logger.debug('skipping out-of-path content %s, source %s' %
                             (content_type, source_path))
                return
            source_path = source_path[self.root_len:]
        if content_type in ('Article', 'Page'):
            self._index_html(content, source_path)
        elif content_type == 'Static':
            extension = os.path.splitext(source_path)[1]
            if extension == '.pdf':
                self._index_pdf(content, source_path)
            else:
                logger.debug('skipping unknown static type, source %s' %
                             source_path)
        else:
            logger.debug('skipping unknown content %s, source %s' %
                         (content_type, source_path))
import csv
from indextank.client import ApiClient

# Reads the US POI/business listing CSV, keeps the Massachusetts rows and
# prints how many of them carry coordinates and a category.
api = ApiClient('http://:indextank-api-key')
index = api.get_index('index-name')

categories = set()
total = 0   # rows whose state is 'MA'
valid = 0   # 'MA' rows with longitude, latitude and category all non-empty
bss = {}    # valid rows keyed by their uuid

# ``with`` closes the CSV file once every row has been read.
with open('US_POI_and_Business_Listings_(Beta).csv') as listings:
    reader = csv.DictReader(listings, delimiter=',')
    for business in reader:
        if business['state'] != 'MA':
            continue
        total += 1
        if (business['longitude'] != '' and business['latitude'] != ''
                and business['category'] != ''):
            categories.add(business['category'])
            bss[business['uuid']] = business
            valid += 1

print("Categories: %d" % len(categories))
print("Total: %d" % total)
print("Valid: %d" % valid)
# Report 0.00 instead of dividing by zero when no 'MA' row was found.
accepted = (100.0 * valid) / total if total else 0.0
print("Accepted percentage: %.2f" % accepted)

# NOTE(review): not used by the visible part of the script; kept in case
# later code relies on it -- confirm.
i = 0
import csv
from indextank.client import ApiClient

# Reads the US POI/business listing CSV, keeps the Massachusetts rows and
# prints how many of them carry coordinates and a category.
api = ApiClient('http://:indextank-api-key')
index = api.get_index('index-name')

categories = set()
total = 0   # rows whose state is 'MA'
valid = 0   # 'MA' rows with longitude, latitude and category all non-empty
bss = {}    # valid rows keyed by their uuid

# ``with`` closes the CSV file once every row has been read.
with open('US_POI_and_Business_Listings_(Beta).csv') as listings:
    reader = csv.DictReader(listings, delimiter=',')
    for business in reader:
        if business['state'] != 'MA':
            continue
        total += 1
        if (business['longitude'] != '' and business['latitude'] != ''
                and business['category'] != ''):
            categories.add(business['category'])
            bss[business['uuid']] = business
            valid += 1

print("Categories: %d" % len(categories))
print("Total: %d" % total)
print("Valid: %d" % valid)
# Report 0.00 instead of dividing by zero when no 'MA' row was found.
accepted = (100.0 * valid) / total if total else 0.0
print("Accepted percentage: %.2f" % accepted)

# NOTE(review): not used by the visible part of the script; kept in case
# later code relies on it -- confirm.
i = 0
class RoutineBackend(object):
    """CherryPy backend answering JSONP requests for business search and routing.

    Every handler returns JavaScript source: the JSON payload wrapped in a
    call to the client-supplied callback name (``cback``).

    NOTE(review): ``cback``, the search terms and the fetched field values
    are interpolated into JavaScript/HTML/query strings without escaping.
    These values come from the HTTP request and the index, so this deserves
    a security review; the output format is left unchanged here.
    """

    def __init__(self):
        self.api = ApiClient('http://indxtank-api-key')
        # Stored under a private name on purpose: an instance attribute
        # called ``index`` would shadow the exposed ``index`` handler below
        # and make it unreachable.
        self._search_index = self.api.get_index('index-name')
        self.sierpinski = Sierpinski('co.txt')

    @cherrypy.expose
    def index(self, _):
        """Return a JavaScript snippet calling ``show_search``.

        ``_`` is accepted and ignored (presumably a cache-busting request
        parameter -- confirm against the client).
        """
        return "show_search('Hello World!')"

    @cherrypy.expose
    def sort(self, lathome, lnghome, stops, cback, _=''):
        """Order the requested stops into a route starting from home.

        Args:
            lathome, lnghome: home coordinates, parsed with ``float``.
            stops: comma-separated values, each searched as ``uuid:<value>``.
            cback: name of the JavaScript callback wrapping the response.
            _: ignored.

        Returns:
            ``cback({"bss": <shortest_route() result>});`` as a string.

        Raises:
            ValueError: if a coordinate cannot be parsed as a float.
            IndexError: if a stop yields no search results.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        home = {
            'lat': float(lathome),
            'lng': float(lnghome),
            'uid': 0,
            'bubble': 'home',
        }
        bubble_template = """<h3>%(name)s</h3> <p>%(address)s, %(city)s, %(state)s</p>"""
        self.sierpinski.reset()
        self.sierpinski.add_point(home)
        for stop in stops.split(','):
            # Only the first hit for each uuid is used.
            hit = self._search_index.search(
                'uuid:' + stop,
                fetch_fields=[
                    'name', 'address', 'city', 'state', 'telephone',
                    'website', 'latitude', 'longitude', 'docid'
                ])['results'][0]
            # Single quotes are dropped from the name before it is placed
            # in the bubble markup.
            hit['name'] = hit['name'].replace("'", "")
            bubble = bubble_template % hit
            if hit['telephone'] != '':
                bubble += '<p>Telephone: ' + hit['telephone'] + '</p>'
            if hit['website'] != '':
                bubble += ('<p>Website: <a href="' + hit['website'] +
                           '" target="_blank">' + hit['website'] + '</a></p>')
            self.sierpinski.add_point({
                'bubble': bubble,
                'lat': float(hit['latitude']),
                'lng': float(hit['longitude']),
                'uid': hit['docid'],
            })
        data = {'bss': self.sierpinski.shortest_route()}
        return cback + "(" + json.dumps(data) + ");"

    @cherrypy.expose
    def search(self, nameortype, address, lat, lng, cback, _=''):
        """Search businesses by name/category and, optionally, address/city.

        Args:
            nameortype: term matched against the ``name`` and ``category`` fields.
            address: term matched against ``address`` and ``city``; may be empty.
            lat, lng: coordinates passed to the index as query variables 0 and 1.
            cback: name of the JavaScript callback wrapping the response.
            _: ignored.

        Returns:
            ``cback({"bss": [...]});`` where each entry carries ``bubble``
            (HTML), ``lat``, ``lng`` and ``uid``.  An empty list is returned
            without querying when both search terms are empty.

        Raises:
            ValueError: if ``lat`` or ``lng`` cannot be parsed as a float.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        if nameortype == '' and address == '':
            return cback + '(' + json.dumps({'bss': []}) + ');'
        query = 'name:"' + nameortype + '" OR category:"' + nameortype + '"'
        if address:
            query = ('( ' + query + ' ) AND ( address:"' + address +
                     '" OR city:"' + address + '" )')
        # Scoring function 1 is presumably registered on the index
        # beforehand; a disabled call here used to define it as
        # '-miles(query.var[0], query.var[1], doc.var[0], doc.var[1])'
        # -- confirm it exists on the target index.
        results = self._search_index.search(
            query,
            fetch_fields=[
                'name', 'address', 'city', 'state', 'latitude', 'longitude',
                'docid'
            ],
            variables={0: float(lat), 1: float(lng)},
            scoring_function=1)
        binfo_template = """<h3>%(name)s</h3> <p>%(address)s, %(city)s, %(state)s</p>"""
        # The bubble shows the business info plus an "Add stop" link whose
        # JavaScript argument repeats the same info markup.
        bubble_template = binfo_template
        bubble_template += """<p class='align-right'> (<a href='javascript:add_stop("%(docid)s", " """ + binfo_template + """ ");'>Add stop</a>) </p>"""
        resp = []
        for bs in results['results']:
            bs['name'] = bs['name'].replace("'", "")
            resp.append({
                'bubble': bubble_template % bs,
                'lat': bs['latitude'],
                'lng': bs['longitude'],
                'uid': bs['docid'],
            })
        return cback + '(' + json.dumps({'bss': resp}) + ');'
def defineIndex():
    """Create the API client and look up the configured Searchify index.

    Returns:
        A dict with two entries: "api" (the ``ApiClient`` instance) and
        "index" (the result of ``get_index`` for ``settings.SEARCHIFY_INDEX``).
    """
    handles = {"api": ApiClient('http://:[email protected]')}
    handles["index"] = handles["api"].get_index(settings.SEARCHIFY_INDEX)
    return handles
class SearchIndexer:
    """Send site content (HTML articles/pages and static PDFs) to a Searchify index.

    Documents are stored under their path relative to ``source_root`` with
    the fields ``text``, ``title`` and ``timestamp``.
    """

    def __init__(self, index_name, source_root, api_url):
        """Open the index called ``index_name``, creating it when lookup fails.

        Exits the process with status 1 if no index object could be obtained.
        """
        self.index_name = index_name
        self.source_root = source_root
        self.root_len = len(source_root)
        self.api = ApiClient(api_url)
        index = None
        try:
            index = self.api.get_index(index_name)
            logger.debug('Found searchify index %s' % index_name)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            public_search_enabled = True
            logger.debug('Creating searchify index %s' % index_name)
            index = self.api.create_index(index_name, public_search_enabled)
            # Poll until the freshly created index reports it has started.
            while not index.has_started():
                time.sleep(0.5)
            logger.debug('Searchify index %s started' % index_name)
        if index is None:
            logger.fatal('Could not create or get index %s' % index_name)
            sys.exit(1)
        self.index = index

    def _add_document(self, content, path, text):
        """Store ``text`` under docid ``path`` with the content's title and timestamp."""
        # Available metadata: 'author', 'basename_raw', 'date',
        # 'email', 'exported_type', 'modified', 'relative_url',
        # 'slug', 'source_id', 'source_type', 'summary', 'template',
        # 'sorted_title', 'sort_priority', 'title', 'version'
        title = content.metadata['title']
        # NOTE(review): time.mktime interprets the time tuple as local time,
        # while the original comment here said "Works with UTC datetimes"
        # -- confirm the timezone of metadata['modified'].
        timestamp = int(time.mktime(content.metadata['modified'].timetuple()))
        # TODO: variables = { 0: rating, 1: reputation, 2: visits }
        self.index.add_document(path, {
            'text': text,
            'title': title,
            'timestamp': timestamp
        })

    def _index_html(self, content, path):
        """Index the visible text of an HTML content object."""
        if content.content is None:
            logger.debug('skipping html index for %s - no content' % path)
            return
        # Remove all script and style elements before extracting the text.
        soup = BeautifulSoup(content.content)
        for script in soup(['script', 'style']):
            script.extract()
        self._add_document(content, path, soup.get_text())

    def _index_pdf(self, content, path):
        """Index the text of every page of the PDF found at ``path``."""
        fpath = os.path.join(self.source_root, path)
        if not os.path.exists(fpath):
            logger.error('Indexer: Cannot read pdf at %s' % fpath)
            return
        # PDFs are binary data, so the file is opened in 'rb' mode.
        with open(fpath, 'rb') as f:
            pages = list(slate.PDF(f))
        if not pages:
            logger.debug('skipping pdf index for %s - no pages' % path)
            return
        # All pages go into one document. Writing each page separately
        # under the same docid left only the last page's text indexed.
        self._add_document(content, path, '\n'.join(pages))

    def index_content(self, content):
        """Dispatch ``content`` to the matching indexer based on its class name.

        'Article' and 'Page' objects are indexed as HTML; 'Static' objects
        are indexed only when their extension is '.pdf'.  Absolute source
        paths outside ``source_root`` are skipped; those inside it are made
        relative by cutting off the root prefix.
        """
        content_type = content.__class__.__name__
        source_path = content.source_path
        if source_path[:1] == '/':
            if not source_path.startswith(self.source_root):
                logger.debug('skipping out-of-path content %s, source %s' %
                             (content_type, source_path))
                return
            source_path = source_path[self.root_len:]
        if content_type in ('Article', 'Page'):
            self._index_html(content, source_path)
        elif content_type == 'Static':
            extension = os.path.splitext(source_path)[1]
            if extension == '.pdf':
                self._index_pdf(content, source_path)
            else:
                logger.debug('skipping unknown static type, source %s' %
                             source_path)
        else:
            logger.debug('skipping unknown content %s, source %s' %
                         (content_type, source_path))
def defineIndex():
    """Create the API client and fetch the index named in the settings.

    Returns:
        dict: ``{"api": <ApiClient>, "index": <index object>}`` where the
        index is obtained via ``get_index(settings.SEARCHIFY_INDEX)``.
    """
    client = ApiClient('http://:[email protected]')
    searchify_index = client.get_index(settings.SEARCHIFY_INDEX)
    return dict([("api", client), ("index", searchify_index)])
#! /usr/bin/python
# Command-line search against the "techcrunch" index.
# Usage: <script> <search-term>
# The term is matched against the "text" and "company" fields and the
# company details of every hit are printed.
import sys
import string
import re
import urllib
from indextank.client import ApiClient

# Fail with a readable message (same non-zero exit status as before)
# instead of an IndexError traceback when the search term is missing.
if len(sys.argv) < 2:
    sys.stderr.write("usage: %s <search-term>\n" % sys.argv[0])
    sys.exit(1)

term = sys.argv[1]

indexapi = ApiClient("http://:[email protected]")
index = indexapi.get_index("techcrunch")

query = "text:" + term + " OR company:" + term
results = index.search(
    query,
    fetch_fields=["text", "company", "homepage_url", "category_code",
                  "number_of_employees"],
    snippet_fields=["category_code"],
)

for doc in results["results"]:
    print("Company:" + doc["company"])
    # Stored homepage values may carry a trailing newline.
    print("Website:" + doc["homepage_url"].strip("\n"))
    print("Category:" + doc["category_code"])
    print("Number of Employees:" + doc["number_of_employees"])
    print("---")