Ejemplo n.º 1
0
class SearchifyIndex():
    """Manager for a Searchify index.

    The index is named by ``settings.SEARCHIFY['index']`` and is reached
    through an ``ApiClient`` built from the first entry of
    ``settings.SEARCHIFY['api_url']``.
    """

    def __init__(self):
        self.api_client = ApiClient(settings.SEARCHIFY['api_url'][0])
        self.index = self.api_client.get_index(settings.SEARCHIFY['index'])

    def indexit(self, docs):
        """Add *docs* to the index.

        Returns ``False`` (after logging an error) when *docs* is empty;
        otherwise returns whatever ``index.add_documents`` returns.
        """
        log = logging.getLogger(__name__)
        if docs:
            log.info('Add to index %s' % str(docs))
            return self.index.add_documents(docs)

        log.error("Ningun documento para indexar")
        return False

    def count(self):
        """Return the size reported by the index."""
        size = self.index.get_size()
        return size

    def delete_documents(self, docs):
        """Pretend to delete *docs*; nothing is removed from the index.

        Returns ``None`` for an empty *docs* and ``True`` otherwise.
        """
        if docs:
            # Temporarily reports success until Haystack is implemented.
            #
            # The real deletion was disabled because of an IndexTank failure
            # when removing documents from the index. It used to call
            # self.index.delete_documents(docs), collect the response entries
            # whose 'deleted' flag was false, and return True only when that
            # list was empty (False when there was no response at all).
            # TODO: the single-document self.delete_document could be used
            # for a second attempt on the failed entries.
            return True
        return None

    def delete_document(self, doc):
        """Pretend to delete *doc*; always returns ``True``."""
        # Temporarily reports success until Haystack is implemented.
        return True
Ejemplo n.º 2
0
class SearchifyIndex():
    """Thin manager around one Searchify index.

    Connection details come from ``settings.SEARCHIFY``: the first
    ``'api_url'`` entry is used for the client and ``'index'`` names the
    index.
    """

    def __init__(self):
        client = ApiClient(settings.SEARCHIFY['api_url'][0])
        self.api_client = client
        self.index = client.get_index(settings.SEARCHIFY['index'])

    def indexit(self, docs):
        """Send *docs* to the index.

        An empty *docs* is logged as an error and yields ``False``; otherwise
        the result of ``index.add_documents`` is returned unchanged.
        """
        logger = logging.getLogger(__name__)
        if not docs:
            logger.error("Ningun documento para indexar")
            return False

        logger.info('Add to index %s' % str(docs))
        return self.index.add_documents(docs)

    def count(self):
        """Return the index size as reported by ``get_size``."""
        return self.index.get_size()

    def delete_documents(self, docs):
        """Report a successful bulk delete without touching the index.

        Returns ``None`` when *docs* is empty, ``True`` otherwise.
        """
        # Temporarily returns True until Haystack is implemented. The real
        # call, self.index.delete_documents(docs), is disabled because of an
        # IndexTank failure when deleting from the index; it returned True
        # only if every response entry had its 'deleted' flag set, and False
        # when the response was empty.
        # TODO: self.delete_document could be used to retry failed entries.
        return True if docs else None

    def delete_document(self, doc):
        """Report a successful single delete without touching the index."""
        # Temporarily returns True until Haystack is implemented.
        return True
Ejemplo n.º 3
0
class RoutineBackend(object):
    """CherryPy handlers that search businesses and order stops into a route.

    ``sort`` and ``search`` answer with JavaScript: a call to the
    client-supplied callback name wrapping a JSON payload.
    """

    def __init__(self):
        self.api = ApiClient('http://indxtank-api-key')
        # NOTE(review): this instance attribute has the same name as the
        # exposed ``index`` method below, so on instances ``self.index`` is
        # the search index rather than the handler -- confirm this is
        # intended.
        self.index = self.api.get_index('index-name')
        self.sierpinski = Sierpinski('co.txt')

    @staticmethod
    def _jsonp(cback, payload):
        """Return *payload* as JSON wrapped in a call to *cback*."""
        # NOTE(review): cback is emitted verbatim -- confirm callers are
        # trusted or validate the callback name.
        return cback + '(' + json.dumps(payload) + ');'

    @cherrypy.expose
    def index(self, _):
        """Return a fixed JavaScript snippet calling ``show_search``."""
        return "show_search('Hello World!')"

    @cherrypy.expose
    def sort(self, lathome, lnghome, stops, cback, _=''):
        """Order the given stops into a route starting from home.

        *stops* is a comma-separated list of uuids; each is looked up in the
        index and the first result is added as a route point. The response
        wraps ``{'bss': <shortest_route() result>}`` in a call to *cback*.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        stop_ids = stops.split(',')

        home = dict(lat=float(lathome), lng=float(lnghome), uid=0,
                    bubble='home')

        bubble_template = """<h3>%(name)s</h3>
                             <p>%(address)s, %(city)s, %(state)s</p>"""
        wanted = ('name', 'address', 'city', 'state', 'telephone', 'website',
                  'latitude', 'longitude', 'docid')

        self.sierpinski.reset()
        self.sierpinski.add_point(home)

        for stop_id in stop_ids:
            # NOTE(review): a stop with no results raises IndexError here.
            hit = self.index.search('uuid:' + stop_id,
                                    fetch_fields=list(wanted))['results'][0]

            # Single quotes are stripped from the name before it is placed
            # in the bubble markup.
            hit['name'] = hit['name'].replace("'", "")
            bubble = bubble_template % hit
            if hit['telephone'] != '':
                bubble += '<p>Telephone: ' + hit['telephone'] + '</p>'
            if hit['website'] != '':
                bubble += ('<p>Website: <a href="' + hit['website'] +
                           '" target="_blank">' + hit['website'] + '</a></p>')

            self.sierpinski.add_point({
                'bubble': bubble,
                'lat': float(hit['latitude']),
                'lng': float(hit['longitude']),
                'uid': hit['docid'],
            })

        return self._jsonp(cback, {'bss': self.sierpinski.shortest_route()})

    @cherrypy.expose
    def search(self, nameortype, address, lat, lng, cback, _=''):
        """Search businesses by name/category, optionally narrowed by address.

        Returns an empty ``bss`` list when both *nameortype* and *address*
        are empty strings. Otherwise each result becomes a dict with
        ``bubble`` (HTML), ``lat``, ``lng`` and ``uid`` keys.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        if nameortype == '' and address == '':
            return self._jsonp(cback, {'bss': []})

        # NOTE(review): the inputs are inserted into the query unescaped.
        query = 'name:"' + nameortype + '" OR category:"' + nameortype + '"'
        if address:
            query = ('( ' + query + ' ) AND ( address:"' + address +
                     '" OR city:"' + address + '" )')
        # scoring_function=1 presumably refers to a function registered with:
        #self.index.add_function(1, '-miles(query.var[0], query.var[1], doc.var[0], doc.var[1])')
        found = self.index.search(query,
                                  fetch_fields=['name', 'address', 'city',
                                                'state', 'latitude',
                                                'longitude', 'docid'],
                                  variables={0: float(lat), 1: float(lng)},
                                  scoring_function=1)

        binfo_template = """<h3>%(name)s</h3>
                            <p>%(address)s, %(city)s, %(state)s</p>"""

        # The bubble repeats the business info inside the add_stop() call.
        bubble_template = binfo_template
        bubble_template += """<p class='align-right'>
                                 (<a href='javascript:add_stop("%(docid)s", " """ + binfo_template + """ ");'>Add stop</a>)
                              </p>"""

        resp = []
        for business in found['results']:
            business['name'] = business['name'].replace("'", "")
            resp.append({
                'bubble': bubble_template % business,
                'lat': business['latitude'],
                'lng': business['longitude'],
                'uid': business['docid'],
            })

        return self._jsonp(cback, {'bss': resp})
Ejemplo n.º 4
0
#! /usr/bin/python
import string;
import re;
import urllib;
from indextank.client import ApiClient;
indexapi = ApiClient('http://:[email protected]');
index = indexapi.get_index('techcrunch');
f = open("companies.txt");
fw = open("intermediate.txt", 'w');
i = 0;
for line in f :
    p = re.compile(r'{"name":');
    list = p.split(line.rstrip('"\n'));
    if len(list) > 1:
        url = "http://api.crunchbase.com/v/1/company/" + list[1].rstrip('",').lstrip(' "')+".js" ;
        print url;
        print "Company:" + list[1].rstrip('",').lstrip(' "');
        companyid = list[1].rstrip('",').lstrip(' "');
        fw.write("Company:" + list[1].rstrip('",').lstrip(' "')+"    ");
        page = urllib.urlopen(url);
        buf = page.read();
        if re.search("error",buf) :
            fw.write("\n");
            continue;
        ext = re.compile('("homepage_url"):(.*)');
        homepage =ext.search(buf).group(2).strip(' "|",')+"\n";
        fw.write(ext.search(buf).group(1).strip('"') + ":" + ext.search(buf).group(2).strip(' "|",')+"        ");
        jobs = ext.search(buf).group(2).rstrip(',"')+"/jobs";
        ext = re.compile('("category_code"):(.*)');
        fw.write(ext.search(buf).group(1).strip('"') + ":" + ext.search(buf).group(2).strip(' "|",')+"    ");
        category = ext.search(buf).group(2).strip(' "|",');
Ejemplo n.º 5
0
class SearchIndexer:
  """Indexes site content into a Searchify index.

  HTML content (``Article``/``Page``) and PDF statics are reduced to plain
  text and stored with their source path as the document id.
  """

  def __init__(self, index_name, source_root, api_url):
    """Open the index called *index_name*, creating it if lookup fails.

    Args:
      index_name: name of the Searchify index.
      source_root: prefix stripped from absolute content source paths.
      api_url: URL handed to ``ApiClient``.

    Exits the process with status 1 if no index object is obtained.
    """
    self.index_name = index_name
    self.source_root = source_root
    self.root_len = len(source_root)
    self.api = ApiClient(api_url)
    index = None
    try:
      # NOTE(review): confirm get_index really raises for a missing index;
      # otherwise the creation branch below never runs.
      index = self.api.get_index(index_name)
      logger.debug('Found searchify index %s' % index_name)
    except Exception:
      # Narrowed from a bare except so that KeyboardInterrupt/SystemExit are
      # not mistaken for "index does not exist".
      public_search_enabled = True
      logger.debug('Creating searchify index %s' % index_name)
      index = self.api.create_index(index_name, public_search_enabled)
      while not index.has_started():
        time.sleep(0.5)
      logger.debug('Searchify index %s started' % index_name)
    if index is None:
      logger.fatal('Could not create or get index %s' % index_name)
      sys.exit(1)

    self.index = index

  def _index_html(self, content, path):
    """Index the text of an HTML content object under docid *path*.

    Content whose ``content`` attribute is ``None`` is skipped.
    """
    if content.content is None:
      logger.debug('skipping html index for %s - no content' % path)
      return

    # Available metadata: 'author', 'basename_raw', 'date',
    # 'email', 'exported_type', 'modified', 'relative_url',
    # 'slug', 'source_id', 'source_type', 'summary', 'template',
    # 'sorted_title', 'sort_priority', 'title', 'version'
    title = content.metadata['title']
    # NOTE(review): the original comment says this works with UTC datetimes,
    # but time.mktime interprets the tuple as local time -- confirm.
    timestamp = int(time.mktime(content.metadata['modified'].timetuple()))

    # Remove all script and style elements
    soup = BeautifulSoup(content.content)
    for script in soup(['script', 'style']):
      script.extract()
    text = soup.get_text()

    # TODO: variables = { 0: rating, 1: reputation, 2: visits }
    self.index.add_document(path, {
      'text': text, 'title': title, 'timestamp': timestamp})

  def _index_pdf(self, content, path):
    """Index the text of the PDF at ``source_root``/*path* under docid *path*.

    All pages are joined into one document. Indexing each page separately
    under the same docid would make every page replace the previous one,
    leaving only the last page searchable.
    """
    fpath = os.path.join(self.source_root, path)
    if not os.path.exists(fpath):
      logger.error('Indexer: Cannot read pdf at %s' % fpath)
      return

    # Available metadata: see _index_html.
    title = content.metadata['title']
    # NOTE(review): time.mktime interprets the tuple as local time -- confirm
    # that 'modified' is meant to be handled this way.
    timestamp = int(time.mktime(content.metadata['modified'].timetuple()))

    # PDFs are binary, so open in binary mode.
    with open(fpath, 'rb') as f:
      pages = slate.PDF(f)
      if not pages:
        # As before, a PDF with no pages adds nothing to the index.
        logger.debug('skipping pdf index for %s - no pages' % path)
        return
      text = '\n'.join(pages)

    # TODO: variables = { 0: rating, 1: reputation, 2: visits }
    self.index.add_document(path, {
      'text': text, 'title': title, 'timestamp': timestamp})

  def index_content(self, content):
    """Dispatch *content* to the HTML or PDF indexer based on its class name.

    Absolute source paths must live under ``source_root`` (the prefix is
    stripped); anything else, and any unknown content type, is skipped with
    a debug log entry.
    """
    content_type = content.__class__.__name__
    source_path = content.source_path
    if source_path[:1] == '/':
      if source_path.startswith(self.source_root):
        source_path = source_path[self.root_len:]
      else:
        logger.debug('skipping out-of-path content %s, source %s' % (content_type, source_path))
        return

    if content_type in ('Article', 'Page'):
      self._index_html(content, source_path)
    elif content_type == 'Static':
      extension = os.path.splitext(source_path)[1]
      if extension == '.pdf':
        self._index_pdf(content, source_path)
      else:
        logger.debug('skipping unknown static type, source %s' % source_path)
    else:
      logger.debug('skipping unknown content %s, source %s' % (content_type, source_path))
Ejemplo n.º 6
0
import csv

from indextank.client import ApiClient

reader = csv.DictReader(open('US_POI_and_Business_Listings_(Beta).csv'),
                        delimiter=',')

api = ApiClient('http://:indextank-api-key')
index = api.get_index('index-name')

categories = set()
total = 0
valid = 0

bss = {}

for business in reader:
    if (business['state'] == 'MA'):  # and
        total += 1
        if (business['longitude'] != '' and business['latitude'] != ''
                and business['category'] != ''):
            categories.add(business['category'])
            bss[business['uuid']] = business
            valid += 1

print "Categories:", len(categories)
print "Total:", total
print "Valid:", valid
print "Accepted percentage:", "%.2f" % ((100.0 * valid) / total)

i = 0
Ejemplo n.º 7
0
import csv

from indextank.client import ApiClient

reader = csv.DictReader(open('US_POI_and_Business_Listings_(Beta).csv'),
                        delimiter = ',')

api = ApiClient('http://:indextank-api-key')
index = api.get_index('index-name')

categories = set()
total = 0
valid = 0

bss = {}

for business in reader:
    if(business['state'] == 'MA'):# and
        total += 1
        if(business['longitude'] != '' and business['latitude'] != '' and business['category'] != ''):
            categories.add(business['category'])
            bss[business['uuid']] = business
            valid += 1

print "Categories:", len(categories)
print "Total:", total
print "Valid:", valid
print "Accepted percentage:", "%.2f" % ((100.0 * valid) / total)


i = 0
Ejemplo n.º 8
0
class RoutineBackend(object):
    """CherryPy backend for business search and stop ordering.

    ``sort`` and ``search`` reply with JavaScript of the form
    ``<cback>(<json>);``.
    """

    def __init__(self):
        self.api = ApiClient('http://indxtank-api-key')
        # NOTE(review): this attribute shadows the ``index`` handler defined
        # below on every instance -- confirm this is intended.
        self.index = self.api.get_index('index-name')
        self.sierpinski = Sierpinski('co.txt')

    @cherrypy.expose
    def index(self, _):
        """Return a constant ``show_search`` JavaScript call."""
        return "show_search('Hello World!')"

    def _route_point(self, stop):
        """Look up *stop* by uuid and build its route point.

        Uses the first search result; the returned dict has ``bubble``
        (HTML), ``lat``, ``lng`` and ``uid`` keys.
        """
        bubble_template = """<h3>%(name)s</h3>
                             <p>%(address)s, %(city)s, %(state)s</p>"""
        fields = ['name', 'address', 'city', 'state', 'telephone', 'website',
                  'latitude', 'longitude', 'docid']
        # NOTE(review): raises IndexError when the stop yields no results.
        first = self.index.search('uuid:' + stop,
                                  fetch_fields=fields)['results'][0]

        # Single quotes are removed from the name before rendering.
        first['name'] = first['name'].replace("'", "")
        html = bubble_template % first
        if first['telephone'] != '':
            html += '<p>Telephone: ' + first['telephone'] + '</p>'
        if first['website'] != '':
            html += ('<p>Website: <a href="' + first['website'] +
                     '" target="_blank">' + first['website'] + '</a></p>')

        point = {}
        point['bubble'] = html
        point['lat'] = float(first['latitude'])
        point['lng'] = float(first['longitude'])
        point['uid'] = first['docid']
        return point

    @cherrypy.expose
    def sort(self, lathome, lnghome, stops, cback, _=''):
        """Return the route over home plus the comma-separated *stops*.

        The payload is ``{'bss': <shortest_route() result>}`` wrapped in a
        call to *cback*.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        uuids = stops.split(',')

        start = {'lat': float(lathome), 'lng': float(lnghome),
                 'uid': 0, 'bubble': 'home'}

        self.sierpinski.reset()
        self.sierpinski.add_point(start)
        for uuid in uuids:
            self.sierpinski.add_point(self._route_point(uuid))

        payload = json.dumps({'bss': self.sierpinski.shortest_route()})
        return cback + "(" + payload + ");"

    @cherrypy.expose
    def search(self, nameortype, address, lat, lng, cback, _=''):
        """Search by name or category, optionally restricted by address/city.

        Replies with an empty ``bss`` list when both *nameortype* and
        *address* are empty strings.
        """
        cherrypy.response.headers['Content-Type'] = 'application/javascript'
        nothing_to_search = nameortype == '' and address == ''
        if nothing_to_search:
            return cback + '(' + json.dumps({'bss': []}) + ');'

        # NOTE(review): user input is concatenated into the query unescaped.
        query = 'name:"' + nameortype + '" OR category:"' + nameortype + '"'
        if address:
            location = 'address:"' + address + '" OR city:"' + address + '"'
            query = '( ' + query + ' ) AND ( ' + location + ' )'
        # scoring_function=1 presumably refers to a function registered with:
        #self.index.add_function(1, '-miles(query.var[0], query.var[1], doc.var[0], doc.var[1])')
        results = self.index.search(
            query,
            fetch_fields=['name', 'address', 'city', 'state', 'latitude',
                          'longitude', 'docid'],
            variables={0: float(lat), 1: float(lng)},
            scoring_function=1)
        hits = results['results']

        binfo_template = """<h3>%(name)s</h3>
                            <p>%(address)s, %(city)s, %(state)s</p>"""

        # The business info appears twice: once as the bubble body and once
        # as the second argument of the add_stop() link.
        bubble_template = binfo_template + """<p class='align-right'>
                                 (<a href='javascript:add_stop("%(docid)s", " """ + binfo_template + """ ");'>Add stop</a>)
                              </p>"""

        resp = []
        for hit in hits:
            hit['name'] = hit['name'].replace("'", "")
            entry = {'bubble': bubble_template % hit}
            entry['lat'] = hit['latitude']
            entry['lng'] = hit['longitude']
            entry['uid'] = hit['docid']
            resp.append(entry)

        return cback + '(' + json.dumps({'bss': resp}) + ');'
Ejemplo n.º 9
0
def defineIndex():
    """Create the API client and fetch the configured index.

    Returns:
        A dict with the client under ``"api"`` and, under ``"index"``, the
        result of ``get_index(settings.SEARCHIFY_INDEX)``.
    """
    client = ApiClient('http://:[email protected]')
    handles = {"api": client}
    handles["index"] = client.get_index(settings.SEARCHIFY_INDEX)
    return handles
Ejemplo n.º 10
0
class SearchIndexer:
    """Pushes site content into a Searchify index as plain-text documents.

    ``Article``/``Page`` objects are indexed from their HTML and ``Static``
    PDFs from their extracted text; the source path is the document id.
    """

    def __init__(self, index_name, source_root, api_url):
        """Obtain the index named *index_name*, creating it when lookup fails.

        Args:
            index_name: name of the Searchify index.
            source_root: prefix removed from absolute content source paths.
            api_url: URL passed to ``ApiClient``.

        Exits the process with status 1 if no index object is obtained.
        """
        self.index_name = index_name
        self.source_root = source_root
        self.root_len = len(source_root)
        self.api = ApiClient(api_url)
        index = None
        try:
            # NOTE(review): verify that get_index raises when the index is
            # missing; if it does not, the creation path is never taken.
            index = self.api.get_index(index_name)
            logger.debug('Found searchify index %s' % index_name)
        except Exception:
            # A bare except here would also swallow KeyboardInterrupt and
            # SystemExit and then try to create the index.
            public_search_enabled = True
            logger.debug('Creating searchify index %s' % index_name)
            index = self.api.create_index(index_name, public_search_enabled)
            while not index.has_started():
                time.sleep(0.5)
            logger.debug('Searchify index %s started' % index_name)
        if index is None:
            logger.fatal('Could not create or get index %s' % index_name)
            sys.exit(1)

        self.index = index

    def _document_fields(self, content, text):
        """Build the field dict stored for *content* with body *text*."""
        # Available metadata: 'author', 'basename_raw', 'date',
        # 'email', 'exported_type', 'modified', 'relative_url',
        # 'slug', 'source_id', 'source_type', 'summary', 'template',
        # 'sorted_title', 'sort_priority', 'title', 'version'
        #
        # NOTE(review): the original comment says this works with UTC
        # datetimes, but time.mktime reads the tuple as local time.
        modified = content.metadata['modified']
        # TODO: variables = { 0: rating, 1: reputation, 2: visits }
        return {
            'text': text,
            'title': content.metadata['title'],
            'timestamp': int(time.mktime(modified.timetuple())),
        }

    def _index_html(self, content, path):
        """Index the text of HTML *content* under docid *path*.

        Script and style elements are dropped first; content whose
        ``content`` attribute is ``None`` is skipped.
        """
        if content.content is None:
            logger.debug('skipping html index for %s - no content' % path)
            return

        # Remove all script and style elements
        soup = BeautifulSoup(content.content)
        for script in soup(['script', 'style']):
            script.extract()

        fields = self._document_fields(content, soup.get_text())
        self.index.add_document(path, fields)

    def _index_pdf(self, content, path):
        """Index the PDF at ``source_root``/*path* under docid *path*.

        The text of all pages is joined into a single document: adding each
        page separately under the same docid replaces the previous page, so
        only the last page would remain in the index.
        """
        fpath = os.path.join(self.source_root, path)
        if not os.path.exists(fpath):
            logger.error('Indexer: Cannot read pdf at %s' % fpath)
            return

        # Binary mode: a PDF is not text.
        with open(fpath, 'rb') as f:
            page_texts = list(slate.PDF(f))

        if not page_texts:
            # Matches the earlier behaviour of indexing nothing for a PDF
            # without pages.
            logger.debug('skipping pdf index for %s - no pages' % path)
            return

        fields = self._document_fields(content, '\n'.join(page_texts))
        self.index.add_document(path, fields)

    def index_content(self, content):
        """Route *content* to the matching indexer by its class name.

        Absolute source paths outside ``source_root`` and unknown content
        types are skipped with a debug log entry.
        """
        content_type = content.__class__.__name__
        source_path = content.source_path
        if source_path[:1] == '/':
            if not source_path.startswith(self.source_root):
                logger.debug('skipping out-of-path content %s, source %s' %
                             (content_type, source_path))
                return
            source_path = source_path[self.root_len:]

        if content_type in ('Article', 'Page'):
            self._index_html(content, source_path)
        elif content_type == 'Static':
            extension = os.path.splitext(source_path)[1]
            if extension == '.pdf':
                self._index_pdf(content, source_path)
            else:
                logger.debug('skipping unknown static type, source %s' %
                             source_path)
        else:
            logger.debug('skipping unknown content %s, source %s' %
                         (content_type, source_path))
Ejemplo n.º 11
0
def defineIndex():
    """Return the API client together with the configured index.

    Returns:
        ``{"api": <ApiClient>, "index": <get_index result>}`` where the
        index name is read from ``settings.SEARCHIFY_INDEX``.
    """
    api_client = ApiClient('http://:[email protected]')
    configured_index = api_client.get_index(settings.SEARCHIFY_INDEX)
    return dict(api=api_client, index=configured_index)
Ejemplo n.º 12
0
#! /usr/bin/python
import sys
import string
import re
import urllib
from indextank.client import ApiClient

indexapi = ApiClient("http://:[email protected]")
index = indexapi.get_index("techcrunch")
query = "text:" + sys.argv[1] + " OR company:" + sys.argv[1]
# results = index.search(query);
results = index.search(
    query,
    fetch_fields=["text", "company", "homepage_url", "category_code", "number_of_employees"],
    snippet_fields=["category_code"],
)
# results = index.search(query);
# print results;
# results = index.search('%s category_code:%s' % (query,category_code))
for doc in results["results"]:
    print "Company:" + doc["company"]
    print "Website:" + doc["homepage_url"].strip("\n")
    print "Category:" + doc["category_code"]
    print "Number of Employees:" + doc["number_of_employees"]
    print "---"