Example #1
0
    def searchAllSource(self, query, timeout=None):
        """
            Builds a lazily-evaluated source of FactualPlace results for a 
            search-all query.
            
            If the query restricts kinds and none of them overlap self.kinds, 
            self.emptySource is returned and no search is made. Otherwise the 
            search only runs once the returned source starts being iterated.
            
            query   -- object exposing kinds, coordinates and query_string
            timeout -- handed to pool.join when the plain and the 
                       location-aware searches are run side by side; unused 
                       when the query has no coordinates
        """
        if query.kinds is not None and len(query.kinds) > 0 and len(self.kinds.intersection(query.kinds)) == 0:
            logs.debug('Skipping %s (kinds: %s)' % (self.sourceName, query.kinds))
            return self.emptySource

        logs.debug('Searching %s...' % self.sourceName)

        def gen():
            try:
                # shared accumulator; getFactualSearch appends to it in place
                raw_results = []

                def getFactualSearch(q, useLocation=False):
                    if useLocation and q.coordinates is not None:
                        results = self.__factual.search(q.query_string, coordinates=q.coordinates)
                    else:
                        results = self.__factual.search(q.query_string)
                    for result in results:
                        raw_results.append(result)

                if query.coordinates is not None:
                    # run the plain and the location-aware search concurrently
                    pool = Pool(2)
                    pool.spawn(getFactualSearch, query, False)
                    pool.spawn(getFactualSearch, query, True)
                    pool.join(timeout=timeout)
                else:
                    # getFactualSearch returns nothing; it fills raw_results 
                    # itself, so its return value must not replace the list 
                    # (doing so discarded every result of this branch)
                    getFactualSearch(query)

                for result in raw_results:
                    yield FactualPlace(data=result)
            except GeneratorExit:
                # the consumer stopped iterating early; nothing to clean up
                pass
        return generatorSource(gen(), constructor=FactualSearchAll)
 def validate(self, results):
     """
         Checks a search result set for obvious duplicates.
         
         Two passes are made. First, every result is converted to a proxy and 
         resolved against the other proxies of the same kind. Second, the raw 
         results are compared directly: no '_id' source value may repeat, and 
         no two results may share kind, title and at least one type.
         
         Returns False as soon as a duplicate is detected, True otherwise.
     """
     
     proxies = [ self.__stamped.proxyFromEntity(entity) for entity in results ]
     
     # pass 1: no proxy may definitively resolve to another proxy of its kind
     for index, proxy in enumerate(proxies):
         def sameKindOthers():
             for other_index, other in enumerate(proxies):
                 if other_index != index and proxy.kind == other.kind:
                     yield other
         
         matches = self.__resolver.resolve(proxy, generatorSource(sameKindOthers()), count=1)
         
         if len(matches) > 0 and matches[0][0]['resolved']:
             return False
     
     # pass 2: cheap structural checks that do not involve the resolver
     seen_ids = defaultdict(set)
     
     for index, result in enumerate(results):
         id_keys = [ name for name in result.sources if name.endswith('_id') ]
         
         # a given source id is supposed to be unique across the result set
         for id_key in id_keys:
             id_value = str(result[id_key])
             known    = seen_ids[id_key]
             
             if id_value in known:
                 return False
             
             known.add(id_value)
         
         # compare against every later result (earlier pairs were already covered)
         for later_index in xrange(index + 1, len(results)):
             later = results[later_index]
             
             if not self._eq(result.kind, later.kind):
                 continue
             
             if not self._eq(result.title, later.title):
                 continue
             
             if len(result.types.intersection(later.types)) > 0:
                 utils.log("")
                 utils.log("!" * 80)
                 utils.log("dupe encountered: %s\n%s" % (result, later))
                 utils.log("!" * 80)
                 utils.log("")
                 
                 return False
     
     return True
Example #3
0
 def placeSource(self, query):
     """
         Returns a generator-backed source of FactualPlace objects for the 
         results of searching on query.name.
         
         The search is not issued until the source is first iterated.
     """
     def iterPlaces():
         try:
             for raw in self.__factual.search(query.name):
                 yield FactualPlace(data=raw)
         except GeneratorExit:
             # the consumer closed the generator early; just stop
             return
     
     return generatorSource(iterPlaces())
Example #4
0
    def search(self, 
               query, 
               coords   = None, 
               full     = True, 
               local    = False, 
               kinds    = None,
               types    = None, 
               offset   = 0, 
               limit    = 10):
        """
            Searches the configured sources concurrently and merges their 
            results into a single de-duplicated list.
            
            query  -- raw query, wrapped in a QuerySearchAll together with 
                      coords, kinds, types and local
            full   -- when false, only the source named 'stamped' is searched
            offset -- forwarded untouched to the per-source search helper
            limit  -- maximum number of results returned; a falsy value means 
                      "as many as were found"
            
            Each collected item is used as a (source_name, result) pair where 
            result[0]['total'] is a score and result[1] exposes name, source 
            and target (with kind and types).
            
            Returns the list of chosen results, best first.
        """
        
        started      = time.time()
        search_query = QuerySearchAll(query, coords, kinds, types, local)
        pool         = Pool(len(self._sources))
        results      = []
        timeout      = 6.5
        stamped_only = not full
        
        # NOTE: order is important here; e.g., we want to give precedence to 
        # certain third-party APIs to begin their requests before others.
        for source in self._sources:
            if stamped_only and source.sourceName != 'stamped':
                # ignore any external sources if full search is disabled
                continue

            # TODO: Make sure timeout gets passed through to source member functions.
            pool.spawn(self.__search_helper, search_query, limit, offset, source, results, timeout=timeout)
        
        pool.join(timeout=timeout)
        
        # bucket the surviving results per source, dropping anything that 
        # fails the kind / type filters of the query
        by_source = {}
        total     = 0
        
        for source_name, result in results:
            if search_query.kinds is not None and result[1].target.kind not in search_query.kinds:
                logs.debug("Filtered out %s (kind=%s)" % 
                          (result[1].name, result[1].target.kind))
                continue
            
            if search_query.types is not None and len(search_query.types.intersection(result[1].target.types)) == 0:
                logs.debug("Filtered out %s (types=%s)" % 
                          (result[1].name, result[1].target.types))
                continue
            
            by_source.setdefault(source_name, []).append(result)
            total += 1
        
        for source_name in list(by_source):
            by_source[source_name] = sortedResults(by_source[source_name])
        
        elapsed   = time.time() - started
        breakdown = ' '.join([ '%s:%s' % (name, len(hits)) for name, hits in by_source.items() ])
        print("\n\n\nGenerated %d results in %f seconds from: %s\n\n\n" % (total, elapsed, breakdown))
        
        dedup_started = time.time()
        chosen        = []
        wanted        = max(0, min(total, limit if limit else total))
        
        # repeatedly take the highest scoring head among the per-source lists
        # (presumably sortedResults orders each list best-first -- confirm)
        while len(chosen) < wanted:
            best_name = None
            best      = None
            
            for name, source_results in list(by_source.items()):
                if len(source_results) == 0:
                    del by_source[name]
                    continue
                
                head = source_results[0]
                
                if best is None or head[0]['total'] > best[0]['total']:
                    best      = head
                    best_name = name
                elif _verbose:
                    print("skipped %s with value %s" % (name, head[0]['total']))
            
            if best is None:
                # every source has been exhausted
                break
            
            del by_source[best_name][0]
            
            if _verbose:
                print("Chose %s with value %s" % (best_name, best[0]['total']))
            
            cur = best[1]
            
            def priorMatches():
                # already chosen targets whose types equal the candidate's
                for picked in chosen:
                    picked_target = picked[1].target
                    
                    if picked_target.types == cur.target.types:
                        yield picked_target
            
            dups = self.__resolver.resolve(cur.target, generatorSource(priorMatches()), count=1)
            
            if not (len(dups) > 0 and dups[0][0]['resolved']):
                chosen.append(best)
                
                # useful debugging aid if you find dupes in the search results
                # if len(best) == 2 and len(dups) > 0:
                #     print("COMPARED %s:%s with %s:%s" % 
                #           (cur.source, cur.name, dups[0][1].source, dups[0][1].name))
                #     
                #     print(formatResults(dups[0:2], verbose=True))
                continue
            
            # the candidate resolved to something already chosen; drop it
            if _verbose:
                print("Discarded %s:%s as a duplicate to %s:%s" % 
                      (cur.source, cur.name, dups[0][1].source, dups[0][1].name))
                
                print(formatResults(dups[0:1], verbose=True))
        
        if _verbose:
            print("\n\n\nDeduped %d results in %f seconds\n\n\n" % (total - len(chosen), time.time() - dedup_started))
        
        return chosen