def jobs(request):
    # Parameters come from the HTTP GET query string, with defaults as fallbacks
    job = request.GET.get('job', 'java')
    location = request.GET.get('location', 'california')
    # TODO - These should be found in the HTTP request headers
    user_ip = request.GET.get('user_ip', '11.22.33.44')
    url = request.GET.get('url', 'http://www.example.com/jobsearch?q=python&l=london')
    user_agent = request.GET.get('user_agent',
                                 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0')

    # Look for cached results for this job/location in the database
    results = SearchResult.objects.filter(job=job, location=location)

    # No cached results yet: query Indeed and cache the response
    if len(results) == 0:
        all_jobs = indeed_query(job, location, user_ip, user_agent)
        new_result = SearchResult(
            job=job,
            location=location,
            result=json.dumps(all_jobs, sort_keys=True, indent=0, separators=(',', ': ')),
        )
        new_result.save()
        results = [new_result]

    return JsonResponse({'jobs': json.loads(results[0].result)})
def crawl(pixiv_id, password, cron):
    words = [w for w in Word.select().order_by(Word.id)]
    crawler = PixivCrawler()
    crawler.login(pixiv_id, password)
    for word in words:
        data = {"word": word, "stored_at": date.today()}
        try:
            r = SearchResult.get(word=data["word"], stored_at=data["stored_at"])
            if not cron:
                click.echo(
                    f"[already stored] {word.text} - safe: {r.num_of_safe} hits / r18: {r.num_of_r18} hits"
                )
        except SearchResult.DoesNotExist:
            for mode in ("safe", "r18"):
                crawler.search(word.text, mode)
                if mode == "safe":
                    data["safe"] = crawler.get_search_count()
                else:
                    data["r18"] = crawler.get_search_count()
                time.sleep(3)
            SearchResult.create(
                word=data["word"],
                stored_at=data["stored_at"],
                num_of_safe=data["safe"],
                num_of_r18=data["r18"],
            )
            if not cron:
                click.echo(
                    f"{word.text} - safe: {data['safe']} hits / r18: {data['r18']} hits"
                )
def search_google(request):
    if request.method == 'GET':
        keywords = request.GET["keywords"]
        response = gcs.search(q=keywords)
        # Replace any previously stored results for these keywords
        SearchString.objects.filter(words=keywords).delete()
        searchstring = SearchString(words=keywords)
        searchstring.save()
        for item in response['items']:
            searchresult = SearchResult(searchstring=searchstring,
                                        result_title=item['htmlTitle'],
                                        result_url=item['link'])
            searchresult.save()
        return JsonResponse({'search_id': searchstring.id, 'data': response})
def get_search_results(query_id, query_postings):
    # Check previously calculated queries for changes in the corpus
    query = Query.objects(id=query_id).only("results", "total_frequency").first()
    total_frequency = Entry.objects(id__in=list(query_postings.keys())).only(
        "total_frequency").sum("total_frequency")
    if not query or total_frequency != query.total_frequency:
        results = []
        avg_length = Meme.objects.only("length").aggregate_average("length")
        idf, relevant_docs = get_idf_relevant_docs(query_postings)
        for meme in relevant_docs:
            # Score each relevant document with BM25
            bm25 = calculate_bm25(avg_length, idf, meme, query_postings)
            result = SearchResult(id=MemeId(source=meme.id.source, meme_id=meme.id.meme_id),
                                  name=meme.name, title=meme.title, caption=meme.caption,
                                  score=meme.score, url=meme.url, image=meme.image,
                                  bm25=bm25)
            results.append(result)
        # Keep only the 200 highest-scoring results
        results = sorted(results, key=lambda result: result.bm25, reverse=True)[:200]
        query = Query(id=query_id, results=results, total_frequency=total_frequency)
        query.save()
    return query.results
def process_list(self):
    for word in self.list:
        try:
            result = self.grab_tips(word)
            task = self.session.query(Task).get(self.task_id)
            if result:
                result = ','.join(result)
            else:
                result = ''
            task.last_word = word
            res = SearchResult(task_id=task.id, word=word, result=result)
            self.session.add(res)
            self.session.commit()
            print(f'[+] Search completed for <{word}>')
            if result:
                for w in result.split(','):
                    print(f'\t+ {w}')
            else:
                print('\t ! No results found')
        except Exception:
            print(f'[!] An exception occurred. Skipping <{word}>')
    self.session.close()
def main(*args, **kwargs):
    # Create a new fetch index.
    last_fetch_index = WebPageVersion.select(fn.Max(WebPageVersion.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    search_results = SearchResult.select(SearchResult.url).distinct()
    for search_result in search_results:
        get_history(search_result.url, fetch_index)
def _parse_search(self, json_obj):
    result = SearchResult()
    if 'artists' in json_obj:
        result.artists = [self._parse_artist(json) for json in json_obj['artists']['items']]
    if 'albums' in json_obj:
        result.albums = [self._parse_album(json) for json in json_obj['albums']['items']]
    if 'tracks' in json_obj:
        result.tracks = [self._parse_track(json) for json in json_obj['tracks']['items']]
    if 'playlists' in json_obj:
        result.playlists = [self._parse_playlist(json) for json in json_obj['playlists']['items']]
    if 'videos' in json_obj:
        result.videos = [self._parse_video(json) for json in json_obj['videos']['items']]
    return result
def results(request):
    requested_query = request.GET['request']
    results = []
    query = q.free_query(requested_query)
    ranked_query = q.simple_ranked_results(query)

    # Map URL -> title for lookup
    urls = {}
    for url in database.get_urls():
        urls[url[0]] = url[1]

    title = ""
    for link in ranked_query:
        if link[0] in urls:
            title = urls[link[0]]
        results.append(SearchResult(site_title=title, site_link=link[0]))

    return render(request, 'results.html', {
        'results': results,
        'query': requested_query
    })
def saveResult(self, tweet, query):
    try:
        sr = SearchResult()
        sr.screenName = tweet.user.screen_name
        sr.userId = tweet.user.id
        sr.tweetId = tweet.id
        #sr.text = tweet.text
        sr.query = query
        #sr.tweetedTime = tweet.created_at
        sr.lang = tweet.lang
        sr.save()
    except Exception as e:
        # MySQL error 1366 ("Incorrect string value"): drop the text and retry
        if e.args[0] == 1366:
            log.info("Error no 1366, removing text")
            sr.text = None
            sr.save()
        else:
            raise e
def get_results(query, package, include_stack_overflow, fetch_index, search_id, api_key):
    # Make request for search results
    params = DEFAULT_PARAMS.copy()
    params['key'] = api_key
    params['cx'] = search_id
    params['q'] = query
    if not include_stack_overflow:
        params['siteSearch'] = 'stackoverflow.com'
        params['siteSearchFilter'] = 'e'  # 'e' for 'exclude'
    response = make_request(default_requests_session.get, SEARCH_URL, params=params)

    # Pause so that we don't bombard the server with requests
    time.sleep(REQUEST_DELAY)

    # If request resulted in error, the response is null.  Skip over this query.
    if response is None:
        return

    # Parse search results
    soup = BeautifulSoup(response.content, 'html.parser')
    url = soup.find('opensearch:Url')
    entry_count = len(soup.find_all('entry'))

    # The Atom spec for the search API
    # (https://developers.google.com/custom-search/json-api/v1/reference/cse/list#response)
    # mentions that the estimated results count may be a long integer.
    # To my knowledge, peewee (our ORM) doesn't support long integer fields.
    # So, I cast this to an integer instead and cross my fingers there is no overflow.
    search = Search.create(
        fetch_index=fetch_index,
        query=query,
        page_index=0,
        requested_count=REQUESTED_RESULT_COUNT,
        result_count_on_page=entry_count,
        estimated_results_count=int(
            soup.find('cse:searchinformation').find('cse:totalresults').text),
        package=package,
    )

    # Fetch the first "entry" or search result
    entry = soup.entry

    # Save all of the search results from first to last.
    # Maintaining consistency with our query scraping, ranking starts at 1.
    for rank in range(1, entry_count + 1):

        # Extract fields from the entry
        updated_datetime_without_milliseconds = re.sub(r'\.\d\d\dZ', 'Z', entry.updated.text)
        updated_datetime = datetime.datetime.strptime(
            updated_datetime_without_milliseconds,
            "%Y-%m-%dT%H:%M:%SZ"
        )
        link = entry.link['href']
        snippet = entry.summary.string
        title = entry.title.text
        url = entry.id.text

        # Create a record for this search result
        SearchResult.create(
            search=search,
            title=title,
            snippet=snippet,
            link=link,
            url=url,
            updated_date=updated_datetime,
            rank=rank,
        )

        # To my knowledge, this is the only method for which it is strongly implied in
        # the BeautifulSoup documentation that you are fetching the next result
        # in the sequence.  I also assume that the search API is returning results
        # in the order of decreasing relevance, such that rank increases (gets bigger)
        # with each successive entry visited.
        entry = entry.find_next('entry')
with open('results.pickle', 'wb') as f:
    p = cPickle.Pickler(f)
    p.dump(sr)


if __name__ == '__main__':
    with open('queries.txt', 'r') as f:
        queries = f.readlines()
    random.shuffle(queries)
    search_results = []
    driver = webdriver.Firefox()
    for query in queries:
        if not query:
            continue
        src = get_text_with_query(driver, query)
        sr = SearchResult()
        sr.results += parse(src)
        sr.query = query
        time.sleep(WAIT)
        # Follow pagination links and collect the remaining result pages
        more_pages = parse_navigation(src)
        for p in more_pages:
            path = p.get('url')
            t = get_text(driver, BASE_URL + path)
            time.sleep(WAIT)
            buff = parse(t)
            sr.results += buff
        search_results.append(sr)
    driver.close()