Example No. 1
def jobs(request):
    # Read search parameters from the HTTP GET query string, with defaults
    job = request.GET.get('job', 'java')
    location = request.GET.get('location', 'california')

    # TODO - These should be found in the HTTP request headers
    user_ip = request.GET.get('user_ip', '11.22.33.44')
    url = request.GET.get('url', 'http://www.example.com/jobsearch?q=python&l=london')
    user_agent = request.GET.get('user_agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0')

    # Look for keywords in database
    results = SearchResult.objects.filter(job=job, location=location)

    # Nothing cached for this job/location: query the Indeed API and store the result
    if len(results) == 0:
        all_jobs = indeed_query(job, location, user_ip, user_agent)
        new_result = SearchResult(job=job,
                                  location=location,
                                  result=json.dumps(all_jobs, sort_keys=True, indent=0, separators=(',', ': ')))

        new_result.save()
        results = [new_result]

    return JsonResponse({
        'jobs': json.loads(results[0].result)
    })
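For reference, a minimal sketch of the Django SearchResult model this view appears to rely on; the field names come from the snippet, while the field types and lengths are assumptions:

from django.db import models

class SearchResult(models.Model):
    # Field types are guesses based on how the view above uses the model
    job = models.CharField(max_length=100)
    location = models.CharField(max_length=100)
    result = models.TextField()  # JSON-encoded list of jobs from indeed_query()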
Example No. 2
def crawl(pixiv_id, password, cron):
    words = [w for w in Word.select().order_by(Word.id)]
    crawler = PixivCrawler()
    crawler.login(pixiv_id, password)
    for word in words:
        data = {"word": word, "stored_at": date.today()}
        try:
            r = SearchResult.get(word=data["word"],
                                 stored_at=data["stored_at"])
            if not cron:
                click.echo(
                    f"[already stored] {word.text} - safe: {r.num_of_safe} hits / r18: {r.num_of_r18} hits"
                )
        except SearchResult.DoesNotExist:
            for mode in ("safe", "r18"):
                crawler.search(word.text, mode)
                if mode == "safe":
                    data["safe"] = crawler.get_search_count()
                else:
                    data["r18"] = crawler.get_search_count()
                time.sleep(3)

            SearchResult.create(
                word=data["word"],
                stored_at=data["stored_at"],
                num_of_safe=data["safe"],
                num_of_r18=data["r18"],
            )
            if not cron:
                click.echo(
                    f"{word.text} - safe: {data['safe']} hits / r18: {data['r18']} hits"
                )
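The crawler reads Word rows and writes SearchResult rows in peewee style. A minimal sketch of what those models might look like; the database backend and field types are assumptions inferred from usage:

from peewee import (Model, SqliteDatabase, TextField, DateField,
                    IntegerField, ForeignKeyField)

db = SqliteDatabase('pixiv.db')  # assumed backend

class Word(Model):
    text = TextField()

    class Meta:
        database = db

class SearchResult(Model):
    word = ForeignKeyField(Word, backref='results')
    stored_at = DateField()
    num_of_safe = IntegerField()
    num_of_r18 = IntegerField()

    class Meta:
        database = db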
Example No. 3
def search_google(request):
    if request.method == "GET":
        keywords = request.GET["keywords"]
        response = gcs.search(q=keywords)

        # Replace any previously stored results for the same keywords
        SearchString.objects.filter(words=keywords).delete()
        searchstring = SearchString(words=keywords)
        searchstring.save()

        for item in response['items']:
            searchresult = SearchResult(searchstring=searchstring,
                                        result_title=item['htmlTitle'],
                                        result_url=item['link'])
            searchresult.save()

        return JsonResponse({'search_id': searchstring.id, 'data': response})
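A minimal sketch of the SearchString and SearchResult models that search_google appears to assume; field types and lengths are guesses from the view:

from django.db import models

class SearchString(models.Model):
    words = models.CharField(max_length=255)

class SearchResult(models.Model):
    searchstring = models.ForeignKey(SearchString, on_delete=models.CASCADE)
    result_title = models.CharField(max_length=500)
    result_url = models.URLField()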
Example No. 4
def get_search_results(query_id, query_postings):
    # Check previously calculated queries for changes in the corpus
    query = Query.objects(id=query_id).only("results",
                                            "total_frequency").first()
    total_frequency = Entry.objects(id__in=query_postings.keys()).only(
        "total_frequency").sum("total_frequency")
    if not query or total_frequency != query.total_frequency:
        results = []
        avg_length = Meme.objects.only("length").aggregate_average("length")
        idf, relevant_docs = get_idf_relevant_docs(query_postings)
        for meme in relevant_docs:  # Score each relevant document with BM25
            bm25 = calculate_bm25(avg_length, idf, meme, query_postings)
            result = SearchResult(id=MemeId(source=meme.id.source,
                                            meme_id=meme.id.meme_id),
                                  name=meme.name,
                                  title=meme.title,
                                  caption=meme.caption,
                                  score=meme.score,
                                  url=meme.url,
                                  image=meme.image,
                                  bm25=bm25)
            results.append(result)
        results = sorted(results, key=lambda result: result.bm25,
                         reverse=True)[:200]
        query = Query(id=query_id,
                      results=results,
                      total_frequency=total_frequency)
        query.save()
    return query.results
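The ranking step delegates to calculate_bm25, which is not shown here. For orientation, this is one common form of the BM25 score; the project's actual implementation and parameter values may differ, and bm25_score, K1, and B below are illustrative names only:

K1 = 1.5   # term-frequency saturation, typically 1.2-2.0
B = 0.75   # document-length normalization strength

def bm25_score(avg_length, idf, doc_length, term_frequencies):
    """term_frequencies maps query terms to their frequency in the document;
    idf maps query terms to their inverse document frequency."""
    score = 0.0
    for term, tf in term_frequencies.items():
        norm = tf + K1 * (1 - B + B * doc_length / avg_length)
        score += idf.get(term, 0.0) * tf * (K1 + 1) / norm
    return score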
Example No. 5
    def process_list(self):

        for word in self.list:
            try:
                result = self.grab_tips(word)
                task = self.session.query(Task).get(self.task_id)
                # grab_tips() returns a list of suggestions; store them comma-separated
                result = ','.join(result) if result else ''
                task.last_word = word
                res = SearchResult(task_id=task.id, word=word, result=result)
                self.session.add(res)
                self.session.commit()
                print(f'[+] Search completed for <{word}>')
                if result:
                    for w in result.split(','):
                        print(f'\t+ {w}')
                else:
                    print('\t ! No results found')
            except Exception:
                print(f'[!] An exception occurred. Skipping <{word}>')

        self.session.close()
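A minimal sketch of the SQLAlchemy Task and SearchResult models that process_list appears to write to; table names and column types are assumptions inferred from usage:

from sqlalchemy import Column, ForeignKey, Integer, String, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Task(Base):
    __tablename__ = 'tasks'
    id = Column(Integer, primary_key=True)
    last_word = Column(String(255))  # updated as each word is processed

class SearchResult(Base):
    __tablename__ = 'search_results'
    id = Column(Integer, primary_key=True)
    task_id = Column(Integer, ForeignKey('tasks.id'))
    word = Column(String(255))
    result = Column(Text)  # comma-separated suggestions returned by grab_tips()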
Example No. 6
def main(*args, **kwargs):

    # Create a new fetch index.
    last_fetch_index = WebPageVersion.select(fn.Max(WebPageVersion.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1
    search_results = SearchResult.select(SearchResult.url).distinct()
    for search_result in search_results:
        get_history(search_result.url, fetch_index)
Example No. 7
    def _parse_search(self, json_obj):
        result = SearchResult()
        if 'artists' in json_obj:
            result.artists = [
                self._parse_artist(json)
                for json in json_obj['artists']['items']
            ]
        if 'albums' in json_obj:
            result.albums = [
                self._parse_album(json) for json in json_obj['albums']['items']
            ]
        if 'tracks' in json_obj:
            result.tracks = [
                self._parse_track(json) for json in json_obj['tracks']['items']
            ]
        if 'playlists' in json_obj:
            result.playlists = [
                self._parse_playlist(json)
                for json in json_obj['playlists']['items']
            ]
        if 'videos' in json_obj:
            result.videos = [
                self._parse_video(json) for json in json_obj['videos']['items']
            ]
        return result
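The parser only needs SearchResult to be a simple container with one list per media type. A minimal sketch; the real class in the library may carry more metadata:

class SearchResult:
    def __init__(self):
        # Each attribute holds the parsed items of one media type
        self.artists = []
        self.albums = []
        self.tracks = []
        self.playlists = []
        self.videos = []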
Example No. 8
def results(request):
    requested_query = request.GET['request']
    results = []
    query = q.free_query(requested_query)
    ranked_query = q.simple_ranked_results(query)
    urls = {}
    for url in database.get_urls():
        urls[url[0]] = url[1]
    for link in ranked_query:
        # Use an empty title when the URL has no entry in the lookup table
        title = urls.get(link[0], "")
        results.append(SearchResult(site_title=title, site_link=link[0]))
    return render(request, 'results.html', {
        'results': results,
        'query': requested_query
    })
Example No. 9
    def saveResult(self, tweet, query):
        try:
            sr = SearchResult()
            sr.screenName = tweet.user.screen_name
            sr.userId = tweet.user.id
            sr.tweetId = tweet.id
            #sr.text = tweet.text
            sr.query = query
            #sr.tweetedTime = tweet.created_at
            sr.lang = tweet.lang
            sr.save()

        except Exception as e:
            # MySQL error 1366 ("Incorrect string value"): the tweet text contains
            # characters the column encoding cannot store, so retry without it
            if e.args and e.args[0] == 1366:
                log.info("Error no 1366, removing text")
                sr.text = None
                sr.save()
            else:
                raise
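The helper sets attributes and calls save(), which suggests a Django-style model over a MySQL table. A minimal sketch, with field names taken from the snippet and all types being guesses:

from django.db import models

class SearchResult(models.Model):
    screenName = models.CharField(max_length=100)
    userId = models.BigIntegerField()
    tweetId = models.BigIntegerField()
    text = models.TextField(null=True)       # cleared when MySQL rejects the encoding
    query = models.CharField(max_length=255)
    tweetedTime = models.DateTimeField(null=True)
    lang = models.CharField(max_length=10)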
Example No. 10
    def saveResult(self, tweet, query):
        try:
            sr = SearchResult()
            sr.screenName = tweet.user.screen_name
            sr.userId = tweet.user.id
            sr.tweetId = tweet.id
            #sr.text = tweet.text
            sr.query = query
            #sr.tweetedTime = tweet.created_at
            sr.lang = tweet.lang
            sr.save()

        except Exception as e:
            # MySQL error 1366 ("Incorrect string value"): the tweet text contains
            # characters the column encoding cannot store, so retry without it
            if e.args and e.args[0] == 1366:
                log.info("Error no 1366, removing text")
                sr.text = None
                sr.save()
            else:
                raise
Example No. 11
def get_results(query, package, include_stack_overflow, fetch_index, search_id, api_key):

    # Make request for search results
    params = DEFAULT_PARAMS.copy()
    params['key'] = api_key
    params['cx'] = search_id
    params['q'] = query
    if not include_stack_overflow:
        params['siteSearch'] = 'stackoverflow.com'
        params['siteSearchFilter'] = 'e'  # 'e' for 'exclude'
    response = make_request(default_requests_session.get, SEARCH_URL, params=params)

    # Pause so that we don't bombard the server with requests
    time.sleep(REQUEST_DELAY)

    # If request resulted in error, the response is null.  Skip over this query.
    if response is None:
        return

    # Parse search results
    soup = BeautifulSoup(response.content, 'html.parser')
    url = soup.find('opensearch:Url')
    entry_count = len(soup.find_all('entry'))

    # The Atom spec for the search API
    # (https://developers.google.com/custom-search/json-api/v1/reference/cse/list#response)
    # mentions that the estimated results count may be a long integer.
    # To my knowledge, peewee (our ORM) doesn't support long integer fields.
    # So, I cast this to an integer instead and cross my fingers there is no overflow.
    search = Search.create(
        fetch_index=fetch_index,
        query=query,
        page_index=0,
        requested_count=REQUESTED_RESULT_COUNT,
        result_count_on_page=entry_count,
        estimated_results_count=int(
            soup.find('cse:searchinformation').find('cse:totalresults').text),
        package=package,
    )

    # Fetch the first "entry" or search result
    entry = soup.entry

    # Save all of the search results from first to last.
    # Maintaining consistency with our query scraping, ranking starts at 1.
    for rank in range(1, entry_count + 1):

        # Extract fields from the entry
        updated_datetime_without_milliseconds = re.sub(r'\.\d{3}Z', 'Z', entry.updated.text)
        updated_datetime = datetime.datetime.strptime(
            updated_datetime_without_milliseconds,
            "%Y-%m-%dT%H:%M:%SZ"
        )
        link = entry.link['href']
        snippet = entry.summary.string
        title = entry.title.text
        url = entry.id.text

        # Create a record for this search result
        SearchResult.create(
            search=search,
            title=title,
            snippet=snippet,
            link=link,
            url=url,
            updated_date=updated_datetime,
            rank=rank,
        )

        # To my knowledge, this is the only method for which it is strongly implied in
        # the BeautifulSoup documentation that you are fetching the next result
        # in the sequence.  I also assume that the search API is returning results
        # in the order of decreasing relevance, such that rank increases (gets bigger)
        # with each successive entry visited.
        entry = entry.find_next('entry')
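A minimal sketch of the peewee Search and SearchResult tables that get_results populates; the field names come from the create() calls above, while the types and database backend are assumptions:

from peewee import (Model, SqliteDatabase, CharField, TextField,
                    IntegerField, DateTimeField, ForeignKeyField)

db = SqliteDatabase('fetch.db')  # assumed backend

class Search(Model):
    fetch_index = IntegerField()
    query = CharField()
    page_index = IntegerField()
    requested_count = IntegerField()
    result_count_on_page = IntegerField()
    estimated_results_count = IntegerField()
    package = CharField()

    class Meta:
        database = db

class SearchResult(Model):
    search = ForeignKeyField(Search, backref='results')
    title = TextField()
    snippet = TextField()
    link = TextField()
    url = TextField()
    updated_date = DateTimeField()
    rank = IntegerField()

    class Meta:
        database = db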
Example No. 12
    with open('results.pickle', 'wb') as f:
        p = cPickle.Pickler(f)
        p.dump(sr)


if __name__ == '__main__':
    with open('queries.txt', 'r') as f:
        queries = f.readlines()
    random.shuffle(queries)
    search_results = []
    driver = webdriver.Firefox()
    for query in queries:
        if not query:
            continue
        src = get_text_with_query(driver, query)
        sr = SearchResult()
        sr.results += parse(src)
        sr.query = query

        time.sleep(WAIT)

        more_pages = parse_navigation(src)
        for p in more_pages:
            path = p.get('url')
            t = get_text(driver, BASE_URL + path)
            time.sleep(WAIT)
            buff = parse(t)
            sr.results += buff
        search_results.append(sr)

    driver.close()
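Here SearchResult is just a holder for a query and its accumulated hits. A minimal sketch; note that results must be created per instance in __init__, because `sr.results += ...` would otherwise mutate a list shared by all instances if it were a class attribute:

class SearchResult:
    def __init__(self):
        self.query = None
        self.results = []  # one entry per parsed hit, across all result pages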
Example No. 13
    with open('results.pickle', 'wb') as f:
        p = cPickle.Pickler(f)
        p.dump(sr)


if __name__ == '__main__':
    with open('queries.txt', 'r') as f:
        queries = f.readlines()
    random.shuffle(queries)
    search_results = []
    driver = webdriver.Firefox()
    for query in queries:
        if not query:
            continue
        src = get_text_with_query(driver, query)
        sr = SearchResult()
        sr.results += parse(src)
        sr.query = query

        time.sleep(WAIT)

        more_pages = parse_navigation(src)
        for p in more_pages:
            path = p.get('url')
            t = get_text(driver, BASE_URL+path)
            time.sleep(WAIT)
            buff = parse(t)
            sr.results += buff
        search_results.append(sr)

    driver.close()