def __init__(self, *args, **kwargs):
    super(SiteDataForm, self).__init__(*args, **kwargs)
    if self._request:
        # Limit the category choices to the current user's group.
        group = get_group(self._request)
        qs = self.fields['category'].queryset
        self.fields['category'].queryset = qs.filter(group__in=(group,)).distinct()
def __init__(self, *args, **kwargs):
    super(SearchForm, self).__init__(*args, **kwargs)
    if self._request:
        # Limit the engine choices to the current user's group.
        group = get_group(self._request)
        qs = self.fields['engine'].queryset
        self.fields['engine'].queryset = qs.filter(group__in=(group,)).distinct()
def scrap_info_edit_do(request, *args, **kwargs):
    # page = request.GET.get('page', '1')
    group = get_group(request)
    siteid = request.POST.get('siteid')
    productid = request.POST.get('productid')
    scraperid = request.POST.get('scraperid')
    sitedataid = request.POST.get('sitedataid')
    descriptors = KeyValueDescriptor.objects.filter(active=True, group__in=(group,))
    product = Product.objects.filter(id=productid, group__in=(group,))
    site = SiteData.objects.filter(id=siteid, group__in=(group,))
    sitedata = SiteData.objects.filter(id=sitedataid, group__in=(group,))
    scraper = ScraperDescriptor.objects.filter(id=scraperid)
    if sitedata.exists():
        descriptors = descriptors.filter(target__symbol='SiteInfo')
    else:
        descriptors = descriptors.filter(target__symbol='ProductInfo')
    kvs = []
    for d in descriptors:
        if d.symbol in request.POST:
            # URL descriptors are handled separately below.
            if site.exists() and d.symbol == 'URL_PROD':
                continue
            if sitedata.exists() and d.symbol == 'URL_SHOP':
                continue
            val = request.POST.get(d.symbol).strip()
            if val:
                kvs.append(KeyValue.objects.create(descriptor=d, value=val))
    if kvs:
        if sitedata.exists():
            si = SiteInfo.objects.create(site=sitedata[0], automatic=False)
            si.values.add(*kvs)
            si.save()
        elif product.exists():
            pi = ProductInfo.objects.create(product=product[0], automatic=False)
            pi.values.add(*kvs)
            pi.save()
        else:
            url = request.POST['URL_PROD']
            name = request.POST['NAME_PROD']
            if not scraper.exists():
                if site.exists():
                    scraper = ScraperDescriptor.objects.create(site=site[0])
                else:
                    return
            else:
                scraper = scraper[0]
            scraper.url = url
            scraper.name = name
            scraper.items.clear()
            scraper.items.add(*kvs)
            scraper.save()
            page = full_html_page(url)
            obj = json.dumps(page_to_dict(page))
            # Deactivate previous templates before storing the new one.
            for st in ScraperTemplate.objects.filter(descriptor=scraper):
                st.active = False
                st.save()
            st = ScraperTemplate.objects.create(descriptor=scraper, value=obj)
            st.save()
def get_category(request, group=None):
    if not group:
        group = get_group(request)
    default_category = None
    try:
        default_category = SiteCategory.objects.get(default=True, group__in=(group,))
        default_category = default_category.symbol
    except Exception:
        pass
    return request.GET.get('category', default_category)
def scrap_site_info(request, siteid, edit, *args, **kwargs):
    edit = edit == 'edit'
    group = get_group(request)
    descriptors = KeyValueDescriptor.objects.filter(
        active=True, group__in=(group,),
        target__symbol='SiteInfo').order_by('order')
    site = SiteData.objects.get(id=siteid)
    results = get_info(SiteInfo.objects.filter(site__id=siteid), descriptors, edit=edit)
    return {
        'sitedata': site,
        'results': results,
        'edit': edit,
    }
def scrap(request, *args, **kwargs):
    page = request.GET.get('page', '1')
    banned = 'banned' in request.GET
    group = get_group(request)
    results = Product.objects.filter(group=group, banned=banned)
    results, pages, _ = paginate(results, page, 15)
    return {
        'results': results,
        'pages': pages,
        'banned': banned,
    }
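# The views in this section call a shared ``paginate(queryset, page, per_page)``
# helper that is defined elsewhere in the project. Its real implementation is not
# shown here; the following is only a minimal sketch of what such a helper could
# look like, assuming it wraps Django's built-in Paginator and returns the current
# page's objects, the paginator's page range, and the page object itself.
from django.core.paginator import Paginator, EmptyPage, PageNotAnInteger

def paginate(queryset, page, per_page):
    """Hypothetical sketch: return (objects, page_range, page_obj)."""
    paginator = Paginator(queryset, per_page)
    try:
        page_obj = paginator.page(page)
    except PageNotAnInteger:
        page_obj = paginator.page(1)  # non-numeric page -> first page
    except EmptyPage:
        page_obj = paginator.page(paginator.num_pages)  # out of range -> last page
    return page_obj.object_list, paginator.page_range, page_obj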
def scrap_product_info(request, productid, edit, *args, **kwargs):
    edit = edit == 'edit'
    group = get_group(request)
    descriptors = KeyValueDescriptor.objects.filter(
        active=True, group__in=(group,),
        target__symbol='ProductInfo').order_by('order')
    product = Product.objects.get(id=productid)
    results = get_info(ProductInfo.objects.filter(product__id=productid), descriptors, edit=edit)
    return {
        'product': product,
        'results': results,
        'edit': edit,
    }
def search_add(request, forms):
    group = get_group(request)
    form = forms['add']
    if not form.is_valid():
        return HttpResponseServerError()
    form.save()
    search = form.instance
    search.manual = True
    search.group = group
    search.save()
    messages.success(request, 'Data successfully saved.')
    return {
        'searchid': search.id,
    }
def search_results_all(request, forms):
    group = get_group(request)
    category = get_category(request, group)
    format = request.GET.get('format')
    page = request.GET.get('page', '1')
    q = request.GET.get('find', '')
    categories = SiteCategory.objects.filter(active=True, group__in=(group,)).order_by("id")
    results = SearchResult.objects.filter(search__group=group).distinct(
        'site__id', 'site__fresh', 'site__site__name')
    if q:
        results = results.filter(site__site__url__icontains=q)
    categories = list((cat, results.filter(site__category=cat, site__banned=False).count())
                      for cat in categories)
    result_counts = dict(
        banned=0,  # speed up # results.filter(site__banned=True).count(),
        other=results.filter(site__category__isnull=True, site__banned=False).count(),
    )
    if category:
        if category == 'banned':
            results = results.filter(site__banned=True)
        elif category == 'other':
            results = results.filter(site__category__isnull=True, site__banned=False)
        else:
            results = results.filter(site__category__symbol=category, site__banned=False)
    results = results.order_by('-site__fresh', 'site__site__name')
    if format == 'csv':
        return get_csv(results, category)
    results, pages, _ = paginate(results, page, 15)
    return {
        'forms': forms,
        'results': results,
        'pages': pages,
        'categories': categories,
        'category': category,
        'result_counts': result_counts,
        'q': q,
    }
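# ``get_csv`` (used above and in search_results below) is also defined outside
# this section. The sketch below is only an assumption about its shape: it writes
# the filtered SearchResult rows into a CSV attachment. The column names mirror
# the lookups used in these views; the actual export format of the project is not
# shown here.
import csv
from django.http import HttpResponse

def get_csv(results, category):
    """Hypothetical sketch: serialize search results to a CSV response."""
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="results_%s.csv"' % (category or 'all')
    writer = csv.writer(response)
    writer.writerow(['url', 'name', 'category', 'seq'])
    for result in results:
        writer.writerow([
            result.site.site.url,
            result.site.site.name,
            result.site.category.symbol if result.site.category else '',
            result.seq,
        ])
    return response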
def scrap_site_edit(request, siteid, scraperid=None, *args, **kwargs):
    group = get_group(request)
    site = SiteData.objects.get(id=siteid)
    descriptors = KeyValueDescriptor.objects.filter(
        active=True, visible=True, group__in=(group,),
        target__symbol='ProductInfo').order_by('order')
    items = {}
    scraper = None
    if scraperid:
        scraper = ScraperDescriptor.objects.filter(id=scraperid)
        if scraper.exists():
            scraper = scraper[0]
            items = dict((i.descriptor.symbol, i) for i in scraper.items.all())
    results = [(items.get(d.symbol), d) for d in descriptors if d.symbol != 'URL_PROD']
    return {
        'site': site,
        'scraper': scraper,
        'results': results,
        'edit': True,
    }
def scrap_export(request, typ, attr, *args, **kwargs):
    group = get_group(request)
    site = request.GET.get('site', None)
    product = request.GET.get('product', None)
    category = ('approved',)
    qs = []
    descriptors = KeyValueDescriptor.objects.filter(active=True, group__in=(group,))
    if typ == 'product':
        qs = ProductInfo.objects.filter(product__page__site__category__symbol__in=category)
        if site:
            qs = qs.filter(product__page__site__id=site)
        descriptors = descriptors.filter(target__symbol='ProductInfo')
        fn = 'product_info'
    elif typ == 'site':
        qs = SiteInfo.objects.filter(site__category__symbol__in=category)
        if site:
            qs = qs.filter(site__id=site)
        descriptors = descriptors.filter(target__symbol='SiteInfo')
        fn = 'site_info'
    elif typ == 'history':
        descriptors = descriptors.filter(symbol=attr)
        if not descriptors.exists():
            return
        if all(map(lambda d: d.target.symbol == 'SiteInfo', descriptors)):
            qs = SiteInfo.objects.filter(site__category__symbol__in=category,
                                         values__descriptor__in=descriptors)
            if site:
                qs = qs.filter(site__id=site)
        elif all(map(lambda d: d.target.symbol == 'ProductInfo', descriptors)):
            qs = ProductInfo.objects.filter(product__page__site__category__symbol__in=category,
                                            values__descriptor__in=descriptors)
            if product:
                qs = qs.filter(product__id=product)
        fn = '%s_history' % descriptors[0].symbol
    else:
        return
    descriptors = descriptors.order_by('order')
    return get_info_csv(qs, descriptors, fn)
def scrap_site(request, siteid, *args, **kwargs):
    page = request.GET.get('page', '1')
    banned = 'banned' in request.GET
    group = get_group(request)
    sd = SiteData.objects.filter(id=siteid, group=group)
    if sd.exists():
        sd = sd[0]
        results = Product.objects.filter(page__site=sd, banned=banned)
    else:
        results = []
        sd = None
    scrapers = ScraperDescriptor.objects.filter(site=sd)
    results, pages, _ = paginate(results, page, 15)
    descriptors = KeyValueDescriptor.objects.filter(active=True, group__in=(group,))
    return {
        'site': sd,
        'results': results,
        'pages': pages,
        'scrapers': scrapers,
        'descriptors': descriptors,
        'screenshots': PrintScreen.objects.filter(site=sd).order_by('-date'),
        'banned': banned,
    }
def scrap_plugin(request, *args, **kwargs):
    group = get_group(request)
    descriptors = KeyValueDescriptor.objects.filter(
        active=True, group__in=(group,),
        target__symbol='ProductInfo').order_by('order')
    return {
        'descriptors': descriptors,
    }
def site_add(request, forms, searchid):
    group = get_group(request)
    search = Search.objects.get(id=int(searchid))
    nexturl = request.GET.get('nexturl')
    site_form = forms['site']
    search_result_form = forms['search_result']
    try:
        # TODO move it to Model constructor
        site = Site.objects.get(url=url_depath(site_form.data.get('url')))
        site_form = SiteForm(request.POST, instance=site)
    except Exception:
        pass
    sequence = search_result_form.data.get('sequence')
    if not site_form.is_valid() or not search_result_form.is_valid():
        return
    site = site_form.save()
    try:
        site_data = SiteData.objects.get(site=site, group=group)
    except Exception:
        site_data = SiteData.objects.create(site=site, group=group, found=search)
    category = get_category(request, group)
    if category:
        if category == 'banned':
            site_data.banned = True
        else:
            site_data.banned = False
            if category == 'other':
                site_data.category = None
            else:
                site_data.category = SiteCategory.objects.get(symbol=category)
        category_changed(site_data)
    site_data.manual = True
    site_data.save()
    search_result = SearchResult.objects.filter(site=site_data, search=search)
    if search_result:
        search_result = search_result[0]
    else:
        search_result = SearchResult(site=site_data, search=search)
    # Merge the posted sequence with any sequence already stored on the result.
    str2list = combinator(
        lambda s: list(s) if isinstance(s, tuple) else [s],
        lambda s: eval(s) if s else tuple()
    )
    list2str = lambda s: ','.join(map(str, s))
    sequence = str2list(sequence)
    if search_result:
        sequence += str2list(search_result.sequence)
    sequence = sorted(set(sequence))
    search_result.sequence = list2str(sequence)
    search_result.seq = sequence[0] if len(sequence) > 0 else 0
    search_result.save()
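# ``combinator`` is used above to build ``str2list`` but is defined elsewhere in
# the project. The sketch below is only an assumption about its behaviour: a
# right-to-left function composition helper, so that combinator(f, g)(x) == f(g(x)).
# The reduce-based implementation is illustrative, not the project's actual code.
from functools import reduce

def combinator(*funcs):
    """Hypothetical sketch: compose the given functions right to left."""
    def composed(value):
        return reduce(lambda acc, fn: fn(acc), reversed(funcs), value)
    return composed

# Under that assumption, str2list("(3, 7)") first eval()s the string into the
# tuple (3, 7) and then converts it into the list [3, 7].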
def search_results(request, searchid, forms):
    group = get_group(request)
    category = get_category(request, group)
    format = request.GET.get('format')
    page = request.GET.get('page', '1')
    q = request.GET.get('find', '')
    try:
        search = Search.objects.get(id=int(searchid), group=group)
    except Exception:
        return HttpResponseNotFound()
    if is_categorizing(search):
        messages.warning(request, 'Categorization in progress. Refresh to check status.')
    else:
        s = is_success(search)
        if s is not None:
            if s:
                messages.success(request, 'Categorization successfully ended.')
            else:
                messages.error(request, 'Categorization failed.')
    forms['search'] = forms['search2'] = SearchForm(instance=search, request=request)
    categories = SiteCategory.objects.filter(active=True, group__in=(group,)).order_by("id")
    if search:
        results = SearchResult.objects.filter(search=search)
        if q:
            results = results.filter(site__site__url__icontains=q)
        categories = list((cat, results.filter(site__category=cat, site__banned=False).count())
                          for cat in categories)
        result_counts = dict(
            banned=0,  # speed up # results.filter(site__banned=True).count(),
            other=results.filter(site__category__isnull=True, site__banned=False).count(),
        )
        if category:
            if category == 'banned':
                results = results.filter(site__banned=True)
            elif category == 'other':
                results = results.filter(site__category__isnull=True, site__banned=False)
            else:
                results = results.filter(site__category__symbol=category, site__banned=False)
        results = results.order_by('-site__fresh', 'site__site__name', 'seq')
    else:
        categories = list((cat, 0) for cat in categories)
        result_counts = dict(banned=0, other=0)
        results = []
    if format == 'csv':
        return get_csv(results, category)
    results, pages, _ = paginate(results, page, 15)
    return {
        'forms': forms,
        'results': results,
        'pages': pages,
        'categories': categories,
        'category': category,
        'result_counts': result_counts,
        'q': q,
        'search': search,
    }
def do_search(query, engine, user=None, count=20, categorize=True):
    # Accept either model instances or plain identifiers for user, engine and query.
    if not isinstance(user, User) and user is not None:
        user = User.objects.get(username=user)
    if not isinstance(engine, Engine):
        engine = Engine.objects.get(symbol=engine)
    if not isinstance(query, Query):
        q = query
        query = None
    else:
        q = query.q
    if engine.symbol not in engines:
        return
    logger.debug('Invoking query "%s" using engine "%s".' % (q, engine.name))
    results = engines[engine.symbol](q=q)
    group = None
    if query:
        search = Search.objects.create(engine=engine, query=query, q=q)
        group = query.group
    else:
        search = Search.objects.create(engine=engine, q=q)
    if user:
        group = get_group(user)
    search.group = group
    search.manual = user is not None
    search.user = user
    search.save()
    logger.debug('Search id=%d.' % (search.id,))
    for res in itake(count, results):
        seq = res.get('_seq') + 1
        bare_url = url_depath(res['url'])  # remove path from url
        site_data = site = None
        try:
            site = Site.objects.get(url=bare_url)
            site_data = SiteData.objects.get(site=site, group=group)
        except Exception:
            pass
        if site_data:
            logger.debug('%02d. Old result [id=%d] "%s".' % (seq, search.id, res['url']))
            # once processed in this run, we continue
            try:
                sr = SearchResult.objects.get(site=site_data, search=search)
                if sr:
                    sr.sequence += ', %s' % seq
                    sr.save()
                    continue
            except Exception:
                pass
        fresh = False
        if not site_data:
            logger.debug('%02d. New result [id=%d] "%s".' % (seq, search.id, res['url']))
            if not site:
                site = Site(name=res.get('title') or '(no title found)', url=bare_url)
                site.save()
                fresh = True
            site_data = SiteData(site=site, group=group, banned=False,
                                 fresh=fresh, found=search)
            site_data.save()
        search_result = SearchResult.objects.create(search=search, sequence=seq,
                                                    seq=seq, site=site_data)
        search_result.save()
    if categorize:
        ThreadList.thread(search.id, partial(do_categorize, search))
    return search
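# Illustrative usage only: ``do_search`` accepts either a Query instance or a raw
# query string, and either a User instance or a username. The engine symbol
# 'google' and the username 'analyst' below are made-up placeholders, not values
# taken from this project.
def _example_do_search():
    """Hypothetical sketch of calling do_search from a shell or a background task."""
    search = do_search('running shoes', 'google', user='analyst',
                       count=10, categorize=False)
    if search is not None:
        # Results are stored as SearchResult rows linked to the new Search.
        return SearchResult.objects.filter(search=search).count()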