def get_context_data(self, **kwargs):
    """Build the template context for the admin user-management console.

    Adds the shared ``common`` block (title, menu index, session user and
    application version) and ``profiles``: every document found in
    ``webscr_profiles.users``, each extended with a string ``id`` and a
    boolean ``is_admin``.

    :returns: the context produced by the parent view, extended as above.
    """
    context = super(AdminConsoleView, self).get_context_data(**kwargs)

    # User profiles live in the shared profiles database.
    db = CustomAppSettings.get_mongo().webscr_profiles

    user = self.request.session['user']

    context['common'] = {
        'title': 'User management console',
        # Legacy key, kept so anything already reading it keeps working.
        'current_menu_title': 4,
        # Key the sibling views use for the active menu entry.
        'current_menu_item': 4,
        'user': user,
    }

    profiles = []
    for profile in db.users.find():
        # Expose the ObjectId under a plain "id" key since "_id" can't be
        # used directly (Django templates reject names starting with "_").
        profile['id'] = str(profile['_id'])
        # .get(): a profile document without a role is simply not an admin.
        profile['is_admin'] = profile.get('role') == 'admin'
        profiles.append(profile)

    context['profiles'] = profiles
    context['common']['version'] = CustomAppSettings.get_version()
    return context
def get_context_data(self, **kwargs):
    """Assemble the template context for the index page.

    Exposes ``top_count`` (fixed at 30) and the shared ``common`` block
    holding the page title, menu index, the Vissbl user and the version.
    """
    context = super(IndexView, self).get_context_data(**kwargs)

    # The index page is always rendered on behalf of the Vissbl user.
    vissbl_user = self.get_user(CustomAppSettings.get_vissbl_user_id())

    # NOTE: ranking stats (top results / last rank) are not injected into
    # the context here; that code path is disabled.
    context['top_count'] = 30

    common = dict()
    context['common'] = common
    common['title'] = 'Vissbl - real online ranking'
    common['current_menu_item'] = 0
    common['user'] = vissbl_user
    common['version'] = CustomAppSettings.get_version()
    return context
def get_context_data(self, **kwargs):
    """Assemble the template context for the task monitor page.

    Besides the shared ``common`` block, the context carries the listed
    items, the start of the current UTC day (``from_date``), the total
    number of results and ``n``, the smaller of ``self.n`` and the total.
    """
    context = super(MonitorView, self).get_context_data(**kwargs)
    listed = self.object_list

    # Midnight (UTC) of the current day.
    day_start = datetime.datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0)

    session_user = self.request.session['user']

    common = dict()
    context['common'] = common
    context['itms'] = listed
    context['from_date'] = day_start
    context['total_results'] = self.total_results
    # Never report more rows than were actually found.
    context['n'] = min(self.total_results, self.n)

    common['title'] = 'Monitor scheduled tasks'
    common['current_menu_item'] = 1
    common['user'] = session_user
    common['version'] = CustomAppSettings.get_version()
    return context
def get_context_data(self, **kwargs):
    """Assemble the template context for the scheduled-jobs page.

    ``self.object_list`` is unpacked as
    ``(max_allowed_jobs, active_jobs, inactive_jobs)``; active jobs are
    listed before inactive ones in ``itms``.
    """
    context = super(ScheduledJobsView, self).get_context_data(**kwargs)
    job_limit, running, paused = self.object_list

    session_user = self.request.session['user']
    selected, available = Frequency().get_context_friendly_frequencies()

    # Job listing and counters.
    context['itms'] = running + paused
    context['max_allowed_jobs'] = job_limit
    context['active_jobs_count'] = len(running)
    context['inactive_jobs_count'] = len(paused)

    # Job form state: this page edits existing jobs.
    context['is_new'] = False
    context['selected_frequency'] = selected
    context['frequencies'] = available
    context['job_form_action'] = './'

    common = dict()
    context['common'] = common
    common['title'] = 'Scheduled jobs'
    common['current_menu_item'] = 2
    common['user'] = session_user
    common['version'] = CustomAppSettings.get_version()
    return context
def get_queryset(self):
    """Return the newest matching ``urls`` documents of the session user.

    The query document comes from ``self.create_query_doc()``. Only the
    summary fields needed for listing are fetched, newest
    ``date_scraped`` first, and at most ``self.n`` documents are returned.

    Side effect: ``self.total_results`` is set to the cursor's count,
    taken before the limit is applied.

    :returns: list of documents (dicts).
    """
    cl = CustomAppSettings.get_mongo()
    try:
        query = self.create_query_doc()
        user = self.request.session['user']
        db = cl[user['db']]

        fields = {'date_scraped': 1, 'estimated_res': 1, 'task_id': 1,
                  'url': 1, 'query': 1, 'domain': 1, 'scheduled': 1}
        res = db.urls.find(query, fields).sort([('date_scraped', -1)])
        try:
            self.total_results = res.count()
            return list(res.limit(self.n))
        finally:
            # Release the cursor even if counting or iterating fails.
            res.close()
    finally:
        # Same for the client: close it on every exit path.
        cl.close()
def clean(self):
    """Validate the form and resolve the secret key to a profile.

    When field validation already failed, the cleaned data is returned
    untouched. Otherwise the key is looked up in ``webscr_profiles.users``:
    on a match ``profile_id`` (string) is added to the cleaned data; on a
    miss an "Invalid secret key" error is attached to the field and the
    key is removed from the cleaned data.
    """
    cleaned_data = super(LoginForm, self).clean()

    # Nothing to look up if the basic validation did not pass.
    if self._errors:
        return cleaned_data

    secret_key = cleaned_data.get('secret_key')

    users = CustomAppSettings.get_mongo().webscr_profiles.users
    profile = users.find_one({'secret_key': secret_key}, {'_id': 1})

    if not profile:
        msg = u"Invalid secret key"
        self._errors['secret_key'] = self.error_class([msg])
        del cleaned_data['secret_key']
        return cleaned_data

    cleaned_data['profile_id'] = str(profile['_id'])
    return cleaned_data
def get_context_data(self, **kwargs):
    """Assemble the template context for the export preview page.

    Empty keyword/domain filters are shown as ``'*'``; the placeholders
    are taken from the first listed item, or are empty when nothing is
    listed. ``n`` is the smaller of ``self.n`` and the total result count.
    """
    context = super(ExportPreviewView, self).get_context_data(**kwargs)
    itms = self.object_list
    session_user = self.request.session['user']

    common = dict()
    context['common'] = common
    context['itms'] = itms
    context['from_date'] = self.d_from
    context['to_date'] = self.d_to

    # Keyword filter and its placeholder.
    if len(self.kw) > 0:
        context['kw'] = self.kw
    else:
        context['kw'] = '*'
    if len(itms) > 0:
        context['kw_placeholder'] = itms[0]['query']['q']
    else:
        context['kw_placeholder'] = ''

    # Domain filter and its placeholder.
    if len(self.domain) > 0:
        context['domain'] = self.domain
    else:
        context['domain'] = '*'
    if len(itms) > 0:
        context['domain_placeholder'] = itms[0]['domain']
    else:
        context['domain_placeholder'] = ''

    context['scheduled'] = self.scheduled
    context['total_results'] = self.total_results
    context['n'] = min(self.total_results, self.n)

    common['title'] = 'Export data'
    common['current_menu_item'] = 3
    common['user'] = session_user
    common['version'] = CustomAppSettings.get_version()
    return context
def generate_secret_key(self):
    """Create a new secret key and store it on this user's profile.

    The key is written with an upserting ``$set`` on the ``users``
    document whose ``_id`` is this user's id.

    :returns: ``(secret_key, update_result)``.
    """
    new_key = CustomAppSettings.get_new_secret_key()

    users = self.get_user_profiles_db().users
    selector = {'_id': ObjectId(self.get_user_id())}
    change = {'$set': {'secret_key': new_key}}
    outcome = users.update(spec=selector, document=change, upsert=True)

    return new_key, outcome
def query(self):
    """Return the Vissbl user's top results plus the latest ranks.

    :returns: dict with ``data`` (everything ``get_top_results()``
        returned) and ``last_date_ranks`` (first 30 ranks of the last
        entry in ``data``, or an empty list when there is no data).
    """
    vissbl_user = self.get_user(CustomAppSettings.get_vissbl_user_id())

    data = User(vissbl_user).get_user_stats_object().get_top_results()

    # Ranks of the most recent entry; nothing to show without data.
    if len(data) > 0:
        latest_ranks = data[-1]['value']['ranks']
    else:
        latest_ranks = []

    return {"data": data, "last_date_ranks": latest_ranks[:30]}
def _track_proxy_connection(self, ):
    """Count one proxy call in the ``calls`` collection of ``self._db``.

    The counter document is keyed by the current UTC year, month, day,
    hour, minute and second; it is created on first use (upsert) and its
    ``count`` field is incremented by one.
    """
    # The database handle prepared on the instance is all that is needed;
    # no separate client is obtained here, so nothing is left unclosed.
    now = datetime.datetime.utcnow()

    self._db.calls.find_and_modify(
        query={'date.y': now.year, 'date.m': now.month, 'date.d': now.day,
               'date.h': now.hour, 'date.n': now.minute,
               'date.s': now.second},
        update={'$inc': {'count': 1}},
        upsert=True)
def get_context_data(self, **kwargs):
    """Assemble the template context for the login page.

    Sets the shared ``common`` block (title, menu index, version) and the
    ``ignore_login`` flag.
    """
    context = super(LoginView, self).get_context_data(**kwargs)

    common = dict()
    context['common'] = common
    common['title'] = 'Login'
    common['current_menu_item'] = 3
    common['version'] = CustomAppSettings.get_version()

    context['ignore_login'] = True
    return context
def get_redirect_url(self, *args, **kwargs):
    """Update a profile's activation state, then go to the user console.

    Only when the session user is an admin: the profile identified by
    ``kwargs['profile_id']`` gets ``is_active`` incremented by one and
    ``actdeact`` stamped with the current UTC time. Profiles whose role
    is ``admin`` are excluded by the update selector.

    :returns: URL of ``webscraper:userconsole``.
    """
    db = CustomAppSettings.get_mongo().webscr_profiles

    if self.request.session['user']['is_admin']:
        # The role filter keeps admin profiles from being touched.
        selector = {'_id': ObjectId(kwargs['profile_id']),
                    'role': {'$ne': 'admin'}}
        change = {'$inc': {'is_active': 1},
                  '$set': {'actdeact': datetime.datetime.utcnow()}}
        db.users.update(selector, change)

    return reverse('webscraper:userconsole')
def get_context_data(self, **kwargs):
    """Put the stored results of one task into the context as ``itms``.

    The document is looked up by ``self.taskid`` in the session user's
    ``urls`` collection; ``itms`` is its ``results`` field, or whatever
    falsy value the lookup returned when no document matched.
    """
    context = super(ExportPreviewTaskView, self).get_context_data(**kwargs)

    cl = CustomAppSettings.get_mongo()
    session_user = self.request.session['user']
    urls = cl[session_user['db']].urls

    found = urls.find_one({'task_id': self.taskid}, {'results': 1})
    context['itms'] = found['results'] if found else found
    return context
def do_edit(self, kwargs):
    """Set a new alias on a profile; only admins may do this.

    :param kwargs: URL keyword arguments; must contain ``profile_id``.
    :returns: ``'NOT ADMIN'`` when the session user is not an admin,
        ``'WRONG ALIAS'`` when the posted alias is missing or blank,
        ``'OK'`` after the update was issued.
    """
    if not self.request.session['user']['is_admin']:
        return 'NOT ADMIN'

    alias = self.request.POST.get('alias')
    profile_id = kwargs['profile_id']

    # A request without the field yields None; treat it like a blank
    # alias instead of failing on None.strip().
    if alias is None or not alias.strip():
        return 'WRONG ALIAS'

    db = CustomAppSettings.get_mongo().webscr_profiles
    db.users.update({'_id': ObjectId(profile_id)},
                    {'$set': {'alias': alias}})
    return 'OK'
def maintain(self):
    """Remove old task data for every active user and log the counts.

    For each user returned by ``User.get_active_users()`` the ids from
    ``get_old_task_ids()`` (capped at ``MAX_IN_LIMIT``) are removed from
    the user's ``task_meta`` and ``urls`` collections and from
    ``celery_db.task_meta``; a ``cleaning_stats`` document recording the
    user, the time and the number of ids is then inserted.

    :returns: ``{'total': <sum of per-user id counts>}``.
    """
    from webscraper.entities import User

    active_users = User.get_active_users()

    # Cap on how many ids go into a single $in clause.
    MAX_IN_LIMIT = 4000000

    removed_total = 0
    celery_db = CustomAppSettings.get_mongo().celery_db

    for user in active_users:
        stale_ids = user.get_old_task_ids()[:MAX_IN_LIMIT]
        stale_count = len(stale_ids)
        removed_total += stale_count

        # Per-user collections first, then the shared celery results.
        user_db = user.get_user_db()
        user_db.task_meta.remove({'_id': {'$in': stale_ids}})
        user_db.urls.remove({'task_id': {'$in': stale_ids}})
        celery_db.task_meta.remove({'_id': {'$in': stale_ids}})

        # Record what was cleaned for this user.
        stats = user.get_user_profiles_db().cleaning_stats
        stats.insert({'user': user.get_user_id(),
                      'date_done': datetime.datetime.utcnow(),
                      'total_removed': stale_count})

    return {'total': removed_total}
def get_queryset_csv(self):
    """Run the export query and return it as a CSV stream with a filename.

    The session user's ``urls`` documents matching
    ``self.create_query_doc()`` are sorted newest ``date_scraped`` first
    and handed, as a cursor, to ``self.create_csv_stream``.

    :returns: ``(csvstream, fname)`` where ``fname`` embeds the
        ``d_from`` / ``d_to`` timestamps.
    """
    query = self.create_query_doc()

    cl = CustomAppSettings.get_mongo()
    session_user = self.request.session['user']
    # NOTE(review): unlike the Excel export, cursor and client are not
    # closed here -- confirm whether create_csv_stream consumes the
    # cursor before returning.
    cursor = cl[session_user['db']].urls.find(query)
    cursor = cursor.sort([('date_scraped', -1)])

    stamp = '%Y-%m-%d %H:%M:%S'
    fname = 'export_{0}_{1}.csv'.format(self.d_from.strftime(stamp),
                                        self.d_to.strftime(stamp))
    csvstream = self.create_csv_stream(cursor)
    return csvstream, fname
def get_queryset_excel(self):
    """Run the export query and return the results as an Excel stream.

    All of the session user's ``urls`` documents matching
    ``self.create_query_doc()`` are loaded, newest ``date_scraped``
    first, and passed to ``self.create_excel_stream``.

    :returns: whatever ``self.create_excel_stream`` returns.
    """
    query = self.create_query_doc()

    cl = CustomAppSettings.get_mongo()
    try:
        user = self.request.session['user']
        db = cl[user['db']]
        res = db.urls.find(query).sort([('date_scraped', -1)])
        try:
            queryset = list(res)
        finally:
            # Release the cursor even if loading the documents fails.
            res.close()
    finally:
        # Same for the client: close it on every exit path.
        cl.close()

    return self.create_excel_stream(queryset)
def get_context_data(self, **kwargs):
    """Assemble the template context for the user console page.

    ``userinfo`` comes from ``self._get_user_info()``; its
    ``info.requests_per_day`` is additionally serialised to JSON under
    ``info.requests_per_day_json``.
    """
    context = super(UserConsoleView, self).get_context_data(**kwargs)

    session_user = self.request.session['user']
    # Stored on the view before the statistics are gathered
    # (presumably read by _get_user_info -- verify).
    self.user = session_user

    userinfo = self._get_user_info()
    info = userinfo['info']
    info['requests_per_day_json'] = json.dumps(
        info['requests_per_day'], default=json_util.default)
    context['userinfo'] = userinfo

    common = dict()
    context['common'] = common
    common['title'] = 'User console'
    common['current_menu_item'] = 4
    common['user'] = session_user
    common['version'] = CustomAppSettings.get_version()
    return context
def get_context_data(self, **kwargs):
    """Assemble the template context for the scrape (new job) page.

    Exposes the job-form state (new job, frequencies, form action),
    whether the user may schedule URLs and has points, and the shared
    ``common`` block.
    """
    context = super(ScrapeView, self).get_context_data(**kwargs)

    session_user = self.request.session['user']
    account = self.get_user()
    selected, available = Frequency().get_context_friendly_frequencies()

    # Job form state: this page always creates a new job.
    context['is_new'] = True
    context['can_schedule'] = account.can_schedule_urls()
    context['selected_frequency'] = selected
    context['frequencies'] = available
    context['job_form_action'] = './jobs/'
    context['has_points'] = account.has_points()
    context['is_active'] = True

    common = dict()
    context['common'] = common
    common['title'] = 'Webscraper'
    common['current_menu_item'] = 0
    common['user'] = session_user
    common['version'] = CustomAppSettings.get_version()
    return context
def _get_db_connection(self):
    """Return the Mongo client supplied by ``CustomAppSettings.get_mongo()``."""
    connection = CustomAppSettings.get_mongo()
    return connection
def _db_write_res(self, **kwargs):
    """Write one scrape result into the user's ``urls`` collection.

    Required keyword arguments: ``task_id``, ``url``, ``group_name``,
    ``domain``, ``query`` (dict), ``tot_res`` and ``results`` (list of
    dicts, each with a ``type`` key).

    Stored document structure::

        {
          task_id: '',
          date_scraped: date,
          url: '',
          group_name: '',
          domain: '',
          query: {q: '', start: 0, num: 0},
          estimated_res: 0,
          scheduled: false,
          results: [{
              type: '', title: '', title_url: '', tld: '', content: '',
              content_time: '', related_links: ['', '', ...]
          }, {...}],
          typed_results: {<result type>: [index, index, ...], ...}
        }

    Side effect: every element of ``results`` gets a ``'#'`` key holding
    its position in the list.

    :returns: the id returned by the collection's ``insert``.
    """
    client = CustomAppSettings.get_mongo()
    urls = client[self.user['db']].urls

    # Only q / num / start are kept from the URL query; any other
    # parameter is dropped and missing ones keep their defaults.
    db_query = {"q": '', 'num': 0, 'start': 0}
    for key, value in kwargs['query'].items():
        if key in db_query:
            db_query[key] = value

    doc = {
        "task_id": kwargs['task_id'],
        "date_scraped": datetime.datetime.utcnow(),
        "url": kwargs['url'],
        "group_name": kwargs['group_name'],
        "domain": kwargs['domain'],
        "query": db_query,
        "estimated_res": kwargs['tot_res'],
        "results": kwargs['results'],
        "typed_results": {},
        "scheduled": self.is_scheduled
    }

    # Index results by type: each type maps to the positions of its
    # results inside the "results" list.
    typed = doc['typed_results']
    for ind, el in enumerate(kwargs['results']):
        el['#'] = ind
        typed.setdefault(el['type'], []).append(ind)

    return urls.insert(doc)
def _give_points_back(self, reduction):
    """Add ``reduction`` points back to this user's profile.

    :param reduction: amount to add to the ``points`` field.
    :returns: the result of the collection's ``update`` call.
    """
    users = CustomAppSettings.get_mongo().webscr_profiles.users
    selector = {'_id': ObjectId(self.user['_id'])}
    refund = {'$inc': {'points': reduction}}
    # NOTE(review): ``new`` looks like a find_and_modify option carried
    # over to update() -- confirm what the driver does with it here.
    return users.update(spec=selector, document=refund, new=True)
def _reduce_points(self, reduction):
    """Subtract ``reduction`` points from this user's profile.

    :param reduction: amount to subtract from the ``points`` field.
    :returns: the result of ``find_and_modify`` called with ``new=True``.
    """
    users = CustomAppSettings.get_mongo().webscr_profiles.users
    selector = {'_id': ObjectId(self.user['_id'])}
    charge = {'$inc': {'points': -reduction}}
    return users.find_and_modify(query=selector, update=charge, new=True)
def scrape(self, url, return_data):
    """Scrape one search URL for the current task and store the results.

    Steps visible here: UTF-8 encode the URL, prepend ``http://`` when it
    has no scheme, flatten its query string, and -- only when the first
    dot-separated part of the value returned by
    ``PublicSuffixList.get_public_suffix`` is "google" -- send the request
    through ``ProxyConnection``, parse the returned HTML and write the
    outcome via ``_db_write_res``.

    :param url: URL to scrape.
    :param return_data: when true, the parsed results are serialised to
        JSON, Base64-encoded and included in the return value.
    :returns: dict with ``url`` and ``group_name``; additionally
        ``domain`` and ``b64_json`` when ``return_data`` is true.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; in particular the ``_db_write_res`` call is assumed to sit
    inside the "html is not empty" branch (otherwise ``tot_res_est`` could
    be unbound) -- confirm against the original file.
    """
    current_task = self.current_task
    logger = self.logger
    url = url.encode('utf-8')

    # Report the task as started, tagged with its url and group.
    current_task.update_state(state=u'STARTED', meta={'url': url, 'group': self.group_name})
    logger.info('TASK EXECUTING: %r, args: %r kwargs: %r' % (
        self.current_task.request.id, self.current_task.request.args, self.current_task.request.kwargs))

    # Avoid doing anything if url is empty
    # (presumably _wrong_param_exception raises -- verify).
    if len(url) == 0:
        self._wrong_param_exception(url)

    # Parse url
    u = urlparse.urlparse(url)

    # Add scheme if missing, then re-parse so netloc/query are populated.
    if u.scheme == '':
        url = 'http://' + url
        u = urlparse.urlparse(url)

    # Get netloc
    netloc = u.netloc

    # Get parsed query; "start" defaults to 0.
    # NOTE(review): int() raises ValueError for a non-numeric "start",
    # unlike the tolerant conversion in the loop below.
    qs = urlparse.parse_qs(u.query)
    start = (qs['start'] if 'start' in qs else ['0'])[0]
    start = int(start)

    # Convert qs elements from lists to plain strings (values joined by a space).
    for k in qs.keys():
        el = qs[k]
        el = ' '.join(el)

        # Try to convert "num" / "start" into integers; values that are
        # not integers are deliberately left as strings.
        new_k = k.lower()
        if new_k == 'num' or new_k == 'start':
            el = el.replace(' ', '')
            try:
                el = int(el)
            except Exception:
                pass

        qs[k] = el

    # Add default values for num and start if there are none
    if not 'num' in qs:
        qs['num'] = 10
    if not 'start' in qs:
        qs['start'] = 0

    # Get domain name of the query
    psl = PublicSuffixList()
    query_domain = psl.get_public_suffix(netloc)

    # Check if it is google; any other domain leaves scraped_docs empty.
    parts = query_domain.split(u'.')
    scraped_docs = ''
    if len(parts) > 0 and parts[0].upper() == u'GOOGLE':
        current_task.update_state(state=u'CONNECTING', meta={'url': url, 'group': self.group_name})

        # Create request
        req = {
            'url': urllib.quote_plus(url, "%/:=&?~#+!$,;'@()*[]"),
            'referer': u'http://google.com',
            'useragent': u'Webscraper/' + CustomAppSettings.get_version() + ' (+http://www.yriver.it/webscraper/)',  # previously: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.47 Safari/537.36'
            'region': u'gb',
            'priority': u'1'
        }

        # Define which scrape key to take: plans flagged "is_special" use their own key.
        scrape_key = get_scrape_key_special if self.plan['is_special'] else get_scrape_key

        # Make query
        query = {'key': scrape_key(), 'request': req}
        p = ProxyConnection(self.user, self.plan)
        html = p.send_request(query)

        # Parse html; an empty response or the literal '0' is skipped.
        scraped_docs = ''
        if len(html) > 0 and html != '0':
            scraped_docs, tot_res_est = self._parse_html(html)

            # Write into db
            self._db_write_res(task_id=current_task.request.id, url=url, group_name=self.group_name, results=scraped_docs, tot_res=tot_res_est, start=start, query=qs, domain=query_domain)

    # Convert to Base64 if return_data = True
    if return_data:
        encoded_result = base64.standard_b64encode(json.dumps(scraped_docs))
        return {'url': url, 'group_name': self.group_name, 'domain': query_domain, 'b64_json': encoded_result}
    else:
        return {'url': url, 'group_name': self.group_name}
def __init__(self, user, plan):
    """Prepare a connection bound to one user's database and plan.

    :param user: mapping with at least a ``db`` key (database name).
    :param plan: plan object kept on the instance as ``self.plan``.
    """
    # Default timeout, in seconds, for sockets created from now on.
    socket.setdefaulttimeout(30)

    client = CustomAppSettings.get_mongo()
    self._db = client[user['db']]
    self.plan = plan
    self.user = user
def query(self, tld):
    """Return the Vissbl user's ranks for ``tld``.

    :param tld: value passed through to ``get_ranks_for_tld``.
    :returns: whatever ``get_ranks_for_tld(tld)`` returns.
    """
    vissbl_user = self.get_user(CustomAppSettings.get_vissbl_user_id())
    stats = User(vissbl_user).get_user_stats_object()
    return stats.get_ranks_for_tld(tld)
def get_mongo(self):
    """Return the Mongo client kept in ``self.cl``, creating it if unset.

    A falsy ``self.cl`` is replaced by ``CustomAppSettings.get_mongo()``
    before being returned.
    """
    if self.cl:
        return self.cl

    self.cl = CustomAppSettings.get_mongo()
    return self.cl
def create_user(alias='', plan_name='basic', email='', facebook_user=None, *args, **kwargs):
    """Create a new user profile and prepare its database indexes.

    :param alias: display name; replaced by the Facebook name when
        ``facebook_user`` is given.
    :param plan_name: name of the plan document to attach.
    :param email: contact email; validated and checked for uniqueness
        among ``role == 'user'`` profiles when no Facebook user is given.
    :param facebook_user: optional mapping with ``name``, ``email`` and
        ``id``; takes priority over ``alias`` / ``email``.
    :returns: ``{'profile_id': <inserted id>}`` on success, otherwise
        ``{'error': <message>}``.

    NOTE(review): the if/else nesting was reconstructed from a flattened
    source; 'Invalid email' is assumed to belong to the failed
    validation branch -- confirm against the original file.
    """
    db = User({}).get_user_profiles_db()
    fb_id = ''

    if facebook_user:
        # Facebook data has the highest priority.
        alias = facebook_user.get('name', '').strip()
        email = facebook_user.get('email', '').strip()
        fb_id = facebook_user.get('id', '').strip()
    elif email:
        email = email.strip()
        if not User({})._validate_email(email):
            return {'error': 'Invalid email'}
        # Make sure the email is not taken by another user.
        if db.users.find_one({'contacts.email': email.lower(), 'role': 'user'}):
            return {'error': 'Email already occupied'}

    # ---- Green light for creating new user
    plan = db.plans.find_one({'name': plan_name})
    if plan is None:
        # Fail before a db index is consumed for a user that can't be created.
        return {'error': 'Unknown plan'}

    # Reserve the next per-user database index.
    db_ind = db.counters.find_and_modify(
        query={'_id': 'dbindex'},
        update={'$inc': {'seq': 1}},
        upsert=True,
        new=True
    )['seq']
    db_ind = int(db_ind)

    secret_key = CustomAppSettings.get_new_secret_key()
    d = datetime.datetime.utcnow()

    doc = {
        'alias': alias.strip(),
        'secret_key': secret_key.lower(),
        'db_ind': db_ind,
        'role': 'user',
        'is_active': 0,
        'created': d,
        'actdeact': d,
        'last_access': None,
        'plan': {'id': plan['_id'], 'date': d},
        'plans_history': [],
        'contacts': {'email': email.lower(), 'cell': ''},
        'points': 100,
        'facebook_id': fb_id
    }

    userid = db.users.insert(doc)

    # Create indexes on the new user's own database.
    # NOTE(review): the second positional argument of ensure_index may be
    # ``cache_for`` rather than a sort direction -- confirm in the driver docs.
    cl = User({}).get_mongo()
    db = cl['webscr_' + str(db_ind)]
    db.urls.ensure_index('date_scraped', 1)
    db.task_meta.ensure_index('date_done', 1)

    return {'profile_id': userid}