def odesk_rss(run_at, query):
    fmt = '%Y/%m/%d %T'  # renamed from 'format' to avoid shadowing the builtin
    sheets = Sheet.select()
    for sheet in sheets:
        if sheet.name.lower()[0:9] != 'odesk rss':
            continue
        if sheet.run_at == 'never':
            continue
        elif sheet.run_at == run_at:
            processed_sheet = []
            buf = StringIO(sheet.asheet)
            sheet_dict = json.load(buf)
            sheet_list = sheet_dict['data']
            sheet_list = clear_sheet(sheet_list)
            # defined before the try so the except clause can't hit a NameError
            # when the feed fails before the first row is built:
            new_row = []
            try:
                feeder = feedparser.parse('https://www.odesk.com/jobs/rss?q=' + query)
                for posted_job in feeder.entries:
                    new_row = []
                    pubdate = parse_date(str(posted_job.published)).strftime(fmt)
                    new_row.append(pubdate)
                    # str() raises UnicodeEncodeError on non-ASCII titles
                    # (e.g. u'\u0421'), so encode with 'ignore' instead:
                    new_row.append(posted_job.title.encode('ascii', 'ignore'))
                    new_row.append(posted_job.link.encode('ascii', 'ignore'))
                    processed_sheet.append(new_row)
            except Exception as e:
                error = "Error: running function: odesk_rss ... exception: %s" % e
                new_row.append(str(error))
                processed_sheet.append(new_row)
            for ps in processed_sheet:
                sheet_list.append(ps)
            # store in db with the same format as handsontable,
            # i.e. JSON.stringify(hot.getData()):
            new_sheet_str = "{'data': " + str(sheet_list).replace('None', '').replace('null', '') + "}"
            new_sheet_str = new_sheet_str.replace("'", '"')
            sheet.asheet = new_sheet_str
            sheet.updated_at = datetime.utcnow()
            print("updated=%s name=%s" % (sheet.updated_at, sheet.name))
            ssresp = sheet.save()
            print("sheet.save() response:")
            print(ssresp)
            print("%s Completed '%s' autorun of oDesk RSS job: "
                  "query: '%s' processed sheet '%s'"
                  % (datetime.utcnow(), run_at, query, sheet.name))
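# A minimal sketch (not from the original module) of the serialisation step
# above: json.dumps produces the same handsontable-style payload
# ('{"data": [[...]]}') without the fragile quote/None string replacement,
# assuming no cell values rely on the 'None'/'null' -> '' normalisation.
# The sample values are made up.
def _example_serialise_sheet():
    import json
    sheet_list = [['2015/01/01 12:00:00',
                   'Rails developer needed',
                   'https://www.odesk.com/jobs/rss?q=rails']]
    return json.dumps({'data': sheet_list})  # -> '{"data": [[...]]}'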
def trending(run_at):
    # a sheet template for trending must be like this (see the example
    # sketch after this function):
    #   row 1 is the headers
    #   row 2:
    #     col 1+ are the function params (url/user) - may be multiples
    #     trailing cols are the ?func(x2)'s to be performed
    # after each run, rows 3+ hold the results:
    #   row 3:
    #     col 1 is the timestamp
    #     col 2+ are the result of each ?func call
    # note: without some fixed layout this would be difficult to perform;
    # also, since it's basically a report, it would be ugly/unreadable
    # without a layout
    sheets = Sheet.select()
    for sheet in sheets:
        if sheet.name.lower()[0:9] == 'odesk rss':
            continue
        if sheet.run_at == 'never':
            continue
        elif sheet.run_at == run_at:
            processed_sheet = []
            buf = StringIO(sheet.asheet)
            sheet_dict = json.load(buf)
            sheet_list = sheet_dict['data']
            sheet_list = remove_empty_rows(sheet_list)
            for x, row in enumerate(sheet_list):
                # ignore the first row (x == 0), process only the
                # second row (x == 1), ignore all other rows/cols
                if (x == 0) or all_nones(row):
                    continue
                elif x == 1:  # only process row 2
                    processed_sheet.append(process_row(row, sheet_list))
                else:
                    continue
            for ps in processed_sheet:
                sheet_list.append(ps)
            # store in db with the same format as handsontable,
            # i.e. JSON.stringify(hot.getData()):
            new_sheet_str = "{'data': " + str(sheet_list).replace('None', '').replace('null', '') + "}"
            new_sheet_str = new_sheet_str.replace("'", '"')
            sheet.asheet = new_sheet_str
            sheet.updated_at = datetime.utcnow()
            sheet.save()
            print("%s Completed autorun trending job: processed sheet '%s'"
                  % (datetime.utcnow(), sheet.name))
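# An illustrative 'trending' sheet template (made-up values) matching the
# layout described above: row 1 headers, row 2 params plus '?func(cell)'
# calls, rows 3+ appended by each run with a timestamp in col 1.
# '?tweets' appears in process_row's comments; '?plusones' is a
# hypothetical function name used here only for the example.
EXAMPLE_TRENDING_SHEET = {
    'data': [
        ['when', 'url/user', 'tweets', 'plusones'],                    # row 1: headers
        [None, 'http://example.com', '?tweets(b2)', '?plusones(b2)'],  # row 2: params + funcs
        ['2015/01/01 18:00:00', 'null', '42', '7'],                    # row 3+: one row per run
    ]
}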
def __init__(self, title, content, owner, date=None):
    self.title = title
    self.content = content
    self.owner = owner
    if date is None:
        date = datetime.utcnow()
    self.date = date
def scrape(video_counter):
    guser = werkzeug_cache.get('guser')
    gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
    gs.login()
    ss = gs.gclient.open_by_url(video_counter.gspread_link)
    ws = ss.sheet1
    urls = gs.col_one(ws)
    results = []
    try:
        pool = Pool(flask_app.config['MULTIPROCESSING_POOL_SIZE'])
        results = pool.map(Channel.get_video_data, urls)
        now_timestamp = datetime.utcnow()
        nrow = 2
        for i in range(len(results)):
            # gspread update cells in row:
            acells = ws.range("B%s:E%s" % (nrow, nrow))
            acells[0].value = results[i]['title']
            acells[1].value = results[i]['views']
            acells[2].value = results[i]['likes']
            acells[3].value = results[i]['dislikes']
            ws.update_cells(acells)
            c = Channel.create(name=video_counter.name,
                               channel='',
                               url=results[i]['url'],
                               title=results[i]['title'],
                               views=results[i]['views'],
                               likes=results[i]['likes'],
                               dislikes=results[i]['dislikes'],
                               timestamp=now_timestamp)
            nrow += 1
    except Exception as e:
        print("Error: Channel:channel_scrape:\n%s" % e)
    return len(results)
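# Note (not in the original): the Pool above is never closed, so its worker
# processes are left running after the map completes. A tidier pattern,
# assuming the same config key and worker function, is:
#
#   pool = Pool(flask_app.config['MULTIPROCESSING_POOL_SIZE'])
#   try:
#       results = pool.map(Channel.get_video_data, urls)
#   finally:
#       pool.close()
#       pool.join()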
def crawlers_create():
    form = CrawlersForm(request.form)
    form.validate()
    new_crawler = True
    if form.errors:
        pass
    else:
        now = datetime.utcnow()
        crawler = Crawler.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            gspread_link=request.form.get('gspread_link'),
            # url=request.form.get('url'),
            url=None,
            crawl_status=None,
            crawled_at=None,
            created_at=now,
            updated_at=now)
        new_crawler = False
        form = CrawlersForm(None, crawler)
        flash('Crawler was created')
        return redirect(url_for('crawlers.crawlers_list'))
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=new_crawler)
def functions_create():
    form = UserFunctionForm(request.form)
    form.validate()
    # params = request.form.get('params')
    # params_stripped = "\n".join([ll.strip() for ll in params.splitlines() if ll.strip()])
    new_function = True
    if form.errors:
        pass
    else:
        now = datetime.utcnow()
        function = UserFunction.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            function=request.form.get('function'),
            gspread_link=request.form.get('gspread_link'),
            # params=params_stripped,
            params=None,
            created_at=now,
            updated_at=now)
        new_function = False
        form = UserFunctionForm(None, function)
        flash('User function was created')
        return redirect(url_for('functions.functions_list'))
    ufs = get_user_functions()
    return render_template('function.html',
                           current_user=current_user,
                           ufs=ufs,
                           form=form,
                           new_function=new_function)
def crawlers_update():
    # FIXME ugly code to do both edit/update
    if request.method == 'GET':
        id = request.args.get('id', '')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(None, crawler)
    else:
        id = request.form.get('id')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(request.form)
        form.validate()
        if form.errors:
            pass
        else:
            now = datetime.utcnow()
            crawler.name = request.form.get('name')
            crawler.runnable = request.form.get('runnable')
            crawler.gspread_link = request.form.get('gspread_link')
            crawler.url = None
            crawler.updated_at = now
            crawler.save()
            new_crawler = False
            form = CrawlersForm(None, crawler)
            flash('Crawler was updated')
            return redirect(url_for('crawlers.crawlers_list'))
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=False,
                           id=id)
def crawlers():
    crawlers = Crawler.select()
    for crawler in crawlers:
        if crawler.is_runnable():
            # delete the crawler's pages before crawling it again:
            dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
            deleted_count = dq.execute()
            pages = CrawlerPage.crawl(crawler)
    print("%s job: crawlers processed" % datetime.utcnow())
def perform_function(func):
    now_timestamp = datetime.utcnow()
    results = {}
    name = func.name
    function = func.function.split('(')
    guser = werkzeug_cache.get('guser')
    gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
    gs.login()
    ss = gs.gclient.open_by_url(func.gspread_link)
    ws = ss.sheet1
    params = gs.col_one(ws)
    run_count = 1
    for fp in params:
        result = {}
        args = fp.split(', ')
        result['function'] = function[0]
        result['params'] = args
        try:
            func_result = getattr(UserDefinedFunctions, function[0])(*args)
            result['result'] = func_result
        except Exception as e:
            aresult = "Exception: %s" % e
            result['result'] = [{'error': aresult}]
            print("Error: running user function: %s\n\tException: %s"
                  % (function[0], e))
        results[run_count] = result
        run_count += 1
    nrow = 2
    for key, func_result in results.items():
        try:
            is_error = 'error' in func_result['result'][0]
            # gspread update cells in row:
            acells = ws.range("B%s:F%s" % (nrow, nrow))
            acell = 0
            for fr in func_result['result']:
                if is_error:
                    acells[acell].value = fr.get('error', '')
                else:
                    acells[acell].value = fr.get('data', '')
                acell += 1
            ws.update_cells(acells)
            func_res = FunctionResult.create(
                tag='e' if is_error else 'k',
                name=name,
                function=func_result['function'],
                params=','.join(func_result['params']),
                results=pickle.dumps(func_result['result']),
                timestamp=now_timestamp)
            nrow += 1
        except Exception as e:
            print("Error: perform_function:\n%s" % e)
    return results
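# Shape of the dict returned by perform_function (illustrative; the function
# name and values are hypothetical, but the 'data'/'error' keys are the ones
# the code above reads):
#
#   {1: {'function': 'whois', 'params': ['example.com'],
#        'result': [{'data': '...'}]},
#    2: {'function': 'whois', 'params': ['bad input'],
#        'result': [{'error': 'Exception: ...'}]}}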
def process_row(row, sheet_list):
    # overview: copy the processed row to a new row by:
    #   - keeping empty/None cells
    #   - processing cells beginning with '?'
    #   - otherwise keeping the cell 'as is'
    fmt = '%Y/%m/%d %T'  # renamed from 'format' to avoid shadowing the builtin
    new_row = []
    if type(row) is not list:
        return new_row
    for c, cell in enumerate(row):
        if c == 0:
            # timestamp this row
            now = str(parse_date(str(datetime.utcnow())).strftime(fmt))
            new_row.append(str(now))
            continue
        if cell is None:
            new_row.append('null')
            continue
        if (len(cell) > 0) and (cell[0] == '?'):
            # only process cells with '?' as the 1st char, e.g. ?tweets(a2);
            # the reference may name any column but must point at row 2.
            # remove '?', ',', '(', ')' and leading/trailing whitespace:
            func_params = re.sub(r'[?,()]', ' ', cell).strip()
            # split by space, so parts[0]=func, parts[1]=cell reference:
            parts = func_params.split(' ')
            afunc = parts[0]
            cell_ref = parts[1]  # ignore any other params
            cell_ref_col_letter = cell_ref[0]  # can be any col A-Za-z (0-25)
            cell_ref_col_num = ord(cell_ref_col_letter) % 32 - 1
            cell_ref_row = cell_ref[1:]  # must ref row 2 only, but any col
            if cell_ref_row == '2':
                param = sheet_list[int(cell_ref_row) - 1][int(cell_ref_col_num)]
                try:
                    func_result = getattr(ServerSides, afunc)(param)
                    result = func_result
                except Exception as e:
                    result = "Error: running function: %s ... exception: %s" % (cell, e)
                new_row.append(str(result))
            else:
                new_row.append(str("INVALID row: must be 2, but was (%s) in %s"
                                   % (cell_ref, cell)))
        else:
            # use 'null' instead of any existing cell value:
            new_row.append('null')
    return new_row
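# A worked example of the cell-reference parsing above, using the
# hypothetical cell value '?tweets(b2)' from the comments:
def _example_cell_ref():
    import re
    cell = '?tweets(b2)'
    func_params = re.sub(r'[?,()]', ' ', cell).strip()  # -> 'tweets b2'
    parts = func_params.split(' ')
    afunc, cell_ref = parts[0], parts[1]  # -> 'tweets', 'b2'
    col = ord(cell_ref[0]) % 32 - 1       # 'b' -> 1 (0-based column)
    row = int(cell_ref[1:]) - 1           # '2' -> 1 (0-based row)
    return afunc, row, col                # ('tweets', 1, 1)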
def async_spider(app, crawler_id):
    now = datetime.utcnow()
    print("%s async_spider started..." % datetime.utcnow())
    print("\tPID=%s" % os.getpid())
    print("\tcrawler_id=%s" % crawler_id)
    crawler = Crawler.get(Crawler.id == crawler_id)
    # call is_runnable() as elsewhere; the bare method attribute is always truthy
    if crawler.is_runnable():
        # delete crawled pages before crawling it again:
        dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
        deleted_count = dq.execute()
        crawler.crawled_at = now
        crawler.crawl_status = 'crawling'
        crawler.save()
        pages_len = 0
        try:
            pages_len = CrawlerPage.crawl(crawler)
        finally:
            crawler.crawl_status = "crawled %s pages" % pages_len
            crawler.save()
        print("\tnumber of pages crawled=%s" % pages_len)
    print("%s async_spider ended" % datetime.utcnow())
def moz_url_metrics(mozscape):
    results = []
    guser = werkzeug_cache.get('guser')
    gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
    gs.login()
    ss = gs.gclient.open_by_url(mozscape.gspread_link)
    ws = ss.sheet1
    urls = gs.col_one(ws)
    # FIXME only use the first url at A2, for now
    url = urls[0]
    l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
              flask_app.config['MOZSCAPE_API_SECRET_KEY'])
    try:
        # the Mozscape restriction is NOT to make parallel requests,
        # but to batch them instead!
        now_timestamp = datetime.utcnow()
        nrow = 2
        metrics = l.urlMetrics(url)
        # gspread update cells in row:
        acells = ws.range("B%s:L%s" % (nrow, nrow))
        acells[0].value = metrics['uid']
        acells[1].value = metrics['uu']
        acells[2].value = metrics['ut']
        acells[3].value = metrics['us']
        acells[4].value = metrics['upa']
        acells[5].value = metrics['ueid']
        acells[6].value = metrics['umrp']
        acells[7].value = metrics['umrr']
        acells[8].value = metrics['fmrp']
        acells[9].value = metrics['fmrr']
        acells[10].value = metrics['pda']
        ws.update_cells(acells)
        mr = MozscapeResult.create(name=mozscape.name,
                                   url=url,
                                   uid=metrics['uid'],
                                   uu=metrics['uu'],
                                   ut=metrics['ut'],
                                   us=metrics['us'],
                                   upa=metrics['upa'],
                                   ueid=metrics['ueid'],
                                   umrp=metrics['umrp'],
                                   umrr=metrics['umrr'],
                                   fmrp=metrics['fmrp'],
                                   fmrr=metrics['fmrr'],
                                   pda=metrics['pda'],
                                   timestamp=now_timestamp)
    except Exception as e:
        print("Error: moz_url_metrics:\n%s" % e)
    return len(results)
def jobs():
    """
    Run background job scheduler.

    This is just a simple scheduler with no persistence, run as a
    separate process from the flask app.
    """
    scheduler = BlockingScheduler()
    # *************
    # schedule jobs:
    # *************
    scheduler.add_job(social_counters, 'cron', day_of_week='*', hour=2)   # 2am
    scheduler.add_job(youtube_channels, 'cron', day_of_week='*', hour=3)  # 3am
    scheduler.add_job(crawlers, 'cron', day_of_week='*', hour=4)          # 4am
    # *************
    print('Job scheduler started at: %s' % datetime.utcnow())
    scheduler.start()
def get_ports_info(hostname):
    print(request.form)
    if request.form['token'] == conf_obj.secret_token:
        ports_info = Connectlist.query.filter_by(hostname=hostname).first()
        if ports_info is None:
            # unknown host: register it, then return the fresh record
            add_new_host(hostname)
            ports_info = Connectlist.query.filter_by(hostname=hostname).first()
            return connect_list.jsonify(ports_info)
        if request.form['reconnect_status'] == 'True':
            kill_socket(hostname)
            return connect_list.jsonify(ports_info)
        ports_info.last_connect_time = str(datetime.utcnow())
        db.session.commit()
        return connect_list.jsonify(ports_info)
    else:
        return exit(-1)
def moz_index_metadata():
    mim = None
    l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
              flask_app.config['MOZSCAPE_API_SECRET_KEY'])
    try:
        now_timestamp = datetime.utcnow()
        try:
            mim = MozscapeIndexMetadata.get(MozscapeIndexMetadata.id == 1)
        except Exception as e:
            mim = MozscapeIndexMetadata()
            mim.timestamp = None
            print("Error: moz_index_metadata: "
                  "MozscapeIndexMetadata.get(MozscapeIndexMetadata.id==1)\n%s" % e)
        # do we need to update the db, or can we just return mim:
        if mim.timestamp is None or now_timestamp >= mim.next_update:
            metrics = l.index_metadata()
            mim.index_name = metrics['index_name']
            mim.crawl_duration = metrics['crawl_duration']
            mim.external_links_per_page = metrics['external_links_per_page']
            mim.links_per_page = metrics['links_per_page']
            mim.links = metrics['links']
            mim.plds = metrics['plds']
            mim.fqdns = metrics['fqdns']
            mim.nofollow = metrics['nofollow']
            mim.urls = metrics['urls']
            if str(metrics['locked']) == 'false':
                mim.locked = False
            else:
                mim.locked = True
            mim.rel_canonical = metrics['rel_canonical']
            mim.last_update = datetime.fromtimestamp(metrics['last_update'])
            mim.next_update = datetime.fromtimestamp(metrics['next_update'])
            mim.timestamp = now_timestamp
            mim.save()  # create or update
    except Exception as e:
        print("Error: moz_index_metadata:\n%s" % e)
    return mim
def jobs():
    """
    Run background job scheduler.

    This is just a simple scheduler with no job persistence (no Redis),
    run as a separate process from the flask app.
    """
    scheduler = BlockingScheduler()
    # *************
    # schedule jobs:
    # *************
    # see: http://apscheduler.readthedocs.org/en/latest/modules/triggers/cron.html
    # some alternative triggers:
    # scheduler.add_job(trending, 'cron', day_of_week='*', hour=12)     # every day at 12pm
    # scheduler.add_job(trending, 'cron', week='*', day_of_week='sun')  # weekly on Sunday
    # scheduler.add_job(trending, 'cron', hour='*/6', args=['daily'])   # every 6 hours
    # scheduler.add_job(trending, 'cron', minute='*/1', args=['daily'])
    # scheduler.add_job(trending, 'cron', second='*/5', args=['daily']) # every 5 seconds
    scheduler.add_job(trending, 'cron', hour='18', args=['daily'])  # every day at 6pm
    scheduler.add_job(odesk_rss, 'cron', hour='12',
                      args=['daily', 'rails'])  # every day at noon
    # *************
    print('Job scheduler started at: %s' % datetime.utcnow())
    scheduler.start()
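# Usage note (an assumption about deployment, not from the original): the
# scheduler is meant to run as its own process, e.g. `python jobs.py`;
# BlockingScheduler.start() blocks until the process is interrupted
# (Ctrl-C), so it must not be called from inside the flask app itself.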
def fetch_counters(social_counter):
    guser = werkzeug_cache.get('guser')
    gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
    gs.login()
    ss = gs.gclient.open_by_url(social_counter.gspread_link)
    ws = ss.sheet1
    urls = gs.col_one(ws)
    results = []
    try:
        pool = Pool(flask_app.config['MULTIPROCESSING_POOL_SIZE'])
        results = pool.map(SocialCount.get_url_data, urls)
        now_timestamp = datetime.utcnow()
        nrow = 2
        for i in range(len(results)):
            # gspread update cells in row:
            acells = ws.range("B%s:H%s" % (nrow, nrow))
            acells[0].value = results[i]['tweets']
            acells[1].value = results[i]['plusses']
            acells[2].value = results[i]['total_count']
            acells[3].value = results[i]['share_count']
            acells[4].value = results[i]['like_count']
            acells[5].value = results[i]['comment_count']
            acells[6].value = results[i]['click_count']
            ws.update_cells(acells)
            c = SocialCount.create(name=social_counter.name,
                                   url=results[i]['url'],
                                   tweets=results[i]['tweets'],
                                   google_plusses=results[i]['plusses'],
                                   fb_total=results[i]['total_count'],
                                   fb_shares=results[i]['share_count'],
                                   fb_likes=results[i]['like_count'],
                                   fb_comments=results[i]['comment_count'],
                                   fb_clicks=results[i]['click_count'],
                                   timestamp=now_timestamp)
            nrow += 1
    except Exception as e:
        print("Error: fetch_counters:\n%s" % e)
    return len(results)
def channels_create():
    form = YoutubeChannelForm(request.form)
    form.validate()
    new_counter = True
    if form.errors:
        pass
    else:
        now = datetime.utcnow()
        counter = ChannelCounter.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            gspread_link=request.form.get('gspread_link'),
            channel=None,
            created_at=now,
            updated_at=now)
        new_counter = False
        form = YoutubeChannelForm(None, counter)
        flash('Video counter was created')
        return redirect(url_for('channels.channels_list'))
    return render_template('channel.html',
                           current_user=current_user,
                           form=form,
                           new_counter=new_counter)
def functions_update():
    # FIXME ugly code to do both edit/update
    if request.method == 'GET':
        id = request.args.get('id', '')
        func = UserFunction.get(UserFunction.id == id)
        form = UserFunctionForm(None, func)
    else:
        id = request.form.get('id')
        func = UserFunction.get(UserFunction.id == id)
        form = UserFunctionForm(request.form)
        form.validate()
        # params = request.form.get('params')
        # params_stripped = "\n".join([line.strip() for line in params.splitlines() if line.strip()])
        if form.errors:
            pass
        else:
            now = datetime.utcnow()
            func.name = request.form.get('name')
            func.runnable = request.form.get('runnable')
            func.function = request.form.get('function')
            func.gspread_link = request.form.get('gspread_link')
            # func.params = params_stripped
            func.params = None
            func.updated_at = now
            func.save()
            new_function = False
            form = UserFunctionForm(None, func)
            flash('User function was updated')
            return redirect(url_for('functions.functions_list'))
    ufs = get_user_functions()
    return render_template('function.html',
                           current_user=current_user,
                           ufs=ufs,
                           form=form,
                           new_function=False,
                           id=id)
def time_left(self):
    return self.time_stop - datetime.utcnow()
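# Example (illustrative values): with self.time_stop = datetime(2015, 1, 2)
# and utcnow() returning 2015-01-01 00:00:00, time_left() yields
# timedelta(days=1); the result goes negative once time_stop has passed.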
def __init__(self, username, password, email=None):
    self.username = username
    self.password = password
    self.mail = email
    self.creation_date = datetime.utcnow().replace(microsecond=0)
def youtube_channels():
    counters = ChannelCounter.select()
    for c in counters:
        if c.is_runnable():
            processed = Channel.scrape(c.name, c.urls)
    print("%s job: youtube channels processed" % datetime.utcnow())
def social_counters():
    social_counters = SocialCounter.select()
    for sc in social_counters:
        if sc.is_runnable():
            processed = SocialCount.fetch_counters(sc.name, sc.urls)
    print("%s job: social counters processed" % datetime.utcnow())
def crawl(crawler):
    guser = werkzeug_cache.get('guser')
    gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
    gs.login()
    ss = gs.gclient.open_by_url(crawler.gspread_link)
    ws = ss.sheet1
    urls = gs.col_one(ws)
    # only use the first url at A2, as it's probably best to use a
    # separate spreadsheet for each base url/site:
    url = urls[0]
    # url = 'http://104.236.92.144:8888/'  # live demo restriction
    site = Page(url)
    pages = []
    try:
        # give all pages crawled the same timestamp:
        now_timestamp = datetime.utcnow()
        # start with row 2 col B for output:
        nrow = 2
        # crawl_and_analyze is a generator that yields a Page object:
        for cp in site.crawl_and_analyze():
            pages.append(cp)
            # gspread update cells in row:
            acells = ws.range("B%s:Q%s" % (nrow, nrow))
            acells[0].value = cp.url
            acells[1].value = cp.title
            acells[2].value = cp.description
            acells[3].value = cp.keywords
            acells[4].value = ''
            for link in cp.links:
                acells[4].value += "%s\n" % link
            acells[5].value = "\n".join(cp.warnings)
            words = ""
            for word_count_tuple in cp.wordcount:
                words += "%s=%s\n" % (word_count_tuple[0], word_count_tuple[1])
            acells[6].value = words
            try:
                acells[7].value = ''
                suggested_keywords = CrawlerPage.suggestions(
                    "%s %s" % (cp.wordcount[0][0], cp.wordcount[1][0]))
                for suggested in suggested_keywords:
                    for k, v in suggested.iteritems():
                        acells[7].value += "%s\n" % v
            except:
                acells[7].value = ''
            acells[7].value = acells[7].value.rstrip()
            try:
                acells[8].value = ''
                suggested_keywords = CrawlerPage.suggestions(
                    "%s %s %s" % (cp.wordcount[0][0], cp.wordcount[1][0],
                                  cp.wordcount[2][0]))
                for suggested in suggested_keywords:
                    for k, v in suggested.iteritems():
                        acells[8].value += "%s\n" % v
            except:
                acells[8].value = ''
            acells[9].value = cp.twitter_count
            acells[10].value = cp.googleplusones
            acells[11].value = cp.fb_total_count
            acells[12].value = cp.fb_share_count
            acells[13].value = cp.fb_like_count
            acells[14].value = cp.fb_comment_count
            acells[15].value = cp.fb_click_count
            ws.update_cells(acells)
            crawler_page = CrawlerPage.create(
                name=crawler.name,
                url=cp.url,
                sitemap=None,
                title_tag=cp.title,
                meta_description=cp.description,
                meta_keywords=cp.keywords,
                warnings=','.join(cp.warnings),
                h1_tag=None,
                a_tags=cp.links,
                img_tags=None,
                plain_text=cp.page_text,
                word_freqs=pickle.dumps(cp.wordcount),
                tweets=cp.twitter_count,
                google_plusses=cp.googleplusones,
                fb_total=cp.fb_total_count,
                fb_shares=cp.fb_share_count,
                fb_likes=cp.fb_like_count,
                fb_comments=cp.fb_comment_count,
                fb_clicks=cp.fb_click_count,
                timestamp=now_timestamp)
            nrow += 1
    except Exception as e:
        print("Error: CrawlerPage:crawl:\n%s" % e)
    return len(pages)