Code Example #1
File: jobs.py Project: cleesmith/cellipede
def odesk_rss(run_at, query):
    date_format = '%Y/%m/%d %T'  # renamed from "format" to avoid shadowing the builtin
    sheets = Sheet.select()
    for sheet in sheets:
        if sheet.name.lower()[0:9] != 'odesk rss':
            continue
        if sheet.run_at == 'never':
            continue
        elif sheet.run_at == run_at:
            processed_sheet = []
            buf = StringIO(sheet.asheet)
            sheet_dict = json.load(buf)
            sheet_list = sheet_dict['data']
            sheet_list = clear_sheet(sheet_list)
            try:
                feeder = feedparser.parse('https://www.odesk.com/jobs/rss?q=' +
                                          query)
                for posted_job in feeder.entries:
                    new_row = []
                    # pubdate = str(parse_date(str(posted_job.published)).strftime(format))
                    # new_row.append(str(pubdate))
                    # new_row.append(str(posted_job.title))
                    # new_row.append(str(posted_job.link))
                    pubdate = parse_date(str(posted_job.published)).strftime(date_format)
                    new_row.append(pubdate)
                    new_row.append(posted_job.title.encode('ascii', 'ignore'))
                    new_row.append(posted_job.link.encode('ascii', 'ignore'))
                    # now = str(parse_date(str(datetime.utcnow())).strftime(format))
                    # new_row.append(str(now))
                    processed_sheet.append(new_row)
            except Exception as e:
                # e.g. 'ascii' codec can't encode character u'\u0421' in position 0: ordinal not in range(128)
                # hence .encode('ascii', 'ignore') above, since str() raises on non-ASCII feed text
                error = "Error: running function: odesk_rss ... exception: %s" % e
                # new_row may be unbound if parsing failed before the first entry,
                # so record the error as its own row:
                processed_sheet.append([str(error)])
            for ps in processed_sheet:
                sheet_list.append(ps)
            # store in db with the same format as handsontable i.e. JSON.stringify(hot.getData()):
            new_sheet_str = "{'data': " + str(sheet_list).replace(
                'None', "").replace('null', "") + "}"
            new_sheet_str = new_sheet_str.replace("'", '"')
            sheet.asheet = new_sheet_str
            sheet.updated_at = datetime.utcnow()
            print("updated=%s name=%s" % (sheet.updated_at, sheet.name))
            ssresp = sheet.save()
            print("sheet.save() response:")
            print(ssresp)
        print(
            "%s Completed '%s' autorun of oDesk RSS job: query: '%s' processed sheet '%s'"
            % (datetime.utcnow(), run_at, query, sheet.name))
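The str().replace() round-trip above (also used by trending below) is fragile: it breaks on any cell whose text contains a quote or the words None/null. A minimal sketch of a safer way to build the same Handsontable-style payload, using a hypothetical serialize_sheet helper:

import json

def serialize_sheet(sheet_list):
    # blank out None so Handsontable shows empty cells, then let
    # json.dumps handle quoting and Unicode escaping correctly:
    cleaned = [['' if cell is None else cell for cell in row]
               for row in sheet_list]
    return json.dumps({'data': cleaned})

# e.g. serialize_sheet([['2015/06/01 12:00:00', 'A "quoted" title', None]])
# returns: {"data": [["2015/06/01 12:00:00", "A \"quoted\" title", ""]]}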
Code Example #2
File: jobs.py Project: cleesmith/cellipede
def trending(run_at):
    # a sheet template for trending must be like this:
    # row 1 is the headers
    # row 2:
    #   col 1+ are the function params (url/user) - may be multiples
    #   trailing col's are the ?func(x2)'s to be performed
    # after each run row 3+ will be the results:
    # row 3:
    #   col 1 is the timestamp
    #   col 2+ are the result of each ?func call
    # note: without some fixed layout this would be difficult to perform; also,
    #       since it's basically a report, it would be ugly/unreadable
    #       without a layout
    # print("%s run_at=%s"%(datetime.utcnow(),run_at))
    # return
    sheets = Sheet.select()
    for sheet in sheets:
        if sheet.name.lower()[0:9] == 'odesk rss':
            continue
        if sheet.run_at == 'never':
            continue
        elif sheet.run_at == run_at:
            processed_sheet = []
            buf = StringIO(sheet.asheet)
            sheet_dict = json.load(buf)
            sheet_list = sheet_dict['data']
            sheet_list = remove_empty_rows(sheet_list)
            for x, row in enumerate(sheet_list):
                # ignore the first row, x == 0
                # process the second row, x == 1
                # ignore all other rows/cols
                if (x == 0) or all_nones(row):
                    continue
                elif x == 1:  # only process row 2
                    processed_sheet.append(process_row(row, sheet_list))
                else:
                    continue
            for ps in processed_sheet:
                sheet_list.append(ps)
            # store in db with the same format as handsontable i.e. JSON.stringify(hot.getData()):
            new_sheet_str = "{'data': " + str(sheet_list).replace(
                'None', '').replace('null', '') + "}"
            new_sheet_str = new_sheet_str.replace("'", '"')
            sheet.asheet = new_sheet_str
            sheet.updated_at = datetime.utcnow()
            sheet.save()
        print("%s Completed autorun trending job: processed sheet '%s'" %
              (datetime.utcnow(), sheet.name))
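For reference, a hypothetical asheet payload matching the layout described in the comments above; the ?func names are made up, but the cell reference must point at row 2, as process_row below enforces:

sheet_data = {
    'data': [
        ['Site', 'Tweets', 'Likes'],                             # row 1: headers
        ['http://example.com', '?tweets(a2)', '?fb_likes(a2)'],  # row 2: params + ?func cells
        ['2015/06/01 18:00:00', '42', '7'],                      # rows 3+: one result row per run
    ]
}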
Code Example #3
File: models.py Project: creativeoj/blogz
 def __init__(self, title, content, owner, date=None):
     self.title = title
     self.content = content
     self.owner = owner
     if date is None:
         date = datetime.utcnow()
     self.date = date
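The date=None default here is deliberate: a default of datetime.utcnow() in the signature would be evaluated only once, when the def statement runs, so every post would share that frozen timestamp. A quick illustration of the pitfall:

from datetime import datetime

def bad(date=datetime.utcnow()):    # evaluated once, at definition time
    return date

def good(date=None):                # resolved on every call
    return datetime.utcnow() if date is None else date

# bad() returns the same instant forever; good() tracks the clock.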
Code Example #4
 def scrape(video_counter):
     guser = werkzeug_cache.get('guser')
     gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
     gs.login()
     ss = gs.gclient.open_by_url(video_counter.gspread_link)
     ws = ss.sheet1
     urls = gs.col_one(ws)
     results = []
     try:
         pool = Pool(flask_app.config['MULTIPROCESSING_POOL_SIZE'])
         # video_page_urls = Channel.get_video_page_urls(channel)
         # results = pool.map(Channel.get_video_data, video_page_urls)
         results = pool.map(Channel.get_video_data, urls)
         now_timestamp = datetime.utcnow()
         nrow = 2
         for i in range(len(results)):
             # gspread update cells in row:
             acells = ws.range("B%s:E%s" % (nrow, nrow))
             acells[0].value = results[i]['title']
             acells[1].value = results[i]['views']
             acells[2].value = results[i]['likes']
             acells[3].value = results[i]['dislikes']
             ws.update_cells(acells)
             c = Channel.create(name=video_counter.name,
                                channel='',
                                url=results[i]['url'],
                                title=results[i]['title'],
                                views=results[i]['views'],
                                likes=results[i]['likes'],
                                dislikes=results[i]['dislikes'],
                                timestamp=now_timestamp)
             nrow += 1
     except Exception as e:
         print("Error: Channel:channel_scrape:\n%s" % e)
     return len(results)
Code Example #5
def crawlers_create():
    form = CrawlersForm(request.form)
    form.validate()
    new_crawler = True
    if form.errors:
        pass
    else:
        now = datetime.utcnow()
        crawler = Crawler.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            gspread_link=request.form.get('gspread_link'),
            # url=request.form.get('url'),
            url=None,
            crawl_status=None,
            crawled_at=None,
            created_at=now,
            updated_at=now)
        new_crawler = False
        form = CrawlersForm(None, crawler)
        flash('Crawler was created')
        return redirect(url_for('crawlers.crawlers_list'))
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=new_crawler)
Code Example #6
def functions_create():
    form = UserFunctionForm(request.form)
    form.validate()
    # params = request.form.get('params')
    # params_stripped = "\n".join([ll.strip() for ll in params.splitlines() if ll.strip()])
    new_function = True
    if form.errors:
        pass
    else:
        now = datetime.utcnow()
        function = UserFunction.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            function=request.form.get('function'),
            gspread_link=request.form.get('gspread_link'),
            # params=params_stripped,
            params=None,
            created_at=now,
            updated_at=now)
        new_function = False
        form = UserFunctionForm(None, function)
        flash('User function was created')
        return redirect(url_for('functions.functions_list'))
    ufs = get_user_functions()
    return render_template('function.html',
                           current_user=current_user,
                           ufs=ufs,
                           form=form,
                           new_function=new_function)
Code Example #7
def crawlers_update():
    # FIXME ugly code to do both edit/update
    if request.method == 'GET':
        id = request.args.get('id', '')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(None, crawler)
    else:
        id = request.form.get('id')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(request.form)
        form.validate()
        if form.errors:
            pass
        else:
            now = datetime.utcnow()
            crawler.name = request.form.get('name')
            crawler.runnable = request.form.get('runnable')
            crawler.gspread_link = request.form.get('gspread_link')
            crawler.url = None
            crawler.updated_at = now
            crawler.save()
            new_crawler = False
            form = CrawlersForm(None, crawler)
            flash('Crawler was updated')
            return redirect(url_for('crawlers.crawlers_list'))
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=False,
                           id=id)
Code Example #8
def crawlers():
    crawlers = Crawler.select()
    for crawler in crawlers:
        if crawler.is_runnable():
            # delete the crawler's pages before crawling it again:
            dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
            deleted_count = dq.execute()
            pages = CrawlerPage.crawl(crawler)
    print("%s job: crawlers processed" % datetime.utcnow())
Code Example #9
 def perform_function(func):
   now_timestamp = datetime.utcnow()
   results = {}
   name = func.name
   function = func.function.split('(')
   guser = werkzeug_cache.get('guser')
   gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
   gs.login()
   ss = gs.gclient.open_by_url(func.gspread_link)
   ws = ss.sheet1
   params = gs.col_one(ws)
   run_count = 1
   for fp in params:
     result = {}
     args = fp.split(', ')
     result['function'] = function[0]
     result['params'] = args
     try:
       # test error handling:
       # if fp == 'italian greyhound':
       #   raise Exception(fp)
       func_result = getattr(UserDefinedFunctions, function[0])(*args)
       result['result'] = func_result
      except Exception as e:
        aresult = "Exception: %s" % e
        result['result'] = [{'error': aresult}]
        print("Error: running user function: %s\n\tException: %s" % (function[0], e))
     results[run_count] = result
     run_count += 1
   nrow = 2
   for key, func_result in results.items():
     try:
       is_error = 'error' in func_result['result'][0]
       # gspread update cells in row:
       acells = ws.range("B%s:F%s" % (nrow, nrow))
       acell = 0
       for fr in func_result['result']:
         if is_error:
           acells[acell].value = fr.get('error', '')
         else:
           acells[acell].value = fr.get('data', '')
         acell += 1
       ws.update_cells(acells)
       func_res = FunctionResult.create(
         tag='e' if is_error else 'k',
         name=name,
         function=func_result['function'],
         params=','.join(func_result['params']),
         results=pickle.dumps(func_result['result']),
         timestamp=now_timestamp
       )
       nrow += 1
     except Exception as e:
       print("Error: perform_function:\n%s" % e)
   return results
Code Example #10
File: jobs.py Project: cleesmith/cellipede
def process_row(row, sheet_list):
    # overview:
    # - copy processed row to a new row by:
    #   - keeping empty/None cells
    #   - processing cells beginning with '?'
    #   - otherwise keeping cell 'as is'
    # date_format = '%a %b %d, %Y %T'
    date_format = '%Y/%m/%d %T'  # renamed from "format" to avoid shadowing the builtin
    new_row = []
    if not isinstance(row, list):
        return new_row
    for c, cell in enumerate(row):
        if c == 0:
            # timestamp this row
            now = datetime.utcnow().strftime(date_format)
            new_row.append(now)
            continue
        if cell is None:
            new_row.append('null')
            continue
        if (len(cell) > 0) and (cell[0] == '?'):
            # only process cells with '?' as 1st char
            #   - which cell is being referenced ???
            #     e.g. ?tweets(a1) how to grab a1 ?
            #     - use code like chrToNum from sheet.html
            #     - ensure only row 2, really 1, is referenced
            # remove '?', '(', ')', ',' and leading/trailing whitespace:
            func_params = re.sub(r'[?,()]', ' ', cell).strip()
            # split by space, so parts[0]=func, parts[1]=cell reference:
            parts = func_params.split(' ')
            afunc = parts[0]
            cell_ref = parts[1]  # ignore any other params
            cell_ref_col_letter = cell_ref[0]  # can be any col A-Za-z (0-25)
            cell_ref_col_num = ord(cell_ref_col_letter) % 32 - 1
            cell_ref_row = cell_ref[1:]  # must reference row 2 only, but any col
            if cell_ref_row == '2':
                param = sheet_list[int(cell_ref_row) - 1][cell_ref_col_num]
                try:
                    func_result = getattr(ServerSides, afunc)(param)
                    result = func_result
                except Exception as e:
                    result = "Error: running function: %s ... exception: %s" % (
                        cell, e)
                new_row.append(str(result))
            else:
                new_row.append(
                    str("INVALID row: must be 2, but was (%s) in %s" %
                        (cell_ref, cell)))
        else:
            # new_row.append(str(cell))  # use 'null' instead of any existing cell value:
            new_row.append('null')
    return new_row
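The ord(letter) % 32 - 1 trick above converts a column letter to a zero-based index for either case, because ASCII 'A'..'Z' (65..90) and 'a'..'z' (97..122) are congruent mod 32. A quick check:

for letter in ('A', 'a', 'C', 'z'):
    print("%s -> %s" % (letter, ord(letter) % 32 - 1))  # A -> 0, a -> 0, C -> 2, z -> 25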
Code Example #11
def async_spider(app, crawler_id):
    now = datetime.utcnow()
    print("%s async_spider started..." % datetime.utcnow())
    print("\tPID=%s" % os.getpid())
    print("\tcrawler_id=%s" % crawler_id)
    crawler = Crawler.get(Crawler.id == crawler_id)
    if crawler.is_runnable():  # must call the method; the bare attribute is always truthy
        # delete crawled pages before crawling it again:
        dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
        deleted_count = dq.execute()
        crawler.crawled_at = now
        crawler.crawl_status = 'crawling'
        crawler.save()
        pages_len = 0
        try:
            pages_len = CrawlerPage.crawl(crawler)
        finally:
            crawler.crawl_status = "crawled %s pages" % pages_len
            crawler.save()
            print("\tnumber of pages crawled=%s" % pages_len)
    print("%s async_spider ended" % datetime.utcnow())
Code Example #12
 def moz_url_metrics(mozscape):
     results = []
     guser = werkzeug_cache.get('guser')
     gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
     gs.login()
     ss = gs.gclient.open_by_url(mozscape.gspread_link)
     ws = ss.sheet1
     urls = gs.col_one(ws)
     # FIXME only use the first url at A2, for now
     url = urls[0]
     l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
               flask_app.config['MOZSCAPE_API_SECRET_KEY'])
     try:
         # mozscape restriction is NOT to make parallel requests but batch them instead!!!
         now_timestamp = datetime.utcnow()
         nrow = 2
         metrics = l.urlMetrics(url)
         # gspread update cells in row:
         acells = ws.range("B%s:L%s" % (nrow, nrow))
         acells[0].value = metrics['uid']
         acells[1].value = metrics['uu']
         acells[2].value = metrics['ut']
         acells[3].value = metrics['us']
         acells[4].value = metrics['upa']
         acells[5].value = metrics['ueid']
         acells[6].value = metrics['umrp']
         acells[7].value = metrics['umrr']
         acells[8].value = metrics['fmrp']
         acells[9].value = metrics['fmrr']
         acells[10].value = metrics['pda']
         ws.update_cells(acells)
         mr = MozscapeResult.create(name=mozscape.name,
                                    url=url,
                                    uid=metrics['uid'],
                                    uu=metrics['uu'],
                                    ut=metrics['ut'],
                                    us=metrics['us'],
                                    upa=metrics['upa'],
                                    ueid=metrics['ueid'],
                                    umrp=metrics['umrp'],
                                    umrr=metrics['umrr'],
                                    fmrp=metrics['fmrp'],
                                    fmrr=metrics['fmrr'],
                                    pda=metrics['pda'],
                                    timestamp=now_timestamp)
         results.append(metrics)  # record success; otherwise the function always returns 0
     except Exception as e:
         print("Error: moz_url_metrics:\n%s" % e)
     return len(results)
Code Example #13
def jobs():
  """
    Run background job scheduler.
    This is just a simple scheduler with no persistence,
    and run as a separate process from the flask app.
  """
  scheduler = BlockingScheduler()

  # *************
  # schedule jobs:
  # *************
  scheduler.add_job(social_counters, 'cron', day_of_week='*', hour=2) # 2am
  scheduler.add_job(youtube_channels, 'cron', day_of_week='*', hour=3) # 3am
  scheduler.add_job(crawlers, 'cron', day_of_week='*', hour=4) # 4am
  # *************

  print('Job scheduler started at: %s'%datetime.utcnow())
  scheduler.start()
Code Example #14
def get_ports_info(hostname):
    print(request.form)
    if request.form['token'] == conf_obj.secret_token:
        ports_info = Connectlist.query.filter_by(hostname=hostname).first()
        if ports_info is None:
            add_new_host(hostname)
            ports_info = Connectlist.query.filter_by(hostname=hostname).first()
            return connect_list.jsonify(ports_info)

        if request.form['reconnect_status'] == 'True':
            kill_socket(hostname)
            return connect_list.jsonify(ports_info)

        ports_info.last_connect_time = str(datetime.utcnow())
        db.session.commit()
        return connect_list.jsonify(ports_info)
    else:
        return exit(-1)  # NOTE: exit() aborts the whole worker process, not just this request
Code Example #15
 def moz_index_metadata():
     mim = None
     l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
               flask_app.config['MOZSCAPE_API_SECRET_KEY'])
     try:
         now_timestamp = datetime.utcnow()
         try:
             mim = MozscapeIndexMetadata.get(MozscapeIndexMetadata.id == 1)
         except Exception as e:
             mim = MozscapeIndexMetadata()
             mim.timestamp = None
             print(
                 "Error: moz_index_metadata: MozscapeIndexMetadata.get(MozscapeIndexMetadata.id==1)\n%s"
                 % e)
         # do we need to update db or just return mim:
         if mim.timestamp is None or now_timestamp >= mim.next_update:
             metrics = l.index_metadata()
             mim.index_name = metrics['index_name']
             mim.crawl_duration = metrics['crawl_duration']
             mim.external_links_per_page = metrics['external_links_per_page']
             mim.links_per_page = metrics['links_per_page']
             mim.links = metrics['links']
             mim.plds = metrics['plds']
             mim.fqdns = metrics['fqdns']
             mim.nofollow = metrics['nofollow']
             mim.urls = metrics['urls']
             # the API reports 'locked' as a string:
             mim.locked = str(metrics['locked']) != 'false'
             mim.rel_canonical = metrics['rel_canonical']
             mim.last_update = datetime.fromtimestamp(metrics['last_update'])
             mim.next_update = datetime.fromtimestamp(metrics['next_update'])
             mim.timestamp = now_timestamp
             mim.save()  # create or update
     except Exception as e:
         print("Error: moz_index_metadata:\n%s" % e)
     return mim
Code Example #16
File: manage.py Project: cleesmith/cellipede
def jobs():
    """ Run background job scheduler.
      This is just a simple scheduler with no job persistence(no Redis),
      and run as a separate process from the flask app.
  """
    scheduler = BlockingScheduler()
    # *************
    # schedule jobs:
    # *************
    # see: http://apscheduler.readthedocs.org/en/latest/modules/triggers/cron.html
    # scheduler.add_job(trending, 'cron', day_of_week='*', hour=12) # every day at 12pm
    # scheduler.add_job(trending, 'cron', week='*', day_of_week='sun') # weekly on Sunday
    # scheduler.add_job(trending, 'cron', hour='*/6', args=['daily']) # every 6 hours
    scheduler.add_job(trending, 'cron', hour='18',
                      args=['daily'])  # every day at 6pm
    # scheduler.add_job(trending, 'cron', minute='*/1', args=['daily'])
    # scheduler.add_job(trending, 'cron', second='*/5', args=['daily']) # every 5 seconds
    scheduler.add_job(odesk_rss, 'cron', hour='12',
                      args=['daily', 'rails'])  # every day at noon
    # *************
    print('Job scheduler started at: %s' % datetime.utcnow())
    scheduler.start()
Code Example #17
 def fetch_counters(social_counter):
     guser = werkzeug_cache.get('guser')
     gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
     gs.login()
     ss = gs.gclient.open_by_url(social_counter.gspread_link)
     ws = ss.sheet1
     urls = gs.col_one(ws)
     results = []
     try:
         pool = Pool(flask_app.config['MULTIPROCESSING_POOL_SIZE'])
         results = pool.map(SocialCount.get_url_data, urls)
         now_timestamp = datetime.utcnow()
         nrow = 2
         for i in range(len(results)):
             # gspread update cells in row:
             acells = ws.range("B%s:H%s" % (nrow, nrow))
             acells[0].value = results[i]['tweets']
             acells[1].value = results[i]['plusses']
             acells[2].value = results[i]['total_count']
             acells[3].value = results[i]['share_count']
             acells[4].value = results[i]['like_count']
             acells[5].value = results[i]['comment_count']
             acells[6].value = results[i]['click_count']
             ws.update_cells(acells)
             c = SocialCount.create(name=social_counter.name,
                                    url=results[i]['url'],
                                    tweets=results[i]['tweets'],
                                    google_plusses=results[i]['plusses'],
                                    fb_total=results[i]['total_count'],
                                    fb_shares=results[i]['share_count'],
                                    fb_likes=results[i]['like_count'],
                                    fb_comments=results[i]['comment_count'],
                                    fb_clicks=results[i]['click_count'],
                                    timestamp=now_timestamp)
             nrow += 1
     except Exception as e:
         print("Error: fetch_counters:\n%s" % e)
     return len(results)
Code Example #18
def channels_create():
    form = YoutubeChannelForm(request.form)
    form.validate()
    new_counter = True
    if form.errors:
        pass
    else:
        now = datetime.utcnow()
        counter = ChannelCounter.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            gspread_link=request.form.get('gspread_link'),
            channel=None,
            created_at=now,
            updated_at=now)
        new_counter = False
        form = YoutubeChannelForm(None, counter)
        flash('Video counter was created')
        return redirect(url_for('channels.channels_list'))
    return render_template('channel.html',
                           current_user=current_user,
                           form=form,
                           new_counter=new_counter)
Code Example #19
def functions_update():
    # FIXME ugly code to do both edit/update
    if request.method == 'GET':
        id = request.args.get('id', '')
        func = UserFunction.get(UserFunction.id == id)
        form = UserFunctionForm(None, func)
    else:
        id = request.form.get('id')
        func = UserFunction.get(UserFunction.id == id)
        form = UserFunctionForm(request.form)
        form.validate()
        # params = request.form.get('params')
        # params_stripped = "\n".join([line.strip() for line in params.splitlines() if line.strip()])
        if form.errors:
            pass
        else:
            now = datetime.utcnow()
            func.name = request.form.get('name')
            func.runnable = request.form.get('runnable')
            func.function = request.form.get('function')
            func.gspread_link = request.form.get('gspread_link')
            # func.params = params_stripped
            func.params = None
            func.updated_at = now
            func.save()
            new_function = False
            form = UserFunctionForm(None, func)
            flash('User function was updated')
            return redirect(url_for('functions.functions_list'))
    ufs = get_user_functions()
    return render_template('function.html',
                           current_user=current_user,
                           ufs=ufs,
                           form=form,
                           new_function=False,
                           id=id)
Code Example #20
File: models.py Project: IgneaVentus/python
 def time_left(self):
     return self.time_stop - datetime.utcnow()
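time_left returns a datetime.timedelta, which goes negative once time_stop has passed. A minimal usage sketch, with a stand-in class mirroring the model method above:

from datetime import datetime, timedelta

class Timer(object):
    def __init__(self, time_stop):
        self.time_stop = time_stop  # a datetime marking the deadline

    def time_left(self):
        return self.time_stop - datetime.utcnow()

timer = Timer(datetime.utcnow() + timedelta(minutes=5))
seconds_left = max(0, int(timer.time_left().total_seconds()))  # clamp expired timers to 0
print("%d:%02d remaining" % divmod(seconds_left, 60))          # e.g. "4:59 remaining"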
Code Example #21
File: models.py Project: IgneaVentus/python
 def __init__(self, username, password, email=None):
     self.username = username
     self.password = password
     self.mail = email
     self.creation_date = datetime.utcnow().replace(microsecond=0)
Code Example #22
def youtube_channels():
    counters = ChannelCounter.select()
    for c in counters:
        if c.is_runnable():
            processed = Channel.scrape(c.name, c.urls)
    print("%s job: youtube channels processed" % datetime.utcnow())
Code Example #23
def social_counters():
    social_counters = SocialCounter.select()
    for sc in social_counters:
        if sc.is_runnable():
            processed = SocialCount.fetch_counters(sc.name, sc.urls)
    print("%s job: social counters processed" % datetime.utcnow())
Code Example #24
 def crawl(crawler):
     guser = werkzeug_cache.get('guser')
     gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
     gs.login()
     ss = gs.gclient.open_by_url(crawler.gspread_link)
     ws = ss.sheet1
     urls = gs.col_one(ws)
     # only use the first url at A2, as it's probably
     # best to use a separate spreadsheet for each base url/site:
     url = urls[0]
     # url = 'http://104.236.92.144:8888/' # live demo restriction
     site = Page(url)
     pages = []
     try:
         now_timestamp = datetime.utcnow()  # give all pages crawled the same timestamp
         # start with row 2 col B for output:
         nrow = 2
         for cp in site.crawl_and_analyze():
             # "crawl_and_analyze" is a generator that yields a Page object:
             pages.append(cp)
             # gspread update cells in row:
             acells = ws.range("B%s:Q%s" % (nrow, nrow))
             acells[0].value = cp.url
             acells[1].value = cp.title
             acells[2].value = cp.description
             acells[3].value = cp.keywords
             acells[4].value = ''
             for link in cp.links:
                 acells[4].value += "%s\n" % link
             acells[5].value = "\n".join(cp.warnings)
             words = ""
             for word_count_tuple in cp.wordcount:
                 words += "%s=%s\n" % (word_count_tuple[0],
                                       word_count_tuple[1])
             acells[6].value = words
             try:
                 acells[7].value = ''
                 suggested_keywords = CrawlerPage.suggestions(
                     "%s %s" % (cp.wordcount[0][0], cp.wordcount[1][0]))
                 for suggested in suggested_keywords:
                     for k, v in suggested.iteritems():
                         acells[7].value += "%s\n" % v
             except Exception:
                 # keyword suggestions are best-effort; blank the cell on failure
                 acells[7].value = ''
             acells[7].value = acells[7].value.rstrip()
             try:
                 acells[8].value = ''
                 suggested_keywords = CrawlerPage.suggestions(
                     "%s %s %s" % (cp.wordcount[0][0], cp.wordcount[1][0],
                                   cp.wordcount[2][0]))
                 for suggested in suggested_keywords:
                     for k, v in suggested.iteritems():
                         acells[8].value += "%s\n" % v
             except Exception:
                 acells[8].value = ''
             acells[9].value = cp.twitter_count
             acells[10].value = cp.googleplusones
             acells[11].value = cp.fb_total_count
             acells[12].value = cp.fb_share_count
             acells[13].value = cp.fb_like_count
             acells[14].value = cp.fb_comment_count
             acells[15].value = cp.fb_click_count
             ws.update_cells(acells)
             crawler_page = CrawlerPage.create(
                 name=crawler.name,
                 url=cp.url,
                 sitemap=None,
                 title_tag=cp.title,
                 meta_description=cp.description,
                 meta_keywords=cp.keywords,
                 warnings=','.join(cp.warnings),
                 h1_tag=None,
                 a_tags=cp.links,
                 img_tags=None,
                 plain_text=cp.page_text,
                 word_freqs=pickle.dumps(cp.wordcount),
                 tweets=cp.twitter_count,
                 google_plusses=cp.googleplusones,
                 fb_total=cp.fb_total_count,
                 fb_shares=cp.fb_share_count,
                 fb_likes=cp.fb_like_count,
                 fb_comments=cp.fb_comment_count,
                 fb_clicks=cp.fb_click_count,
                 timestamp=now_timestamp)
             nrow += 1
     except Exception as e:
         print("Error: CrawlerPage:crawl:\n%s" % e)
     return len(pages)