def url_stahovak(logger, sys_path, cursor):
    """
    Download the documents behind URL entities that have no download yet.

    Never returns.  Each pass selects the pending URL entities (earliest
    ``pubDate + pubTime`` of the referencing document first), hands every
    one to ``db_url_stahovak.download_and_insert`` and stores the returned
    document id in the ``downloadurl`` table.  When nothing is pending,
    ``timer.sleep_to_tomorrow()`` is called before polling again.

    Parameters:
        logger   -- not used by this function; kept for the caller's
                    interface.
        sys_path -- directory added to ``sys.path`` so that
                    ``db_url_stahovak`` can be imported.
        cursor   -- database cursor; must accept ``%s`` placeholders and
                    be iterable over the rows of the last query.

    NOTE(review): no commit is issued here -- presumably the connection
    is in autocommit mode or the caller commits; confirm.
    """
    # Register the import directory only once, so repeated invocations do
    # not keep growing sys.path.
    if sys_path not in sys.path:
        sys.path.append(sys_path)
    from db_url_stahovak import download_and_insert

    url_enttype_id = 9  # entities.enttype_id value that selects URLs
    batch_size = 1      # number of entities fetched per pass

    # The "-- JOIN" SQL comments end at the line break, so the line
    # breaks inside this statement are significant.
    select_pending = """select e.id, e.name, d.pubDate, d.pubTime
        from entities e, instances i, documents d
        WHERE e.enttype_id=%s
        AND NOT EXISTS (SELECT * FROM downloadurl WHERE entity_id=e.id)
        AND i.entity_id=e.id -- JOIN
        AND i.item_id=d.id -- JOIN
        ORDER BY (d.pubDate + d.pubTime)
        LIMIT %s;"""
    insert_download = """INSERT INTO downloadurl (entity_id, document_id)
        VALUES (%s, %s)"""

    while True:
        cursor.execute(select_pending, (url_enttype_id, batch_size))
        # Materialise the rows first: the same cursor is reused for the
        # INSERT below.
        pending = list(cursor)
        if not pending:
            timer.sleep_to_tomorrow()
            continue

        for counter, row in enumerate(pending, 1):
            entity_id, entity_name, doc_pubdate, doc_pubtime = row
            # A single pre-formatted argument prints the same text under
            # the Python 2 print statement and the Python 3 function.
            print(">>> %s %s %s %s"
                  % (counter, entity_id, entity_name, doc_pubdate))
            document_id = download_and_insert(entity_name, doc_pubdate,
                                              doc_pubtime)
            print(">>> %s" % (document_id,))
            cursor.execute(insert_download, (entity_id, document_id))
def url_stahovak(logger, sys_path, cursor):
    """
    Download the documents behind URL entities that have no download yet.

    Never returns.  Each pass selects the pending URL entities (earliest
    ``pubDate + pubTime`` of the referencing document first), hands every
    one to ``db_url_stahovak.download_and_insert`` and stores the returned
    document id in the ``downloadurl`` table.  When nothing is pending,
    ``timer.sleep_to_tomorrow()`` is called before polling again.

    Parameters:
        logger   -- not used by this function; kept for the caller's
                    interface.
        sys_path -- directory added to ``sys.path`` so that
                    ``db_url_stahovak`` can be imported.
        cursor   -- database cursor; must accept ``%s`` placeholders and
                    be iterable over the rows of the last query.

    NOTE(review): no commit is issued here -- presumably the connection
    is in autocommit mode or the caller commits; confirm.
    """
    # Register the import directory only once, so repeated invocations do
    # not keep growing sys.path.
    if sys_path not in sys.path:
        sys.path.append(sys_path)
    from db_url_stahovak import download_and_insert

    url_enttype_id = 9  # entities.enttype_id value that selects URLs
    batch_size = 1      # number of entities fetched per pass

    # The "-- JOIN" SQL comments end at the line break, so the line
    # breaks inside this statement are significant.
    select_pending = """select e.id, e.name, d.pubDate, d.pubTime
        from entities e, instances i, documents d
        WHERE e.enttype_id=%s
        AND NOT EXISTS (SELECT * FROM downloadurl WHERE entity_id=e.id)
        AND i.entity_id=e.id -- JOIN
        AND i.item_id=d.id -- JOIN
        ORDER BY (d.pubDate + d.pubTime)
        LIMIT %s;"""
    insert_download = """INSERT INTO downloadurl (entity_id, document_id)
        VALUES (%s, %s)"""

    while True:
        cursor.execute(select_pending, (url_enttype_id, batch_size))
        # Materialise the rows first: the same cursor is reused for the
        # INSERT below.
        pending = list(cursor)
        if not pending:
            timer.sleep_to_tomorrow()
            continue

        for counter, row in enumerate(pending, 1):
            entity_id, entity_name, doc_pubdate, doc_pubtime = row
            # A single pre-formatted argument prints the same text under
            # the Python 2 print statement and the Python 3 function.
            print(">>> %s %s %s %s"
                  % (counter, entity_id, entity_name, doc_pubdate))
            document_id = download_and_insert(entity_name, doc_pubdate,
                                              doc_pubtime)
            print(">>> %s" % (document_id,))
            cursor.execute(insert_download, (entity_id, document_id))
def cron():
    """
    Run the commands listed in the `crontab` file, then sleep; forever.

    On every pass the file named by the module-level `crontab` is read
    again and executed line by line.  Empty lines and lines with a
    leading # are ignored.  A non-zero return value of a command is
    reported through `log`; it does not stop the remaining commands.
    After the pass `timer.sleep_to_tomorrow()` is called.  Never returns.
    """
    while True:
        # `with` closes the file even when reading it fails.
        with open(crontab, 'r') as crontab_file:
            commands = [line.strip() for line in crontab_file]

        for command in commands:
            if not command or command.startswith('#'):
                continue
            # os.system hands the line to the shell, so crontab lines
            # may use shell syntax.
            retval = os.system(command)
            if retval:
                log('return value = {%s} for line {%s}' % (retval, command))

        # NOTE(review): written once per pass as a separator -- confirm
        # this is the intended place (vs. once per failed command).
        log('*' * 80)
        print('going to sleep to tomorrow')
        timer.sleep_to_tomorrow()
def stanford(logger, sys_path, cursor, offset=0, limit=200):
    """
    Pass documents that still have no term vector to `stanford.analyze`.

    Never returns normally.  Each pass selects up to `limit` document ids
    (highest id first, skipping `offset` rows) where `termvector` is
    null, `_stanford` is null or false, `_calaised` is true, the language
    is "en" or "de" and the source is listed in `prefered_sources`.
    Every id is given to `analyze`; a truthy return value is treated as
    an error message and logged.  A pass that selected no rows is
    followed by `timer.sleep_to_tomorrow()`.

    Parameters:
        logger   -- object with `debug` and `info` methods.
        sys_path -- directory added to `sys.path` so that the `stanford`
                    module can be imported.
        cursor   -- database cursor; must accept `%s` placeholders and
                    be iterable over the rows of the last query.
        offset   -- number of leading rows skipped by the query.
        limit    -- maximum number of documents handled per pass.

    Raises:
        ValueError -- if `cursor` is not set (falsy).

    NOTE(review): when a pass selects rows but `analyze` fails without
    changing what the query matches, the next pass starts immediately on
    the same rows -- confirm that `analyze` marks failed documents.
    """
    # stanford should be last running always
    # TODO timer.sleep_minutes(120)
    if not cursor:
        raise ValueError('Cursor is not set!')

    # One-time setup, kept out of the loop so that sys.path and the
    # warnings filter list do not grow on every pass.
    if sys_path not in sys.path:
        sys.path.append(sys_path)
    warnings.filterwarnings('ignore')  # ignoring for import
    from stanford import analyze
    warnings.filterwarnings('default')

    select_pending = """select id from documents
        where termvector is null
        and (_stanford is null OR _stanford=%s)
        and _calaised=%s
        and language in (%s, %s)
        and source_id in (select id from prefered_sources)
        order by id desc
        limit %s offset %s"""
    select_params = (False, True, "en", "de", limit, offset)

    while True:
        cursor.execute(select_pending, select_params)

        ok = 0
        count = 0
        for row in cursor:
            count += 1
            doc_id = row[0]
            logger.debug('analyzing id {%s}' % doc_id)
            errmsg = analyze(int(doc_id))
            if errmsg:
                logger.info('error message = {%s}' % errmsg)
            else:
                ok += 1

        logger.info(
            'stanford analyzed: limit = {%s}, ok = {%s} => errors = {%s}'
            % (limit, ok, count - ok))

        if not count:
            logger.debug('going to sleep to tomorrow')
            timer.sleep_to_tomorrow()
def stanford(logger, sys_path, cursor, offset=0, limit=200):
    """
    Pass documents that still have no term vector to `stanford.analyze`.

    Never returns normally.  Each pass selects up to `limit` document ids
    (highest id first, skipping `offset` rows) where `termvector` is
    null, `_stanford` is null or false, `_calaised` is true, the language
    is "en" or "de" and the source is listed in `prefered_sources`.
    Every id is given to `analyze`; a truthy return value is treated as
    an error message and logged.  A pass that selected no rows is
    followed by `timer.sleep_to_tomorrow()`.

    Parameters:
        logger   -- object with `debug` and `info` methods.
        sys_path -- directory added to `sys.path` so that the `stanford`
                    module can be imported.
        cursor   -- database cursor; must accept `%s` placeholders and
                    be iterable over the rows of the last query.
        offset   -- number of leading rows skipped by the query.
        limit    -- maximum number of documents handled per pass.

    Raises:
        ValueError -- if `cursor` is not set (falsy).

    NOTE(review): when a pass selects rows but `analyze` fails without
    changing what the query matches, the next pass starts immediately on
    the same rows -- confirm that `analyze` marks failed documents.
    """
    # stanford should be last running always
    # TODO timer.sleep_minutes(120)
    if not cursor:
        raise ValueError('Cursor is not set!')

    # One-time setup, kept out of the loop so that sys.path and the
    # warnings filter list do not grow on every pass.
    if sys_path not in sys.path:
        sys.path.append(sys_path)
    warnings.filterwarnings('ignore')  # ignoring for import
    from stanford import analyze
    warnings.filterwarnings('default')

    select_pending = """select id from documents
        where termvector is null
        and (_stanford is null OR _stanford=%s)
        and _calaised=%s
        and language in (%s, %s)
        and source_id in (select id from prefered_sources)
        order by id desc
        limit %s offset %s"""
    select_params = (False, True, "en", "de", limit, offset)

    while True:
        cursor.execute(select_pending, select_params)

        ok = 0
        count = 0
        for row in cursor:
            count += 1
            doc_id = row[0]
            logger.debug('analyzing id {%s}' % doc_id)
            errmsg = analyze(int(doc_id))
            if errmsg:
                logger.info('error message = {%s}' % errmsg)
            else:
                ok += 1

        logger.info(
            'stanford analyzed: limit = {%s}, ok = {%s} => errors = {%s}'
            % (limit, ok, count - ok))

        if not count:
            logger.debug('going to sleep to tomorrow')
            timer.sleep_to_tomorrow()
try: p = subprocess.Popen(sys_path, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError, e: logger.critical(e) return except Exception, e: logger.error(e) return (stdout, stderr) = p.communicate() if p.returncode: logger.error('return code = {%s}' % p.returncode) logger.error('stderr = {%s}' % stderr.strip()) logger.info('stdout = {%s}' % stderr.strip()) #return logger.debug('going to sleep to tomorrow') timer.sleep_to_tomorrow() @logDecorator @endlessDecorator(1) def url_stahovak(logger, sys_path, cursor): sys.path.append(sys_path) from db_url_stahovak import download_and_insert URL = 9 while True: counter = 0 LIMIT = 1 cursor.execute("""select e.id, e.name, d.pubDate,
stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError, e: logger.critical(e) return except Exception, e: logger.error(e) return (stdout, stderr) = p.communicate() if p.returncode: logger.error('return code = {%s}' % p.returncode) logger.error('stderr = {%s}' % stderr.strip()) logger.info('stdout = {%s}' % stderr.strip()) #return logger.debug('going to sleep to tomorrow') timer.sleep_to_tomorrow() @logDecorator @endlessDecorator(1) def url_stahovak(logger, sys_path, cursor): sys.path.append(sys_path) from db_url_stahovak import download_and_insert URL = 9 while True: counter = 0 LIMIT = 1 cursor.execute( """select e.id, e.name,