Esempio n. 1
0
def url_stahovak(logger, sys_path, cursor):
    sys.path.append(sys_path)
    from db_url_stahovak import download_and_insert
    URL = 9
    while True:
        counter = 0
        LIMIT = 1
        cursor.execute("""select 
                        e.id, 
                        e.name, 
                        d.pubDate,
                        d.pubTime
                        from entities e, instances i, documents d
                        WHERE
                        e.enttype_id=%s
                        AND NOT EXISTS (SELECT * FROM downloadurl WHERE entity_id=e.id) 
                        AND i.entity_id=e.id -- JOIN
                        AND i.item_id=d.id   -- JOIN
                        ORDER BY (d.pubDate + d.pubTime)
                        LIMIT %s;""", (URL, LIMIT))
        instances = [i for i in cursor]
        if len( instances ) == 0:
            timer.sleep_to_tomorrow()
            continue
        for instance in instances:
            counter += 1
            entity_id, entity_name, doc_pubdate, doc_pubtime = instance
            print ">>>", str(counter), entity_id, entity_name, doc_pubdate
            document_id = download_and_insert(entity_name, doc_pubdate, doc_pubtime)
            print ">>>", document_id
            cursor.execute("""INSERT INTO downloadurl (entity_id, document_id)
                              VALUES (%s, %s)""", (entity_id, document_id))
Esempio n. 2
0
def url_stahovak(logger, sys_path, cursor):
    sys.path.append(sys_path)
    from db_url_stahovak import download_and_insert
    URL = 9
    while True:
        counter = 0
        LIMIT = 1
        cursor.execute(
            """select 
                        e.id, 
                        e.name, 
                        d.pubDate,
                        d.pubTime
                        from entities e, instances i, documents d
                        WHERE
                        e.enttype_id=%s
                        AND NOT EXISTS (SELECT * FROM downloadurl WHERE entity_id=e.id) 
                        AND i.entity_id=e.id -- JOIN
                        AND i.item_id=d.id   -- JOIN
                        ORDER BY (d.pubDate + d.pubTime)
                        LIMIT %s;""", (URL, LIMIT))
        instances = [i for i in cursor]
        if len(instances) == 0:
            timer.sleep_to_tomorrow()
            continue
        for instance in instances:
            counter += 1
            entity_id, entity_name, doc_pubdate, doc_pubtime = instance
            print ">>>", str(counter), entity_id, entity_name, doc_pubdate
            document_id = download_and_insert(entity_name, doc_pubdate,
                                              doc_pubtime)
            print ">>>", document_id
            cursor.execute(
                """INSERT INTO downloadurl (entity_id, document_id)
                              VALUES (%s, %s)""", (entity_id, document_id))
Esempio n. 3
0
def cron():
    """
    Open `crontab` and run line per line 
        (ignore empty lines and lines with leading #).
    """
    while True:
        file = open(crontab, 'r')
        scripts = [x.strip() for x in file]
        file.close()
        for x in scripts:
            x = x.strip()
            if not x.startswith('#') and x:
                #log('running {%s}' % x)
                #retval = subprocess.call(shlex.split(x), shell=True)
                retval = os.system(x)
                if retval:
                    log('return value = {%s} for line {%s}' % (retval, x))
                    log('*'*80)
        print 'going to sleep to tomorrow'
        timer.sleep_to_tomorrow()
Esempio n. 4
0
def stanford(logger, sys_path, cursor, offset=0, limit=200):
    """
    Endlessly run `stanford.analyze` over documents that have no term vector.

    Parameters:
        logger   -- object with `debug` and `info` methods (logging style).
        sys_path -- directory added to `sys.path` so that the `stanford`
                    module can be imported.
        cursor   -- DB cursor; it must be truthy, support `execute` with
                    %s parameters and be iterable over the result rows.
        offset   -- value bound to the OFFSET of the selecting query.
        limit    -- value bound to the LIMIT of the selecting query.

    Each pass selects ids of documents where `termvector` is null,
    `_stanford` is null or false, `_calaised` is true, the language is
    "en" or "de" and the source is listed in `prefered_sources`, highest id
    first.  `analyze(int(id))` is called for every row; a truthy return
    value is treated as an error message and logged.  When a pass selects
    no rows, `timer.sleep_to_tomorrow()` is called before the next pass.

    Raises ValueError when `cursor` is falsy.  Otherwise never returns.
    """
    # stanford should be last running always
    # TODO timer.sleep_minutes(120)
    if not cursor:
        raise ValueError('Cursor is not set!')

    # One-time setup.  Doing this inside the loop appended a duplicate
    # `sys.path` entry and re-registered the warning filters on every pass.
    if sys_path not in sys.path:
        sys.path.append(sys_path)
    warnings.filterwarnings('ignore')  # ignoring for import
    try:
        from stanford import analyze
    finally:
        # Switch back to 'default' even when the import fails, so a failed
        # import does not leave every warning silenced.
        warnings.filterwarnings('default')

    while True:
        cursor.execute(
            """select id from documents 
                          where termvector is null 
                                and (_stanford is null OR _stanford=%s)
                                and _calaised=%s
                                and language in (%s, %s)
                                and source_id in (select id from prefered_sources)
                          order by id desc
                          limit %s 
                          offset %s""",
            (False, True, "en", "de", limit, offset))
        ok_count = 0
        total = 0
        for row in cursor:
            total += 1
            doc_id = row[0]
            logger.debug('analyzing id {%s}', doc_id)
            errmsg = analyze(int(doc_id))
            if errmsg:
                logger.info('error message = {%s}', errmsg)
            else:
                ok_count += 1
        logger.info('stanford analyzed: limit = {%s}, ok = {%s} => errors = {%s}',
                    limit, ok_count, total - ok_count)
        if not total:
            logger.debug('going to sleep to tomorrow')
            timer.sleep_to_tomorrow()
Esempio n. 5
0
def stanford(logger, sys_path, cursor, offset=0, limit=200):
    """
    Endlessly run `stanford.analyze` over documents that have no term vector.

    Parameters:
        logger   -- object with `debug` and `info` methods (logging style).
        sys_path -- directory added to `sys.path` so that the `stanford`
                    module can be imported.
        cursor   -- DB cursor; it must be truthy, support `execute` with
                    %s parameters and be iterable over the result rows.
        offset   -- value bound to the OFFSET of the selecting query.
        limit    -- value bound to the LIMIT of the selecting query.

    Each pass selects ids of documents where `termvector` is null,
    `_stanford` is null or false, `_calaised` is true, the language is
    "en" or "de" and the source is listed in `prefered_sources`, highest id
    first.  `analyze(int(id))` is called for every row; a truthy return
    value is treated as an error message and logged.  When a pass selects
    no rows, `timer.sleep_to_tomorrow()` is called before the next pass.

    Raises ValueError when `cursor` is falsy.  Otherwise never returns.
    """
    # stanford should be last running always
    # TODO timer.sleep_minutes(120)
    if not cursor:
        raise ValueError('Cursor is not set!')

    # One-time setup.  Doing this inside the loop appended a duplicate
    # `sys.path` entry and re-registered the warning filters on every pass.
    if sys_path not in sys.path:
        sys.path.append(sys_path)
    warnings.filterwarnings('ignore') # ignoring for import
    try:
        from stanford import analyze
    finally:
        # Switch back to 'default' even when the import fails, so a failed
        # import does not leave every warning silenced.
        warnings.filterwarnings('default')

    while True:
        cursor.execute("""select id from documents 
                          where termvector is null 
                                and (_stanford is null OR _stanford=%s)
                                and _calaised=%s
                                and language in (%s, %s)
                                and source_id in (select id from prefered_sources)
                          order by id desc
                          limit %s 
                          offset %s""",
                       (False, True, "en", "de", limit, offset))
        ok_count = 0
        total = 0
        for row in cursor:
            total += 1
            doc_id = row[0]
            logger.debug('analyzing id {%s}', doc_id)
            errmsg = analyze(int(doc_id))
            if errmsg:
                logger.info('error message = {%s}', errmsg)
            else:
                ok_count += 1
        logger.info('stanford analyzed: limit = {%s}, ok = {%s} => errors = {%s}',
                    limit, ok_count, total - ok_count)
        if not total:
            logger.debug('going to sleep to tomorrow')
            timer.sleep_to_tomorrow()
Esempio n. 6
0
        try:
            p = subprocess.Popen(sys_path, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except OSError, e:
            logger.critical(e)
            return
        except Exception, e:
            logger.error(e)
            return
        (stdout, stderr) = p.communicate()
        if p.returncode:
            logger.error('return code = {%s}' % p.returncode)
        logger.error('stderr = {%s}' % stderr.strip())
        logger.info('stdout = {%s}' % stderr.strip())
        #return
        logger.debug('going to sleep to tomorrow')
        timer.sleep_to_tomorrow()


@logDecorator
@endlessDecorator(1)
def url_stahovak(logger, sys_path, cursor):
    sys.path.append(sys_path)
    from db_url_stahovak import download_and_insert
    URL = 9
    while True:
        counter = 0
        LIMIT = 1
        cursor.execute("""select 
                        e.id, 
                        e.name, 
                        d.pubDate,
Esempio n. 7
0
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        except OSError, e:
            logger.critical(e)
            return
        except Exception, e:
            logger.error(e)
            return
        (stdout, stderr) = p.communicate()
        if p.returncode:
            logger.error('return code = {%s}' % p.returncode)
        logger.error('stderr = {%s}' % stderr.strip())
        logger.info('stdout = {%s}' % stderr.strip())
        #return
        logger.debug('going to sleep to tomorrow')
        timer.sleep_to_tomorrow()


@logDecorator
@endlessDecorator(1)
def url_stahovak(logger, sys_path, cursor):
    sys.path.append(sys_path)
    from db_url_stahovak import download_and_insert
    URL = 9
    while True:
        counter = 0
        LIMIT = 1
        cursor.execute(
            """select 
                        e.id, 
                        e.name,