Example #1
def store_analysis(results, iri):
    """Store results of the analysis in redis."""
    red = store_analysis.redis  # the connection is attached to the task object

    # results: a list of JSON strings (each produced by json.dumps())
    if len(results) > 0:
        store = json.dumps({
            'analysis': [
                json.loads(x) for x in results
                if (x is not None) and (len(x) > 0)
            ],
            'iri': iri
        })
    else:
        red.delete(data_key(iri))
        return

    key_result = analysis_dataset(iri)
    with red.pipeline() as pipe:
        pipe.set(key_result, store)
        pipe.sadd('purgeable', key_result)
        pipe.expire(key_result, expiration[KeyRoot.ANALYSIS])
        pipe.delete(data_key(iri))  # trash the original content (the index shouldn't need it anymore)
        pipe.execute()
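
A consumer can read the record back with the same key helper. A minimal sketch, assuming red is the same redis connection and analysis_dataset the same key builder used above:

import json

def load_analysis(red, iri):
    """Fetch the stored analysis document, or None if it has expired."""
    raw = red.get(analysis_dataset(iri))
    return json.loads(raw) if raw is not None else None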
Example #2
    def gen_iri_guess(iri, r):
        # Nested helper: red, key, log and type are expected to come from the
        # enclosing scope.
        deco = {
            'zip': decompress_7z,
            'gzip': decompress_gzip
        }
        for sub_iri in deco[type](iri, r, red):
            if red.sadd(key, sub_iri) == 0:
                log.debug(f'Skipping distribution as it was recently analyzed: {sub_iri!s}')
                continue

            sub_key = data_key(sub_iri)
            red.expire(sub_key, expiration[KeyRoot.DATA])
            red.sadd('purgeable', sub_key)

            if sub_iri.endswith('/data'):  # extracted a file without a filename
                yield sub_iri, 'text/plain'  # this will allow for analysis to happen
                continue

            try:
                guess, _ = guess_format(sub_iri, r, log, red)
            except Skip:
                continue
            if guess is None:
                log.warning(f'Unknown format after decompression: {sub_iri}')
                red.expire(data_key(sub_iri), 1)  # expire the stored data almost immediately
            else:
                yield sub_iri, guess
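
gen_iri_guess is a generator of (sub_iri, media_type) pairs, so a caller drives it with a plain loop; a hypothetical consumption sketch:

for sub_iri, media_type in gen_iri_guess(iri, r):
    # Each pair names one decompressed sub-file and the format guessed for it.
    log.debug(f'Queueing {sub_iri} as {media_type}')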
Example #3
def analyze(iri, format_guess):
    """Actually run the analyzer."""
    key = data_key(iri)
    tokens = [it.token for it in AbstractAnalyzer.__subclasses__()]
    # Celery chord: run one run_one_analyzer task per token in parallel, then
    # hand the collected results to store_analysis as a single callback.
    chord(run_one_analyzer.si(token, key, format_guess)
          for token in tokens)(store_analysis.s(iri))
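
chord is Celery's fan-out/fan-in primitive: the header tasks run in parallel and the callback receives the list of their results as its first argument. A self-contained sketch of the same pattern, assuming hypothetical broker/backend URLs (chords need a result backend):

from celery import Celery, chord

app = Celery('demo', broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/0')  # hypothetical URLs

@app.task
def run_one(token):
    return f'result-of-{token}'

@app.task
def collect(results, iri):
    # results is the list of all run_one return values; iri is the extra
    # argument bound by collect.s(iri), just like store_analysis.s(iri) above.
    print(iri, results)

chord(run_one.si(t) for t in ('a', 'b', 'c'))(collect.s('http://example.com/ds'))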
Example #4
def gen_tasks(iri, r):
    # Note: red, log and type are expected to come from the enclosing scope.
    lst = []
    try:
        for x in gen_iri_guess(iri, r):  # this performs the decompression
            lst.append(x)
    except SizeException as e:
        log.warning(f'One of the files in archive {iri} is too large ({e.name})')
        for sub_iri, _ in lst:
            log.debug(f'Expire {sub_iri}')
            red.expire(data_key(sub_iri), 1)
    except TypeError:
        log.exception(f'iri: {iri!s}, type: {type!s}')
    else:
        for sub_iri, guess in lst:
            yield index.si(sub_iri, guess)
            yield analyze.si(sub_iri, guess)
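
Since gen_tasks yields ready-made immutable signatures, the natural consumer wraps them in a Celery group; a hypothetical driver:

from celery import group

# Schedule indexing and analysis of every sub-file in one shot.
group(gen_tasks(iri, r)).apply_async()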
Example #5
def run_one_indexer(token, iri, format_guess):
    """Extract graph from redis and run indexer identified by token on it."""
    log = logging.getLogger(__name__)
    red = run_one_indexer.redis
    key = data_key(iri)

    log.debug('Parsing graph')
    try:
        g = rdflib.ConjunctiveGraph()
        g.parse(data=red.get(key), format=format_guess)
    except (rdflib.plugin.PluginException, ValueError):
        log.debug('Failed to parse graph')
        return 0

    return run_indexer(token, iri, g, red)
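
The two catches cover the usual failure modes: rdflib raises rdflib.plugin.PluginException when format_guess names no registered parser, while the ValueError branch covers malformed input. The parse call in isolation:

import rdflib

g = rdflib.ConjunctiveGraph()
g.parse(data='<http://ex.org/s> <http://ex.org/p> <http://ex.org/o> .',
        format='nt')
print(len(g))  # 1 triple parsed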
Example #6
def store_content(iri, r, red):
    """Store contents into redis."""
    key = data_key(iri)
    if not red.exists(key):
        chunk_size = 1024
        conlen = 0
        with red.pipeline() as pipe:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:
                    if len(chunk) + conlen > MAX_CONTENT_LENGTH:
                        # Too large: flush the queued commands, drop the partial key.
                        pipe.delete(key)
                        pipe.execute()
                        raise SizeException(iri)
                    pipe.append(key, chunk)
                    conlen = conlen + len(chunk)
            pipe.expire(key, expiration[KeyRoot.DATA])
            pipe.sadd('purgeable', key)
            pipe.execute()
        monitor.log_size(conlen)
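
r is expected to be a streaming HTTP response: iter_content pulls the body chunk by chunk, so the MAX_CONTENT_LENGTH cap is enforced without ever buffering the whole file. A hypothetical call site using requests:

import requests

with requests.get(iri, stream=True, timeout=30) as r:
    r.raise_for_status()
    store_content(iri, r, red)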
Example #7
def decompress_7z(iri, r, red):
    """Download a 7z file, decompress it and store contents in redis."""
    data = load_data(iri, r)
    log = logging.getLogger(__name__)

    expiration = expire_table[KeyRoot.DATA]
    deco_size_total = 0
    with libarchive.memory_reader(data) as archive:
        for entry in archive:
            try:
                name = str(entry)
            except Exception:  # the entry name may be missing or undecodable
                name = str(uuid.uuid4())
            if len(name) == 0:
                if iri.endswith('.zip'):
                    sub_iri = iri[:-4]
                else:
                    sub_iri = f'{iri}/{name}'  # name is empty here, so this ends with '/'
                    log.error(f'Empty name, iri: {iri!s}')
            else:
                sub_iri = f'{iri}/{name}'
            sub_key = data_key(sub_iri)
            log.debug(f'Store {name} into {sub_key}')
            conlen = 0
            if not red.exists(sub_key):
                red.sadd('purgeable', sub_key)
                for block in entry.get_blocks():
                    if len(block) + conlen > MAX_CONTENT_LENGTH:
                        # Appending more would exceed the cap (redis would reject
                        # the oversized value anyway), so drop the partial key.
                        red.expire(sub_key, 0)
                        raise SizeException(name)

                    red.append(sub_key, block)
                    conlen = conlen + len(block)
                red.expire(sub_key, expiration)
                monitor.log_size(conlen)
                log.debug(f'Subfile has size {conlen}')
                deco_size_total = deco_size_total + conlen
            else:
                log.warning(f'Data already exists for {sub_iri}')
            if conlen > 0:
                yield sub_iri
    log.debug(f'Done decompression, total decompressed size {deco_size_total}')
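
memory_reader comes from the libarchive-c binding and iterates archive entries straight out of an in-memory buffer. A minimal standalone sketch of the same read loop, independent of redis:

import libarchive

def list_entries(data):
    """Yield (name, size in bytes) for every entry of an in-memory archive."""
    with libarchive.memory_reader(data) as archive:
        for entry in archive:
            size = sum(len(block) for block in entry.get_blocks())
            yield str(entry), size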
Example #8
def decompress_gzip(iri, r, red):
    """Download a gzip file, decompress it and store contents in redis."""
    data = load_data(iri, r)

    expiration = expire_table[KeyRoot.DATA]
    if iri.endswith('.gz'):
        iri = iri[:-3]
    else:
        iri = iri + '/data'
    key = data_key(iri)
    decompressed = gzip.decompress(data)
    if len(decompressed) > MAX_CONTENT_LENGTH:
        raise SizeException(iri)

    red.set(key, decompressed)
    red.expire(key, expiration)
    deco_size_total = len(decompressed)
    monitor.log_size(deco_size_total)
    log = logging.getLogger(__name__)
    log.debug(f'Done decompression, total decompressed size {deco_size_total}')
    # gen_iri_guess (Example #2) iterates over this result, so yield the single
    # sub-IRI; returning a plain string would make the caller iterate characters.
    yield iri
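
Note how the renaming lines up with Example #2: stripping .gz keeps a guessable file extension, while the /data suffix marks a nameless payload that gen_iri_guess later routes to text/plain. The naming rule in isolation (illustrative helper, not part of the source):

def gzip_sub_iri(iri):
    # Hypothetical helper mirroring the renaming above.
    return iri[:-3] if iri.endswith('.gz') else iri + '/data'

assert gzip_sub_iri('http://example.com/dump.ttl.gz') == 'http://example.com/dump.ttl'
assert gzip_sub_iri('http://example.com/file') == 'http://example.com/file/data'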