Example 1
def scrape():
    logger.info("starting session")

    # Create the missing DataSources.
    existing_domains = [ds[0] for ds in db.query(DataSource.domain).all()]
    for domain in sources.SOURCES.keys():
        if domain not in existing_domains:
            logger.info("creating new data source, domain = %s", domain)
            new_ds = DataSource(domain=domain)
            db.merge(new_ds)
    db.commit()

    active_ds = db.query(DataSource.id).filter(DataSource.active == True).all()
    data_sources = [ds[0] for ds in active_ds]

    loop = asyncio.get_event_loop()

    # Change the default executor so there are enough workers to handle the
    # request concurrency.
    executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=settings.REQUEST_CONCURRENCY + 5
    )
    loop.set_default_executor(executor)
    loop.request_semaphore = asyncio.Semaphore(settings.REQUEST_CONCURRENCY)

    for data_source_id in data_sources:
        loop.create_task(scrape_entries(data_source_id))

    loop.run_forever()
    loop.close()
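
The semaphore and enlarged executor set up here are presumably what the `get` coroutine used by `scrape_entry` (Example 24) relies on. Below is a minimal sketch of such a helper, written in the same pre-async/await coroutine style, assuming it simply runs the blocking `requests.get` call on the default executor while the semaphore caps the number of requests in flight; this implementation is an assumption, not part of the original code.

import asyncio
import functools

import requests


@asyncio.coroutine
def get(url, headers=None):
    loop = asyncio.get_event_loop()
    # Cap the number of in-flight requests at settings.REQUEST_CONCURRENCY.
    yield from loop.request_semaphore.acquire()
    try:
        # The default executor was resized in scrape(), so blocking calls
        # won't starve the other scraping tasks.
        response = yield from loop.run_in_executor(
            None, functools.partial(requests.get, url, headers=headers)
        )
    finally:
        loop.request_semaphore.release()
    return response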
Example 2
def view_result(embedding_id, testset_id):
    # Calculate the ranking of the embedding on the testset.
    partition = func.rank().over(partition_by=Result.testset_id,
                                 order_by=Result.accuracy.desc()).label('rank')
    sq = db.query(Result, partition).subquery()

    result = db.query(sq).filter(sq.c.embedding_id == embedding_id,
                                 sq.c.testset_id == testset_id).first()

    if not result:
        abort(404)

    return jsonify(data=serialize_result(result, summary=False))
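
For reference, the ranked subquery built above is roughly equivalent to the SQL sketched in the comment below; table and column names depend on the actual model definitions, so this is illustrative only.

# SELECT result.*,
#        rank() OVER (PARTITION BY result.testset_id
#                     ORDER BY result.accuracy DESC) AS rank
# FROM result
#
# The outer query then filters this subquery by embedding_id and testset_id, so
# the returned row exposes both the Result columns and the computed `rank` label.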
Example 3
def download_embedding(embedding_id):
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)

    def content_streamer():
        z = zipstream.ZipFile(mode='w',
                              compression=zipstream.ZIP_DEFLATED,
                              allowZip64=True)

        open_files = []
        for path in embedding.get_all_files():
            current_file = open(path, 'rb')
            open_files.append(current_file)

            # Get the actual file name.
            file_name = os.path.split(path)[-1]
            z.write_iter(file_name, current_file)

        for chunk in z:
            yield chunk

        for f in open_files:
            f.close()

    response = Response(content_streamer(), mimetype="application/zip")
    disposition = "attachment; filename={}.zip".format(embedding.file_name)
    response.headers['Content-Disposition'] = disposition
    return response
Example 4
def list_testsets():
    testsets = db.query(TestSet).all()

    data = [serialize_testset(tst) for tst in testsets]
    meta = {'count': len(data)}

    return jsonify(data=data, meta=meta)
Example 5
def test(self, testing_job_id):
    testing_job = db.query(TestingJob).get(testing_job_id)
    if not testing_job:
        raise Exception("TestingJob doesn't exist")

    # Update testing job's task_id.
    testing_job.task_id = test.request.id
    embedding = testing_job.embedding
    testset = testing_job.testset

    db.commit()

    def report(progress):
        self.update_state(state='PROGRESS', meta={'progress': progress})

    # Initial progress report so it happens before loading the model.
    report(0.0)

    start_time = time.time()
    result = evaluate(embedding, testset, report=report)
    end_time = time.time()

    result.testing_job = testing_job
    testing_job.task_id = None
    testing_job.elapsed_time = int(end_time - start_time)
    db.commit()
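
The `PROGRESS` state reported through `update_state` above can be read back by an API client with Celery's standard result API. The helper below is a hypothetical sketch, not part of the original code; the function name and fallback values are assumptions, and it assumes the `celery_app` object used elsewhere in the codebase (Example 14) is in scope.

from celery.result import AsyncResult


def testing_job_progress(task_id):
    # The `meta` dict passed to update_state() is exposed as `result.info`
    # while the task is in the custom PROGRESS state.
    result = AsyncResult(task_id, app=celery_app)
    if result.state == 'PROGRESS':
        return result.info.get('progress', 0.0)
    return 1.0 if result.successful() else 0.0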
Example 6
def main():
    embeddings = db.query(Embedding).all()
    for embedding in embeddings:
        if embedding.model == 'glove':
            migrate_glove(embedding)
        elif embedding.model == 'word2vec':
            migrate_word2vec(embedding)
Example 7
def train(self, training_job_id):

    training_job = db.query(TrainingJob).get(training_job_id)
    if not training_job:
        raise Exception("TrainingJob doesn't exist")

    training_job.task_id = train.request.id
    embedding = training_job.embedding
    db.commit()

    def report(progress):
        self.update_state(state='PROGRESS', meta={'progress': progress})

    start_time = time.time()
    train_model(
        embedding.model,
        embedding.query,
        embedding.preprocessing,
        embedding.parameters,
        embedding.file_name,
        report
    )
    end_time = time.time()

    training_job.task_id = None
    training_job.elapsed_time = int(end_time - start_time)
    embedding = training_job.embedding
    embedding.status = 'TRAINED'
    db.commit()
Example 8
def list_testing_jobs():
    status = request.args.get('status', None)
    embedding_id = request.args.get('embedding', None)
    testset_id = request.args.get('testset', None)

    # Failed jobs are included in `queued`, as their `elapsed_time` will be
    # None too.
    query = db.query(TestingJob)
    if status == 'finished':
        query = query.filter(~TestingJob.elapsed_time.is_(None))
    elif status == 'queued':
        query = query.filter(TestingJob.elapsed_time.is_(None))

    # Filter by embedding/testset.
    if embedding_id:
        query = query.filter(TestingJob.embedding_id == int(embedding_id))
    if testset_id:
        query = query.filter(TestingJob.testset_id == int(testset_id))

    testing_jobs = query.all()

    data = [serialize_testing_job(tj) for tj in testing_jobs]
    meta = {'count': len(data)}

    return jsonify(data=data, meta=meta)
Example 9
def list_embeddings():
    embeddings = db.query(Embedding).all()

    data = [serialize_embedding(emb) for emb in embeddings]
    meta = {'count': len(data)}

    return jsonify(data=data, meta=meta)
Example 10
def fill_entries(data_source):
    """
    Creates the missing entries for a Data Source. Assumes the DataSource
    exists.

    Returns False if the missing ids cannot be retrieved, True otherwise.
    """
    module = sources.SOURCES[data_source.domain]

    # TODO: Offload database queries to an executor.
    existing_ids = db.query(Entry.source_id)\
                     .filter(Entry.data_source == data_source)\
                     .yield_per(10000)
    existing_ids = list(map(lambda r: r[0], existing_ids))

    logger.info("%s existing ids found for '%s'",
                len(existing_ids), data_source.domain)

    try:
        # Use an executor, so we can keep the modules asyncio-agnostic.
        loop = asyncio.get_event_loop()
        future = loop.run_in_executor(
            None, module.get_missing_ids, existing_ids
        )
        missing_ids = yield from future
        missing_ids = list(missing_ids)
    except Exception:
        return False

    logger.info("%s entries for %s need to be created",
                len(missing_ids), data_source.domain)

    # Bypass the SQLAlchemy ORM to avoid loading over 1 million entries in
    # memory when first adding data sources. Also, add them in batches of
    # 100k.
    now = datetime.now()
    step = 100000
    for start in range(0, len(missing_ids), step):
        logger.info("adding batch #%s for %s",
                    int(start / step + 1), data_source.domain)

        end = start + step
        new_entries = []
        for missing_id in missing_ids[start:end]:
            new_entries.append({
                'outcome': 'pending',
                'source_id': missing_id,
                'added': now,
                'number_of_tries': 0,
                'data_source_id': data_source.id
            })
        db.execute(Entry.__table__.insert(), new_entries)
        db.commit()

    return True
Example 11
def delete_embedding(embedding_id):
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)

    embedding.clean_up()
    db.delete(embedding)
    db.commit()

    return '', 204
Example 12
def delete_testset(testset_id):
    testset = db.query(TestSet).get(testset_id)
    if not testset:
        abort(404)

    testset.clean_up()
    db.delete(testset)
    db.commit()

    return '', 204
Example 13
def main():
    tusubtitulo = db.query(DataSource)\
                    .filter_by(domain='tusubtitulo.com')\
                    .first()

    if not tusubtitulo:
        tusubtitulo = DataSource(domain='tusubtitulo.com')
        db.add(tusubtitulo)
        db.commit()

    pool = mp.Pool(15)

    shows = get_show_list()
    all_seasons = pool.map(get_show_seasons, shows)

    season_tuples = []
    for show, show_seasons in zip(shows, all_seasons):
        for show_season in show_seasons:
            season_tuples.append((show, show_season))

    results = pool.map(get_season_subtitles, season_tuples)

    # Flatten the results.
    subtitle_ids = []
    for result in results:
        subtitle_ids.extend(result)

    # Filter out `None`s.
    subtitle_ids = list(filter(lambda s: s, subtitle_ids))

    existing = db.query(Entry.source_id).filter_by(data_source=tusubtitulo)
    existing = set(map(lambda r: r[0].split('@@')[1], existing))

    # We don't want repeated entries for the same episode.
    new_entries = []
    for subtitle_id in subtitle_ids:
        if subtitle_id[1] in existing:
            continue
        new_entries.append(subtitle_id)

    if new_entries:
        save_entries(new_entries, tusubtitulo.id)
Example 14
def delete_testing_job(testing_job_id):
    testing_job = db.query(TestingJob).get(testing_job_id)
    if not testing_job:
        abort(404)

    # If it has any result associated, delete it.
    result = db.query(Result).get((
        testing_job.embedding_id,
        testing_job.testset_id
    ))
    if result:
        db.delete(result)

    if testing_job.task_id:
        celery_app.control.revoke(testing_job.task_id, terminate=True)

    db.delete(testing_job)
    db.commit()

    return '', 204
Example 15
def delete_training_job(training_job_id):
    training_job = db.query(TrainingJob).get(training_job_id)
    if not training_job:
        abort(404)

    # Use the embedding's `clean_up` function, which will take care of
    # everything, including the deletion of the training job.
    embedding = training_job.embedding
    embedding.clean_up()

    return '', 204
Example 16
def update_embedding(embedding_id):
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)

    data = request.get_json(force=True)
    embedding.description = data['description']
    embedding = db.merge(embedding)
    db.commit()

    return jsonify(data=serialize_embedding(embedding, summary=False))
Example 17
def delete_result(embedding_id, testset_id):
    result = db.query(Result).get((embedding_id, testset_id))
    if not result:
        abort(404)

    # Delete its testing_job first.
    db.delete(result.testing_job)
    db.delete(result)

    db.commit()

    return '', 204
Example 18
def list_results():
    embedding_id = request.args.get('embedding', None)
    testset_id = request.args.get('testset', None)

    # Calculate the ranking of the embedding on the testset.
    partition = func.rank().over(partition_by=Result.testset_id,
                                 order_by=Result.accuracy.desc()).label('rank')
    sq = db.query(Result, partition).subquery()
    query = db.query(sq)

    if embedding_id:
        query = query.filter(sq.c.embedding_id == int(embedding_id))
    if testset_id:
        query = query.filter(sq.c.testset_id == int(testset_id))

    results = query.all()

    data = [serialize_result(res) for res in results]
    meta = {'count': len(data)}

    return jsonify(data=data, meta=meta)
Example 19
def update_testset(testset_id):
    testset = db.query(TestSet).get(testset_id)
    if not testset:
        abort(404)

    data = request.get_json(force=True)
    testset.name = data['name']
    testset.description = data['description']
    testset = db.merge(testset)
    db.commit()

    return jsonify(data=serialize_testset(testset, summary=False))
Example 20
def create_training_job():
    embedding_id = request.get_json(force=True)['embedding_id']
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)

    # Check if it has been trained already first.
    training_job = db.query(TrainingJob)\
                     .filter_by(embedding_id=embedding_id).first()
    if training_job:
        message = "The embedding is already trained or being trained."
        return jsonify(error='Bad Request', message=message), 400

    embedding.status = 'TRAINING'
    training_job = TrainingJob(embedding_id=embedding_id)
    db.add(training_job)
    db.commit()

    train.delay(training_job.id)

    return jsonify(data={'training_job_id': training_job.id})
Example 21
def download_testset(testset_id):
    testset = db.query(TestSet).get(testset_id)
    if not testset:
        abort(404)

    def content_streamer():
        with open(testset.full_path, encoding='utf-8') as f:
            for line in f:
                yield line

    response = Response(content_streamer(), mimetype="text/plain")
    content_disposition = "attachment; filename={}".format(testset.file_name)
    response.headers['Content-Disposition'] = content_disposition
    return response
Example 22
def list_training_jobs():
    status = request.args.get('status', None)

    # Failed jobs are included in `queued`, as their `elapsed_time` will be
    # None too.
    query = db.query(TrainingJob)
    if status == 'finished':
        query = query.filter(~TrainingJob.elapsed_time.is_(None))
    elif status == 'queued':
        query = query.filter(TrainingJob.elapsed_time.is_(None))
    training_jobs = query.all()

    data = [serialize_training_job(tj) for tj in training_jobs]
    meta = {'count': len(data)}

    return jsonify(data=data, meta=meta)
Example 23
def main(first_date=FIRST_DATE):
    clarin = db.query(DataSource).filter_by(domain='clarin.com').first()
    if not clarin:
        clarin = DataSource(domain='clarin.com')
        clarin = db.merge(clarin)
        db.commit()

    day_count = (date.today() - first_date).days

    for current_day in range(day_count):
        day = first_date + timedelta(days=current_day)
        print("day: {}".format(day))

        page_number = 1
        day_ids = []
        while True:
            url = BASE_URL.format(str(day).replace('-', ''), page_number)
            response = requests.get(url)

            if not response.text:
                print("error; sleeping...")
                time.sleep(60)
                continue

            # Remove the wrapping parentheses before parsing the JSON payload.
            page = json.loads(response.text[1:-1])
            if not page['news']:
                break

            # Get the IDs for each link on the history page.
            root = html.fromstring(page['news'])
            links = [
                el.get('href')
                for el in root.xpath('//li[@class="item"]/a[@href]')
            ]
            day_ids.extend(
                [re.sub(r'.*_(\d+)\.html', r'\1', l) for l in links])

            if not page.get('moreContents'):
                break
            page_number += 1

        if day_ids:
            save_entries(day_ids, clarin.id)
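
Both this script and the tusubtitulo one (Example 13) delegate to a `save_entries` helper that isn't shown. A plausible sketch for the clarin case, reusing the bulk-insert pattern from `fill_entries` (Example 10), follows; everything beyond the column names, which come from that example, is an assumption (the tusubtitulo script presumably joins its composite ids with '@@' before calling it).

from datetime import datetime


def save_entries(source_ids, data_source_id):
    # Hypothetical implementation: bulk-insert the new ids as pending entries.
    now = datetime.now()
    rows = [
        {
            'outcome': 'pending',
            'source_id': source_id,
            'added': now,
            'number_of_tries': 0,
            'data_source_id': data_source_id,
        }
        for source_id in source_ids
    ]
    db.execute(Entry.__table__.insert(), rows)
    db.commit()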
Example 24
def scrape_entry(entry_id):
    """
    Scrapes the Entry identified by `entry_id` and updates its info, also
    storing the document in Elasticsearch if successful.
    """
    entry = db.query(Entry).get(entry_id)
    module = sources.SOURCES[entry.data_source.domain]

    # Fetch the entry's content.
    # `source_id` may be composite, separating parts with `@@`.
    source_id = entry.source_id.split('@@')
    url = module.DOCUMENT_URL.format(*source_id)

    headers = settings.REQUEST_HEADERS
    source_headers = getattr(module, 'HEADERS', None)
    if source_headers:
        headers = headers.copy()
        headers.update(source_headers)

    try:
        response = yield from get(url, headers=headers)
    except Exception as e:
        # Capture all exceptions, as the `requests` library may raise
        # arbitrary exceptions; not all of them are wrapped.
        logger.info("entry_id = %s failed when requesting url; %s",
                    entry_id, repr(e))
        entry.outcome = 'failure'
        entry.last_tried = datetime.now()
        entry.number_of_tries += 1
        db.merge(entry)
        db.commit()
        return

    # TODO: Improve error handling; code may fail silently.
    try:
        content = module.get_content(response)
    except Exception as e:
        logger.info("entry_id = %s failed when getting content; %s",
                    entry_id, repr(e))
        entry.outcome = 'failure'
        entry.last_tried = datetime.now()
        entry.number_of_tries += 1
        db.merge(entry)
        db.commit()
        return

    if content['outcome'] == 'success':
        min_words = settings.MIN_WORDS_PER_DOCUMENT
        word_count = len(content['content'].split())
        if not content['content'] or word_count < min_words:
            # Parsing was marked as successful, but no (or too little) content
            # returned; mark as unparseable instead.
            content['outcome'] = 'unparseable'

    outcome = content['outcome']
    entry.outcome = outcome
    entry.last_tried = datetime.now()
    entry.number_of_tries += 1
    db.merge(entry)

    if outcome not in ['multiple', 'success', 'more_entries']:
        # Finished already.
        logger.info("entry_id = %s finished with outcome = %s",
                    entry_id, outcome)
        db.commit()
        return

    # The `multiple` case returns a dict like this:
    # {'outcome': 'multiple', 'new_entries': [...], 'documents': [...]}
    if outcome in ['more_entries', 'multiple']:
        # Create new entries, but only those that don't already exist.
        new_ids = content['new_entries']
        if new_ids:
            existing = db.query(Entry.source_id)\
                        .filter(Entry.source_id.in_(new_ids))
            existing = set(map(lambda r: r[0], existing))
            missing = set(new_ids) - existing

            if missing:
                now = datetime.now()
                new_entries = []
                for new_id in missing:
                    new_entries.append({
                        'outcome': 'pending',
                        'source_id': new_id,
                        'added': now,
                        'number_of_tries': 0,
                        'data_source_id': entry.data_source.id,
                    })
                db.execute(Entry.__table__.insert(), new_entries)
        elif outcome == 'more_entries':
            logger.warning(
                "entry_id = %s (outcome = %s) returned no additional entries",
                entry_id, outcome
            )

    # If successful, fetch the metadata of the entry and store in
    # Elasticsearch.
    new_docs = []
    if outcome in ['multiple', 'success']:
        # `get_metadata` must return the same number of documents as
        # `get_content`.
        metadata = module.get_metadata(response)
        if isinstance(metadata, list):
            for md in metadata:
                md['url'] = response.url
        else:
            metadata['url'] = response.url

        if outcome == 'success':
            results = [content]
            metadatas = [metadata]
        else:
            results = content['documents']
            metadatas = metadata

        for content, metadata in zip(results, metadatas):
            min_words = settings.MIN_WORDS_PER_DOCUMENT
            word_count = len(content['content'].split())
            if not content['content'] or word_count < min_words:
                continue

            doc_id, doc = prepare_document(content, metadata, entry)
            new_docs.append((doc_id, doc))

    logger.info("entry_id = %s finished with outcome = %s", entry_id, outcome)
    db.commit()

    # Finally, store document on Elasticsearch too.
    for doc_id, doc in new_docs:
        es.index(
            index=settings.ES_INDEX,
            doc_type=settings.ES_DOCTYPE,
            id=doc_id,
            body=doc
        )
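
The scrapers rely on per-domain modules registered in `sources.SOURCES`. From the way they are called in Examples 1, 10 and 24, such a module needs roughly the interface below; only the attribute and function names come from the calling code, the skeleton itself (including the placeholder URL) is illustrative.

DOCUMENT_URL = 'https://example.com/documents/{}'  # formatted with the split source_id parts
HEADERS = {}  # optional; merged over settings.REQUEST_HEADERS in scrape_entry


def get_missing_ids(existing_ids):
    """Return the source ids not yet present in `existing_ids` (used by fill_entries)."""
    raise NotImplementedError


def get_content(response):
    """Parse the fetched page; return a dict with an 'outcome' key ('success',
    'multiple', 'more_entries', ...) plus 'content', 'documents' and/or
    'new_entries' depending on the outcome."""
    raise NotImplementedError


def get_metadata(response):
    """Return metadata for the parsed document(s); a list for the 'multiple'
    case, matching get_content's documents one to one."""
    raise NotImplementedError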
Example 25
def scrape_entries(data_source_id):
    """
    Coroutine tasked with orchestrating scraping for a certain source.

    Will first check for new entries to be created; then will get the list of
    all pending ones, scrape them and start over. If the process is finished
    too fast, will sleep for a while.
    """
    loop = asyncio.get_event_loop()
    data_source = db.query(DataSource).get(data_source_id)

    while True:
        loop_start = loop.time()

        # Create the new entries.
        logger.info("populating entries for '%s'", data_source.domain)
        success = yield from fill_entries(data_source)

        if not success:
            logger.info("error populating '%s'", data_source.domain)
            yield from asyncio.sleep(settings.LOOP_COOLDOWN)
            continue

        logger.info("all entries created for '%s'", data_source.domain)

        # See which entries need to be scraped, checking their status and
        # retries.
        statuses = ['failure', 'pending']
        entries_left = db.query(Entry.id).filter(
            Entry.outcome.in_(statuses),
            Entry.number_of_tries < settings.MAX_RETRIES,
            Entry.data_source == data_source
        ).yield_per(50000)

        entries_left = list(map(lambda e: e[0], entries_left))
        logger.info(
            "%s entries to scrape found for '%s'",
            len(entries_left), data_source.domain
        )

        # Perform the scraping.
        current_tasks = set()
        while entries_left and len(current_tasks) < data_source.concurrency:
            entry_id = entries_left.pop()
            task = loop.create_task(scrape_entry(entry_id))
            current_tasks.add(task)

            # If the concurrency limit for the data source is reached, wait
            # until at least one of them finishes.
            if len(current_tasks) >= data_source.concurrency:
                done, pending = yield from asyncio.wait(
                    current_tasks,
                    return_when=concurrent.futures.FIRST_COMPLETED
                )
                current_tasks = pending
            else:
                # Below the concurrency limit; still wait out the request cooldown.
                yield from asyncio.sleep(settings.REQUEST_COOLDOWN)

        loop_end = loop.time()
        # If the loop took less than the default cooldown time, sleep the rest
        # of the time to avoid busy-waiting.
        if loop_end - loop_start < settings.LOOP_COOLDOWN:
            wait_time = int(settings.LOOP_COOLDOWN - (loop_end - loop_start))
            logger.info(
                "finished the loop too fast for '%s', sleeping %ss",
                data_source.domain, wait_time
            )
            yield from asyncio.sleep(wait_time)
Example 26
def create_testing_job():
    data = request.get_json(force=True)
    if 'embedding_id' not in data or 'testset_id' not in data:
        abort(400)

    embedding_id = data['embedding_id']
    testset_id = data['testset_id']

    if not (isinstance(embedding_id, int) or isinstance(testset_id, int)):
        return jsonify({
            'message': "At least one ID must be specified",
            'error': 'Bad Request'
        }), 400

    # Build a list of embeddings and testsets to test.
    embeddings = []
    testsets = []
    if isinstance(embedding_id, int):
        embedding = db.query(Embedding).get(embedding_id)
        if not embedding:
            abort(404)
        embeddings.append(embedding)

        if isinstance(testset_id, int):
            testset = db.query(TestSet).get(testset_id)
            testsets.append(testset)
        elif testset_id == 'full':
            testsets.extend(db.query(TestSet).all())
        elif testset_id == 'missing':
            existing = db.query(TestSet.id).join(Result).join(Embedding)\
                         .filter(Embedding.id == embedding_id)
            query = db.query(TestSet).filter(~TestSet.id.in_(existing))
            testsets.extend(query.all())

    elif isinstance(testset_id, int):
        testset = db.query(TestSet).get(testset_id)
        if not testset:
            abort(404)
        testsets.append(testset)

        if isinstance(embedding_id, int):
            embedding = db.query(Embedding).get(embedding_id)
            embeddings.append(embedding)
        elif embedding_id == 'full':
            embeddings.extend(db.query(Embedding).all())
        elif embedding_id == 'missing':
            existing = db.query(Embedding.id).join(Result).join(TestSet)\
                         .filter(TestSet.id == testset_id)
            query = db.query(Embedding).filter(~Embedding.id.in_(existing))
            embeddings.extend(query.all())

    # Make sure there are no Nones (i.e. all the models exist).
    if any([emb is None for emb in embeddings]):
        abort(404)
    if any([ts is None for ts in testsets]):
        abort(404)

    # Make sure the embeddings are trained already.
    embeddings = filter(lambda e: e.status == 'TRAINED', embeddings)

    # For each pair <embedding, testset>, create the necessary TestingJob,
    # deleting it first if it already exists. Also delete associated results.
    jobs = []
    for embedding in embeddings:
        for testset in testsets:
            job = db.query(TestingJob)\
                    .filter_by(embedding=embedding, testset=testset)\
                    .first()
            if job and job.status in ['PENDING', 'PROGRESS']:
                # Only overwrite TestingJobs that have already run. If it's
                # still pending or running right now, we want to keep it.
                continue
            elif job:
                for result in job.results.all():
                    db.delete(result)
                db.delete(job)

            job = TestingJob(testset=testset, embedding=embedding)
            jobs.append(job)
            db.add(job)

    db.commit()

    for job in jobs:
        test.delay(job.id)

    return jsonify(data={'testing_job_id': [job.id for job in jobs]})
Example 27
def view_training_job(training_job_id):
    training_job = db.query(TrainingJob).get(training_job_id)
    if not training_job:
        abort(404)
    return jsonify(data=serialize_training_job(training_job))
Example 28
def view_embedding(embedding_id):
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)
    return jsonify(data=serialize_embedding(embedding, summary=False))
Example 29
def view_testset(testset_id):
    testset = db.query(TestSet).get(testset_id)
    if not testset:
        abort(404)
    return jsonify(data=serialize_testset(testset, summary=False))