Example 1
def recrawl() -> None:
    """
    Recrawl outdated sources.
    Outdated is defined as:
        * sources that weren't crawled yet
        * sources successfully crawled over 2 hours ago
        * sources whose crawling task was started over 8 hours ago
        * sources whose last crawl failed
    """
    _LOGGER.info("Starting recrawl...")
    outdated_sql = text('''
        SELECT * FROM (
            SELECT DISTINCT ON(src.id)
                src.id, cl.state, cl.timestamp
            FROM activity.source AS src
            JOIN activity.source_feature as src_feature ON src_feature.source_id = src.id
            LEFT OUTER JOIN activity.crawllog AS cl ON cl.source_id = src.id
            WHERE src_feature.feature = 'auto_crawl'
            ORDER BY src.id, cl.timestamp DESC
        ) AS most_recent_per_source
        WHERE timestamp IS NULL
            OR (state = 'START' AND timestamp < now() - '8 hours'::INTERVAL) -- timeout
            OR (state = 'DONE' AND timestamp < now() - '2 hours'::INTERVAL) -- normal schedule
            OR state = 'FAIL' ''')
    with get_session() as session:
        outdated = list(session.execute(outdated_sql))
        _LOGGER.info("Recrawling outdated sources: %s", str(outdated))
        crawl_group = group(
            crawl.s(source_id)
            for (source_id, last_state, last_timestamp) in outdated)
        crawl_group()
        _LOGGER.info("... Done recrawl")
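
Every example in this listing obtains its database session from `get_session()`, which is not itself shown. A minimal sketch of what such a helper could look like, assuming a standard SQLAlchemy engine and sessionmaker (the connection URL and module-level names here are placeholders, not the project's actual configuration):

from contextlib import contextmanager
from typing import Iterator

from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker

# Hypothetical engine/sessionmaker wiring; the real project configures its own.
_ENGINE = create_engine('postgresql://localhost/activity')
_SESSIONMAKER = sessionmaker(bind=_ENGINE)


@contextmanager
def get_session() -> Iterator[Session]:
    """Yield a session, roll back on error, and always close it."""
    session = _SESSIONMAKER()
    try:
        yield session
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()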
Example 2
def scrape_meta_for_url(url: str) -> Tuple[int, Dict[str, Optional[str]]]:
    """
    scrape all relevant meta data (facebook opengraph, twitter, etc.) for the url
    :param url: url to scrape from
    """
    html_doc = fetch_url_content(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    tags: Dict[str, Optional[str]] = dict(
        orig_source=find_or_none(soup, 'link', 'href', rel='original-source'),
        description=find_or_none(soup, 'meta', 'content', name='description'),
        canonical=find_or_none(soup, 'link', 'href', rel='canonical'))

    twitter_tags = {
        tag: find_or_none(soup, 'meta', 'content', name='twitter:%s' % tag)
        for tag in _TWITTER_TAGS}
    tags.update(twitter_tags)

    og_tags = {
        tag: find_or_none(soup, 'meta', 'content', property='og:%s' % tag)
        for tag in _OG_TAGS}
    tags.update(og_tags)
    with get_session() as session:
        result = insert_or_ignore(session, Shortener(**tags))
        session.commit()
        insert_id = (result.inserted_primary_key[0]
                     if result.inserted_primary_key else None)
    return insert_id, tags
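
`find_or_none` is a helper that does not appear in these snippets. Judging only from how it is called above, a plausible sketch (an assumption, not the project's actual implementation) is:

from typing import Optional

from bs4 import BeautifulSoup


def find_or_none(soup: BeautifulSoup, tag_name: str, attribute: str, **attrs: str) -> Optional[str]:
    """Return `attribute` of the first tag matching `tag_name` and `attrs`, or None."""
    # Attributes are passed through `attrs=` so that names such as `name` do not
    # clash with BeautifulSoup's positional `name` parameter.
    tag = soup.find(tag_name, attrs=attrs)
    return tag.get(attribute) if tag else None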
Example 3
def _select_retrain_model_params() -> Iterable[_RetrainModels]:
    """
    Select all models in need for retraining.
    :return: a list of model params to retrain
    """
    with get_session() as session:
        # Materialize while the session is still open so rows are not fetched lazily
        # after it has been closed.
        return [_RetrainModels(**row) for row in session.execute(_RETRAIN_MODELS_SQL)]
Example 4
def _train_model(user_id: int,
                 tagset_id: int,
                 source_ids: Iterable[int],
                 n_estimators: int,
                 _params: Optional[Dict[str, Any]],
                 _score: float,
                 progress: Optional[ProgressCallback] = None) -> str:
    """
    :param user_id: the creating user id
    :param tagset_id: the tagset id
    :param source_ids: the source ids
    :param n_estimators: how many estimators to use for the estimator bag
    :param progress: an optional progress callback to update state
    :param _params: don't search for params and use provided
    :param _score: provide a score for fast training mode
    :return: stringified model id
    """
    assert tagset_id and source_ids
    if not 0 < n_estimators <= 1000:
        raise ValueError('invalid estimator count: %d' % n_estimators)
    session: Session
    with get_session() as session:
        tagset = session.query(TagSet).get(tagset_id)
        sources = session.query(Source).filter(Source.id.in_(tuple(source_ids))).all()
        user = session.query(User).get(user_id)
        factory = LensTrainer(user, tagset, sources, progress=progress)
        lens = factory.train(n_estimators=n_estimators, _params=_params, _score=_score)
        return str(factory.persist(lens, session))
Example 5
 def _fetch_samples(self, num_truths: int, num_samples: int) -> Iterable[Tuple[int, Sample]]:
     _LOGGER.debug("fetching sampleset with %d truths and %d samples...", num_truths, num_samples)
     with get_session() as session:
         query = session.execute(_RANDOM_SAMPLE_SQL,
                                 dict(tagset_id=self._tagset.id,
                                      num_samples=num_samples,
                                      num_truths=num_truths,
                                      langs=tuple(['en', 'de']),  # todo find better way to filter for translations
                                      sources=tuple(source.id for source in self._sources)))
         for _id, tag_id, _language, message, created_time, fingerprint, translation in query:
             yield tag_id or -1, Sample(translation or message, fingerprint, created_time)
     _LOGGER.debug("... done fetching sampleset")
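
`Sample` is also not defined here; from the positional call `Sample(translation or message, fingerprint, created_time)` it could be a simple NamedTuple along these lines (field names and types are hypothetical):

from datetime import datetime
from typing import NamedTuple


class Sample(NamedTuple):
    """One training sample: the (possibly translated) text plus its metadata."""
    text: str
    fingerprint: str
    created_time: datetime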
Example 6
 def process_item(self, item: CrawlItem, spider: GenericMixin) -> Item:
     """
     Process a single `CrawlItem` if there is no api_key present in the spider
     :param item: the crawled item
     :param spider: the spider
     :return: the item for further processing
     """
     if not isinstance(item, CrawlItem) or spider.api_key is not None:
         return item
     with get_session() as session:
         self.insert_item_db(session, item)
         session.commit()
     return item
Example 7
 def process_item(self, bulk: CrawlBulk, spider: GenericMixin) -> Item:
     """
     Process a `CrawlBulk` if there is no api_key present in the spider
     :param bulk: the bulk of crawled items
     :param spider: the spider
     :return: the item for further processing
     """
     if not isinstance(bulk, CrawlBulk) or spider.api_key is not None:
         return bulk
     with get_session() as session:
         for item in bulk['bulk']:
             DBPipeline.insert_item_db(session, item)
          session.commit()
      return bulk
Example 8
def best_model_for_source_by_id(tagset_id: int, source_id: int) -> Optional[uuid.UUID]:
    """
    Select the best model for the `TagSet` / `Source` combination
    :param tagset_id: id of the `TagSet`
    :param source_id: id of the `Source`
    :return: id of best model
    """
    with get_session() as session:
        source = session.query(Source).get(source_id)
        assert source
        model = source.models.filter_by(tagset_id=tagset_id).order_by(Model.score, Model.trained_ts).first()
        if not model:
            model = session.query(Model).filter_by(tagset_id=tagset_id).order_by(Model.score, Model.trained_ts).first()
        return model.id if model else None
Example 9
def maintenance() -> None:
    """
    Run maintenance job for brain: clean up db entries/model ids no longer in use
    """
    _LOGGER.info("Beginning Brain maintenance...")
    (_, _, file_id_list) = next(os.walk(MODEL_FILE_ROOT))
    file_ids = set(file_id_list)
    session: Session
    with get_session() as session:
        db_ids = {str(model_id) for (model_id,) in session.query(Model.id)}
        missing_ids = db_ids.difference(file_ids)
        _LOGGER.warning('The following model ids are missing the model file: %s', missing_ids)
        delete_ids = file_ids.difference(db_ids)
        _LOGGER.warning('The following model ids are orphaned and will be deleted: %s', delete_ids)
        for delete_id in delete_ids:
            os.remove(model_file_path(delete_id))
    _LOGGER.info("... Done Brain maintenance")
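
Both this maintenance job and `load_from_id` in Example 12 rely on `MODEL_FILE_ROOT` and `model_file_path`, neither of which is shown. Because the file names under `MODEL_FILE_ROOT` are compared directly against stringified model ids, a plausible sketch (the path is a placeholder) is:

import os

# Hypothetical storage location for the pickled estimator bags.
MODEL_FILE_ROOT = '/var/lib/brain/models'


def model_file_path(model_id) -> str:
    """Absolute path of the serialized model file; accepts a str or uuid.UUID id."""
    return os.path.join(MODEL_FILE_ROOT, str(model_id))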
Example 10
def crawl(source_id: int) -> None:
    """
    Crawl the provided `Source`
    :param source_id: id of the `Source`
    """
    _LOGGER.info("Crawling source: %d...", source_id)
    with get_session() as session:
        session.add(CrawlLog(source_id=source_id, state=CrawlState.START))
        session.commit()
        try:
            process = facebook_crawler_process(source_id, -60)
            process.start()
            session.add(CrawlLog(source_id=source_id, state=CrawlState.DONE))
            session.commit()
        except Exception:  # pylint: disable=broad-except
            session.add(CrawlLog(source_id=source_id, state=CrawlState.FAIL))
            session.commit()
    _LOGGER.info("... Done crawling source: %d", source_id)
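
Example 1 schedules this function through `crawl.s(source_id)` inside a Celery `group`, so `crawl` must be registered as a Celery task; the decorator is simply not visible in the snippet. A minimal sketch of that wiring, assuming a Celery application object (app name and broker URL are placeholders):

from celery import Celery

app = Celery('worker', broker='redis://localhost:6379/0')


@app.task
def crawl(source_id: int) -> None:
    ...  # body as shown in Example 10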
Example 11
 def __init__(self,
              source_id: int,
              since: Optional[TSince] = None,
              api_key: Optional[str] = None) -> None:
     """
     :param source_id: source id for this spider
     :param since: since parameter for this spider
      :param api_key: API key for this spider; will be deprecated
     """
     assert source_id is not None
     self._api_key = api_key
     if self._api_key is None:
         with get_session() as session:
             source = session.query(Source).get(source_id)
             self._source = dict(id=source.id,
                                 type=source.type,
                                 uri=source.uri,
                                 slug=source.slug)
     else:
         request_headers = headers(self._api_key)
         self._source = requests.get('%s/sources/%s' %
                                     (BASE_PATH, source_id),
                                     headers=request_headers).json()
     assert self._source
     if isinstance(since, datetime):
         self._since = since
     elif isinstance(since, str):
         self._since = dateutil.parser.parse(since)
     elif isinstance(since, int) or since is None:
         if since is None:
             since = -14  # default -14 days
         try:
             since_int = int(since)
             if since_int < 0:
                 self._since = datetime.utcnow() - timedelta(
                     days=-since_int)
             else:
                 self._since = datetime.utcfromtimestamp(since_int)
         except ValueError:
             self._since = datetime.utcnow()
     else:
         raise ValueError('Provided since parameter not acceptable')
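
`TSince` is a type alias that is not defined in the snippet. The branches above accept a datetime, a string, or an integer (with None handled separately), so it is presumably something along the lines of:

from datetime import datetime
from typing import Union

# Assumed alias covering the accepted `since` forms; None is allowed via Optional[TSince].
TSince = Union[datetime, str, int]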
Example 12
    def load_from_id(cls, model_id: uuid.UUID) -> 'Lens':
        """
        Load a model from the specified id.
        :param model_id: id of the model
        :return: a `Lens` instance loaded from the system
        :raises RuntimeError: if the model could not be loaded
        """
        with get_session() as session:
            model: Model = session.query(Model).get(model_id)

        estimator_bag: TEstimatorBag = []
        try:
            with open(model_file_path(model_id), 'rb') as model_file:
                estimator_bag = pickle.load(model_file)
        except FileNotFoundError:
            _LOGGER.exception('Could not load model %s', str(model_id))

        if not model or not estimator_bag:
            raise RuntimeError('Could not load model %s' % model_id)

        return Lens(model, estimator_bag)