def recrawl() -> None:
    """
    Recrawl outdated sources. Outdated is defined as:

    * sources that weren't crawled yet
    * sources successfully crawled over 2 hours ago
    * sources whose crawling task was started over 8 hours ago
    * sources whose last crawl failed
    """
    _LOGGER.info("Starting recrawl...")
    outdated_sql = text('''
        SELECT *
        FROM (
            SELECT DISTINCT ON (src.id) src.id, cl.state, cl.timestamp
            FROM activity.source AS src
            JOIN activity.source_feature AS src_feature ON src_feature.source_id = src.id
            LEFT OUTER JOIN activity.crawllog AS cl ON cl.source_id = src.id
            WHERE src_feature.feature = 'auto_crawl'
            ORDER BY src.id, cl.timestamp DESC
        ) AS most_recent_per_source
        WHERE timestamp IS NULL
            OR (state = 'START' AND timestamp < now() - '8 hours'::INTERVAL)  -- timeout
            OR (state = 'DONE' AND timestamp < now() - '2 hours'::INTERVAL)  -- normal schedule
            OR state = 'FAIL'
        ''')
    with get_session() as session:
        outdated = list(session.execute(outdated_sql))
    _LOGGER.info("Recrawling outdated sources: %s", str(outdated))
    crawl_group = group(
        crawl.s(source_id)
        for (source_id, last_state, last_timestamp) in outdated)
    crawl_group()
    _LOGGER.info("... Done recrawl")

def scrape_meta_for_url(url: str) -> Tuple[Optional[int], Dict[str, Optional[str]]]:
    """
    Scrape all relevant meta data (Facebook Open Graph, Twitter, etc.) for the url.

    :param url: url to scrape from
    """
    html_doc = fetch_url_content(url)
    soup = BeautifulSoup(html_doc, 'html.parser')
    tags: Dict[str, Optional[str]] = dict(
        orig_source=find_or_none(soup, 'link', 'href', rel='original-source'),
        description=find_or_none(soup, 'meta', 'content', name='description'),
        canonical=find_or_none(soup, 'link', 'href', rel='canonical'))
    twitter_tags = dict(
        (tag, find_or_none(soup, 'meta', 'content', name='twitter:%s' % tag))
        for tag in _TWITTER_TAGS)
    tags.update(twitter_tags)
    og_tags = dict(
        (tag, find_or_none(soup, 'meta', 'content', property='og:%s' % tag))
        for tag in _OG_TAGS)
    tags.update(og_tags)
    with get_session() as session:
        result = insert_or_ignore(session, Shortener(**tags))
        session.commit()
        insert_id = result.inserted_primary_key[0] if result.inserted_primary_key else None
    return insert_id, tags

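# Illustrative usage (a sketch; the URL is hypothetical). The returned dict holds
# orig_source, description and canonical plus the twitter:* and og:* tags listed in
# _TWITTER_TAGS / _OG_TAGS; insert_id may be None when the insert was ignored:
#
#     insert_id, tags = scrape_meta_for_url('https://example.com/article')
#     print(insert_id, tags.get('description'), tags.get('canonical'))
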
def _select_retrain_model_params() -> Iterable[_RetrainModels]:
    """
    Select all models in need of retraining.

    :return: a list of model params to retrain
    """
    with get_session() as session:
        # materialize the rows before the session closes; a lazy generator
        # would otherwise be consumed after the session context has exited
        return [_RetrainModels(**row) for row in session.execute(_RETRAIN_MODELS_SQL)]

def _train_model(user_id: int,
                 tagset_id: int,
                 source_ids: Iterable[int],
                 n_estimators: int,
                 _params: Optional[Dict[str, Any]],
                 _score: float,
                 progress: Optional[ProgressCallback] = None) -> str:
    """
    :param user_id: the creating user id
    :param tagset_id: the tagset id
    :param source_ids: the source ids
    :param n_estimators: how many estimators to use for the estimator bag
    :param progress: an optional progress callback to update state
    :param _params: skip the parameter search and use the provided params
    :param _score: provide a score for fast training mode
    :return: stringified model id
    """
    assert tagset_id and source_ids
    if not 0 < n_estimators <= 1000:
        raise ValueError('invalid estimator count: %d' % n_estimators)

    session: Session
    with get_session() as session:
        tagset = session.query(TagSet).get(tagset_id)
        sources = session.query(Source).filter(Source.id.in_(tuple(source_ids))).all()
        user = session.query(User).get(user_id)
        factory = LensTrainer(user, tagset, sources, progress=progress)
        lens = factory.train(n_estimators=n_estimators, _params=_params, _score=_score)
        return str(factory.persist(lens, session))

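# Illustrative call (a sketch; all ids and the param dict are hypothetical values).
# Passing _params skips the parameter search and _score supplies the score for this
# fast-training mode; pass _params=None to run the search instead:
#
#     model_id = _train_model(user_id=1, tagset_id=7, source_ids=[3, 4],
#                             n_estimators=100,
#                             _params={'max_depth': 3}, _score=0.87)
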
def _fetch_samples(self, num_truths: int, num_samples: int) -> Iterable[Tuple[int, Sample]]:
    _LOGGER.debug("fetching sampleset with %d truths and %d samples...", num_truths, num_samples)
    with get_session() as session:
        query = session.execute(
            _RANDOM_SAMPLE_SQL,
            dict(tagset_id=self._tagset.id,
                 num_samples=num_samples,
                 num_truths=num_truths,
                 # todo: find a better way to filter for translations
                 langs=tuple(['en', 'de']),
                 sources=tuple(source.id for source in self._sources)))
        for _id, tag_id, _language, message, created_time, fingerprint, translation in query:
            yield tag_id or -1, Sample(translation or message, fingerprint, created_time)
    _LOGGER.debug("... done fetching sampleset")

def process_item(self, item: CrawlItem, spider: GenericMixin) -> Item:
    """
    Process a single `CrawlItem` if there is no api_key present in the spider.

    :param item: the crawled item
    :param spider: the spider
    :return: the item for further processing
    """
    if not isinstance(item, CrawlItem) or spider.api_key is not None:
        return item

    with get_session() as session:
        self.insert_item_db(session, item)
        session.commit()
    return item

def process_item(self, bulk: CrawlBulk, spider: GenericMixin) -> Item:
    """
    Process a `CrawlBulk` if there is no api_key present in the spider.

    :param bulk: the bulk of crawled items
    :param spider: the spider
    :return: the item for further processing
    """
    if not isinstance(bulk, CrawlBulk) or spider.api_key is not None:
        return bulk

    with get_session() as session:
        for item in bulk['bulk']:
            DBPipeline.insert_item_db(session, item)
        session.commit()
    return bulk

def best_model_for_source_by_id(tagset_id: int, source_id: int) -> Optional[uuid.UUID]:
    """
    Select the best model for the `TagSet` / `Source` combination.

    Falls back to the best model for the `TagSet` alone if no model exists for
    the given source.

    :param tagset_id: id of the `TagSet`
    :param source_id: id of the `Source`
    :return: id of the best model, or None if no model exists
    """
    with get_session() as session:
        source = session.query(Source).get(source_id)
        assert source
        model = (source.models
                 .filter_by(tagset_id=tagset_id)
                 .order_by(Model.score, Model.trained_ts)
                 .first())
        if not model:
            # no model trained for this specific source; fall back to any
            # model for the tagset
            model = (session.query(Model)
                     .filter_by(tagset_id=tagset_id)
                     .order_by(Model.score, Model.trained_ts)
                     .first())
        return model.id if model else None

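# Illustrative lookup (ids are hypothetical). If no model was trained for the
# source, the best model for the tagset alone is used; None means no model exists:
#
#     model_id = best_model_for_source_by_id(tagset_id=7, source_id=3)
#     if model_id is not None:
#         lens = Lens.load_from_id(model_id)
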
def maintenance() -> None:
    """
    Run maintenance job for brain: clean up db entries/model ids no longer in use.
    """
    _LOGGER.info("Beginning Brain maintenance...")
    (_, _, file_id_list) = next(os.walk(MODEL_FILE_ROOT))
    file_ids = set(file_id_list)

    session: Session
    with get_session() as session:
        db_ids = set(str(model_id) for (model_id,) in session.query(Model.id))

    missing_ids = db_ids.difference(file_ids)
    _LOGGER.warning('The following model ids are missing the model file: %s', missing_ids)

    delete_ids = file_ids.difference(db_ids)
    _LOGGER.warning('The following model ids are orphaned and will be deleted: %s', delete_ids)
    for delete_id in delete_ids:
        os.remove(model_file_path(delete_id))
    _LOGGER.info("... Done Brain maintenance")

def crawl(source_id: int) -> None:
    """
    Crawl the provided `Source`.

    :param source_id: id of the `Source`
    """
    _LOGGER.info("Crawling source: %d...", source_id)
    with get_session() as session:
        session.add(CrawlLog(source_id=source_id, state=CrawlState.START))
        session.commit()
        try:
            process = facebook_crawler_process(source_id, -60)
            process.start()
            session.add(CrawlLog(source_id=source_id, state=CrawlState.DONE))
            session.commit()
        except Exception:  # pylint: disable=broad-except
            session.add(CrawlLog(source_id=source_id, state=CrawlState.FAIL))
            session.commit()
    _LOGGER.info("... Done crawling source: %d", source_id)

def __init__(self, source_id: int, since: Optional[TSince] = None, api_key: Optional[str] = None) -> None:
    """
    :param source_id: source id for this spider
    :param since: since parameter for this spider
    :param api_key: api key for this spider. will be deprecated
    """
    assert source_id is not None
    self._api_key = api_key
    if self._api_key is None:
        with get_session() as session:
            source = session.query(Source).get(source_id)
            self._source = dict(id=source.id, type=source.type, uri=source.uri, slug=source.slug)
    else:
        request_headers = headers(self._api_key)
        self._source = requests.get('%s/sources/%s' % (BASE_PATH, source_id),
                                    headers=request_headers).json()
    assert self._source

    if isinstance(since, datetime):
        self._since = since
    elif isinstance(since, str):
        self._since = dateutil.parser.parse(since)
    elif isinstance(since, int) or since is None:
        if since is None:
            since = -14  # default: 14 days back
        try:
            since_int = int(since)
            if since_int < 0:
                # negative values are a relative offset in days
                self._since = datetime.utcnow() - timedelta(days=-since_int)
            else:
                # non-negative values are a unix timestamp
                self._since = datetime.utcfromtimestamp(since_int)
        except ValueError:
            self._since = datetime.utcnow()
    else:
        raise ValueError('Provided since parameter not acceptable')

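# Illustrative ways to pass `since` (the enclosing spider class is assumed to be
# called GenericSpider here, which is hypothetical; source_id is a made-up value):
#
#     GenericSpider(source_id=3)                              # default: 14 days ago
#     GenericSpider(source_id=3, since=-7)                    # negative int: days in the past
#     GenericSpider(source_id=3, since=1514764800)            # non-negative int: unix timestamp
#     GenericSpider(source_id=3, since='2018-01-01')          # string: parsed by dateutil
#     GenericSpider(source_id=3, since=datetime(2018, 1, 1))  # datetime: used as-is
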
def load_from_id(cls, model_id: uuid.UUID) -> 'Lens':
    """
    Load a model from the specified id.

    :param model_id: id of the model
    :return: a `Lens` instance loaded from the system
    :raises RuntimeError: if the model could not be loaded
    """
    with get_session() as session:
        model: Model = session.query(Model).get(model_id)
        estimator_bag: TEstimatorBag = []
        try:
            with open(model_file_path(model_id), 'rb') as model_file:
                estimator_bag = pickle.load(model_file)
        except FileNotFoundError:
            _LOGGER.exception('Could not load model %s', str(model_id))
        if not model or not estimator_bag:
            raise RuntimeError('Could not load model %s' % model_id)
        return Lens(model, estimator_bag)
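
# Illustrative usage (a sketch; model_id would typically come from
# best_model_for_source_by_id). A missing db row or model file raises RuntimeError:
#
#     try:
#         lens = Lens.load_from_id(model_id)
#     except RuntimeError:
#         _LOGGER.warning('no usable model for %s', model_id)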