Example #1
def __init__(self, url, include_text='', full_text=False, connections=None):
    self.url = url
    self.full_text = full_text
    self.include_text = include_text
    self.source = detect_news_source(url)
    self.connections = connections
    self.count = 0
Example #2
def _data_produce(self, items):
    # Attach the configured keyword to every item.
    items = data_inserter(self.include_text, "keyword", items)
    if __debug__ and len(items) == 0 and self.source == 'any':
        print("please debug this source: {} | {}".format(
            self.url, detect_news_source(self.url)))
    # Derive a "hash" field from the title, published and source fields.
    items = data_hasher("hash", ["title", "published", "source"], items)
    return items
Example #3
async def archive_feed_by_filter(url,
                                 include_text,
                                 ap=None,
                                 osp=None,
                                 connections=None):
    from newsfeed.filter import NewsFeedFilter
    if not ap:
        from db.providers import ArchiveProvider
        ap = ArchiveProvider()
    if not osp:
        from db.providers import ObserverStatProvider
        osp = ObserverStatProvider()

    nff = NewsFeedFilter(url,
                         include_text,
                         full_text=True,
                         connections=connections)
    items = await nff.as_output()
    count = nff.feedCount()
    total = len(items)
    # filter out items whose hash already exists in the archive
    items = await ap.as_find_distinct_items_by("hash", items)
    ids = list(await ap.as_save_all(items))
    acceptances = len(ids)
    rejects = total - acceptances

    await osp.as_save({
        'count': count,
        'total': total,
        'acceptances': acceptances,
        'rejects': rejects
    })

    return dict_cleaner(None, {
        'source': detect_news_source(url),
        'url': url,
        'include': include_text,
        'count': count,
        'total': total,
        'acceptances': acceptances,
        'rejects': rejects,
        'items': ids,
        'info': '(%d/%d)' % (acceptances, total),
        'infomation': '(%d/%d) %d successfully created, %d duplicates found.' %
                      (acceptances, total, acceptances, rejects)
    })
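For context, here is a minimal usage sketch of archive_feed_by_filter driven from an asyncio entry point. The import path and the feed URL are illustrative assumptions, not taken from the project.

import asyncio

# Hypothetical import path; adjust to wherever archive_feed_by_filter lives.
from newsfeed.archive import archive_feed_by_filter


async def main():
    # Archive items from the feed whose text matches the keyword "python".
    result = await archive_feed_by_filter("https://example.com/rss",
                                          include_text="python")
    print(result['info'])  # e.g. "(3/10)": 3 items accepted out of 10 fetched


if __name__ == '__main__':
    asyncio.run(main())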
Example #4
def __init__(self, url, html, source=None):
    self.data = {}
    self.soup = BeautifulSoup(html, "html.parser")
    self.url = normalize_link(url)
    self.html = html
    if not source:
        self.source = detect_news_source(self.url)
    else:
        self.source = source
    self.context = load_context(self.source)
    self.trimtext = load_trimtext(self.source)
    self.dummy = {'pass': False, 'link': '', 'source': 'any'}
Example #5
def fetch_news_all(urls, encoding='utf-8', timeout=60, limit=5, remedy=0, source=None):
    import os
    import threading
    from concurrent.futures import ThreadPoolExecutor

    import requests
    from requests_futures.sessions import FuturesSession

    sem = threading.Semaphore(limit)
    collect = []
    responses = []
    failed_urls = []

    with FuturesSession(session=requests.Session(),
                        executor=ThreadPoolExecutor(max_workers=os.cpu_count())) as session:
        connection = 0
        futures = ((url, session.get(url, timeout=timeout)) for url in urls)
        for url, future in futures:
            connection = connection + 1
            # Detect the source from the URL; YouTube links fall back to the
            # caller-supplied source (or the one detected for an earlier URL).
            url_source = detect_news_source(url)
            if not source:
                source = url_source
            if url_source == 'youtube':
                target_source = source
            else:
                target_source = url_source

            if __debug__:
                if target_source == 'any':
                    print(f"[*skip*:{connection}] ({url})")
                else:
                    print(f"[{target_source}:{connection}] ({url})")

            if remedy:
                log.error(f"[{__name__}] Retry: {url}")

            try:
                with sem:
                    if target_source != 'any':
                        responses.append((target_source, future.result()))
                    else:
                        responses.append((target_source, None))
            except requests.exceptions.RequestException as e:
                failed_urls.append(url)
                log.error(f"[{__name__}] Failure when trying to fetch {url}")
                log.info(e, exc_info=True)
                continue

        for (target_source, resp) in responses:
            if resp:
                resp.encoding = encoding
                html = clean_html(resp.text)
                news = NewsDataProcessor(resp.url, html, target_source)
                output = news.output()
            else:
                output = {}
            collect.append(output)

    # Retry the failed URLs, at most `limit` additional rounds.
    if failed_urls and remedy < limit:
        return collect + fetch_news_all(failed_urls, encoding, timeout, limit,
                                        remedy + 1, source)
    else:
        return collect
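A short usage sketch for fetch_news_all. The URLs below are placeholders, and the item keys ('title', 'source') are assumed from the fields hashed in Example #2 rather than a documented contract.

# Placeholder URLs; fetch_news_all is assumed to be importable from this module.
urls = [
    "https://example.com/news/article-1",
    "https://example.com/news/article-2",
]

# Fetch all pages concurrently, decode as UTF-8, retry failures up to `limit` rounds.
results = fetch_news_all(urls, encoding="utf-8", timeout=30, limit=3)
for item in results:
    if item:  # an empty dict marks a URL whose source could not be detected
        print(item.get("title"), item.get("source"))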
Example #6
def test_detect_news_source(self):
    for source, urls in self.urls.items():
        for url in urls:
            self.assertEqual(detect_news_source(url), source)