Example #1
import datetime
from typing import Iterable, Text

import googlesearch  # provides the get_tbs() and search() helpers used below


def QuickSearchUrls(
    query: Text,
    from_date: datetime.date,
    to_date: datetime.date = None,
    lang: Text = 'en',
    country: Text = 'usa',
    number_of_results: int = 100,
) -> Iterable[Text]:
  """Performs a Google news search using a query.

  Args:
    query: search query.
    from_date: search news from this date.
    to_date: search news until this date, default to today.
    lang: search news in this language.
    country: search news in this country.
    number_of_results: number of results to return.

  Returns:
    A list of searched result URLs.
  """
  if not to_date:
    to_date = datetime.date.today()
  tbs = googlesearch.get_tbs(from_date, to_date)
  return googlesearch.search(
    query, lang=lang, tbs=tbs, country=country, tpe="nws",
    stop=number_of_results)
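A minimal invocation sketch for QuickSearchUrls; the query, date, and result count below are illustrative only:

import datetime

# Hypothetical call: fetch up to 20 English-language news URLs for the query,
# limited to articles published since 1 January 2021 (to_date defaults to today).
for url in QuickSearchUrls(
    query='renewable energy',
    from_date=datetime.date(2021, 1, 1),
    number_of_results=20,
):
  print(url)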
Example #2
def scrape(self, job, number_of_urls=10):
    # job is a dict whose values, in insertion order, are: query, from_date, to_date.
    query, from_date, to_date = job.values()
    urls = []
    for d in pd.date_range(from_date, to_date):
        # Restrict each search to a single day so every URL can be dated.
        tbs = get_tbs(from_date=d, to_date=d)
        results = search(query, tbs=tbs, pause=2, stop=number_of_urls)
        for url in results:
            urls.append({"date": d.date(), "url": url})
    return pd.DataFrame(urls, columns=["date", "url"])
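A brief usage sketch for scrape(); the dict keys and the scraper instance are hypothetical, and the unpacking of job.values() relies on dict insertion order (Python 3.7+):

# The values must be inserted in the order query, from_date, to_date,
# because scrape() unpacks job.values() positionally.
job = {
    "query": "climate summit",
    "from_date": "2021-06-01",
    "to_date": "2021-06-07",
}
# scraper is assumed to be an instance of the (unshown) class that defines scrape().
# df = scraper.scrape(job, number_of_urls=5)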
Example #3
def crawl_denied_news(start, end, context_word='', interval=10,
                      date_format='%Y%m%d', use_api=False, callback=None):
    # Input: start and end as date strings in date_format.
    # Output: DataFrame with columns d_id, d_title, d_link, d_snippet, d_date.
    start_date = datetime.strptime(start, date_format)
    denied_news = pd.DataFrame(columns=['d_id', 'd_title', 'd_link', 'd_snippet', 'd_date'])
    while start_date <= datetime.strptime(end, date_format):
        end_interval = start_date + timedelta(days=interval)
        print('From {}, Until {}'.format(start_date.strftime('%Y-%m-%d'), end_interval.strftime('%Y-%m-%d')))
        if use_api:
            sort = 'date:r:{}:{}'.format(start_date.strftime(date_format), end_interval.strftime(date_format))
            results = mine_links_api(CRAWLER_QUERY_WORDS.format(context_word), num=10, sort=sort, gl='ir',
                                     exactTerms=EXACT_TERM)
            if 'items' in results.keys():
                for res in results['items']:
                    metatags = res['pagemap']['metatags'][0]
                    if 'date' in metatags:
                        # Use the page's own metadata date, reformatted to %Y%m%d.
                        item_date = ''.join(metatags['date'].split('-'))[:8]
                    else:
                        # Fall back to the midpoint of the current search interval.
                        item_date = (start_date + timedelta(days=interval / 2)).strftime('%Y%m%d')
                    item = {
                        'd_id': uuid.uuid1().hex,
                        'd_title': res['title'] if res['title'] else '-',
                        'd_link': res['link'],
                        'd_snippet': res['snippet'] if res['snippet'] else '-',
                        'd_date': item_date,
                    }
                    if callback:
                        callback(item)
                    denied_news = denied_news.append(item, ignore_index=True)
        else:
            results = mine_google_links(CRAWLER_QUERY_WORDS.format(context_word), num=30, stop=30,
                                        pause=random.randint(2, 8),
                                        tbs=google.get_tbs(start_date, end_interval + timedelta(days=interval)))
            for (title, link, snippet, date) in results:
                if any(dword in title for dword in DENIAL_WORDS):
                    title = ' '.join(title.split())
                    snippet = ' '.join(snippet.split())
                    if date != '':
                        date = ' '.join(date.split())
                        date = cleaner.date_persian2english(date, delimiter=' ', persian_month=True)
                    item = {
                        'd_id': uuid.uuid1().hex,
                        'd_title': '-' if not title else title,
                        'd_link': link,
                        'd_snippet': '-' if not snippet else snippet,
                        'd_date': date}
                    if callback:
                        callback(item)
                    denied_news = denied_news.append(item, ignore_index=True)
        start_date = end_interval + timedelta(hours=12)
    denied_news.to_excel("denied_news.xlsx", index_label=False, index=False)
    return denied_news
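A hedged calling sketch for crawl_denied_news; the function depends on module-level constants and helpers (CRAWLER_QUERY_WORDS, EXACT_TERM, DENIAL_WORDS, mine_links_api, mine_google_links, cleaner) that are not shown above, so only the argument shapes are illustrated here:

def log_item(item):
    # Hypothetical callback: report each record as soon as it is scraped.
    print(item['d_date'], item['d_title'])

# Dates are plain strings in the default '%Y%m%d' format.
# df = crawl_denied_news('20210101', '20210331', context_word='flood',
#                        interval=10, use_api=False, callback=log_item)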
Example #4
def download_articles(search_term, n_articles, start, end=None):
    start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
    end_date = start_date if end is None else datetime.datetime.strptime(
        end, "%Y-%m-%d")
    tbs = get_tbs(start_date, end_date)

    urls = find_urls(search_term, tbs, 10)
    valid_articles = []
    while len(valid_articles) < n_articles and len(urls) > 0:
        # Request only as many articles as are still needed to reach n_articles.
        articles_left = n_articles - len(valid_articles)
        articles = NewsPlease.from_urls(urls[:articles_left])
        empty, articles = detect_empty_articles(articles)
        for new in articles:
            valid_articles.append(articles.get(new))
        urls = urls[articles_left:]
    # print("valid_articles", len(valid_articles))
    return {
        "search_term": search_term,
        "start": start,
        "end": start if end is None else end,
        "articles": valid_articles
    }
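A small usage sketch, assuming the find_urls and detect_empty_articles helpers referenced above are available alongside news-please:

# Hypothetical call: up to 3 parseable articles about the search term,
# restricted to a single day because end defaults to the start date.
result = download_articles("heat wave", n_articles=3, start="2021-07-15")
print(result["search_term"], len(result["articles"]))
for article in result["articles"]:
    print(article.title)  # news-please article objects expose a title attribute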
Example #5
def createTBS():
    # Build a Google date-range (tbs) filter from today to a user-chosen end date
    # and store it in the module-level settings.
    startDate = datetime.datetime.now()
    endPeriod = inputDate("Enter end of period (press Enter for the current day):\n")
    tbs = get_tbs(startDate, endPeriod)
    settings.tbs = tbs
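For context, a standalone sketch of the tbs string that get_tbs builds (the cdr/cd_min/cd_max date-range filter Google expects), assuming the same googlesearch module used in the examples above:

import datetime
from googlesearch import get_tbs  # same helper the examples above rely on

start = datetime.datetime(2021, 3, 1)
end = datetime.datetime(2021, 3, 31)
print(get_tbs(start, end))  # e.g. 'cdr:1,cd_min:03/01/2021,cd_max:03/31/2021'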