def filterDateRange(articleURLs, DATE_FROM, DATE_END):
    """Trim a newest-first list of article URLs to those published
    between DATE_FROM and DATE_END (inclusive)."""
    beginIdx = 0
    endIdx = len(articleURLs)
    # Throttle before a long run of requests (delay is a project helper).
    if endIdx > 30:
        delay(600)
    # Walk forward to skip everything published after DATE_END.
    for i in range(len(articleURLs)):
        currArticle = articleURLs[i]
        soup = backend.scrapeLink(currArticle)
        date = parseDate(backend.getDatePublished(currArticle, soup))
        if date <= DATE_END:
            beginIdx = i
            break
    # Walk backward to skip everything published before DATE_FROM.
    for i in range(len(articleURLs) - 1, -1, -1):
        currArticle = articleURLs[i]
        soup = backend.scrapeLink(currArticle)
        date = parseDate(backend.getDatePublished(currArticle, soup))
        if i == len(articleURLs) - 1 and date >= DATE_FROM:
            # The oldest article in the list is already inside the window.
            endIdx = i
            break
        if date <= DATE_FROM:
            endIdx = i
            break
    return articleURLs[beginIdx:endIdx + 1]

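# A minimal, hedged usage sketch for filterDateRange. The URL and the date
# bounds below are illustrative placeholders, and parseDate's return value is
# assumed to compare against datetime objects; the project's real driver code
# supplies these.
def _filterDateRange_example():
    import datetime
    urls = ['http://www.todayonline.com/example-article']  # placeholder URL
    return filterDateRange(urls,
                           datetime.datetime(2017, 1, 1),   # DATE_FROM
                           datetime.datetime(2017, 3, 31))  # DATE_END
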
import datetime  # needed by strptime below; presumably imported at the top of the original file


def getFirstArticleDateOnPage(page_URL):
    """Variant for listing pages that show each article's date inline:
    read the date straight off the first list item."""
    articles = backend.scrapeLink(page_URL)
    first_article = articles.find('li', {'class': 'first-item'})
    first_article_date = first_article.find('span', {'class': 'date'}).text
    first_article_date_object = datetime.datetime.strptime(
        first_article_date, '%d %b %Y').date()
    return first_article_date_object

def getFirstArticleDateOnPage(page_URL):
    """TODAY Online variant: the listing does not expose dates, so follow
    the first article's link and read the date from the article page."""
    articles = backend.scrapeLink(page_URL)
    article = articles.find('div', {'id': ['content-main']})
    first_article = article.find('h2', {'class': ['node__title', 'node-title']})
    if not first_article:
        return None
    article_link = first_article.a.get('href')
    # Listing pages mix absolute and site-relative hrefs; normalise to absolute.
    if 'todayonline.com' in article_link:
        first_article_url = article_link
    else:
        first_article_url = 'http://www.todayonline.com' + article_link
    first_article_soup = backend.scrapeLink(first_article_url)
    first_article_date = backend.getDatePublished(first_article_url,
                                                  first_article_soup)
    first_article_date_object = main.parseDate(first_article_date)
    return first_article_date_object

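# main.parseDate is called above but defined elsewhere in the project. A
# minimal sketch of its assumed contract (a string like '14 Mar 2017', the
# format used by strptime earlier in this file, parsed into a datetime);
# named differently to mark it as an illustration, not the real helper.
def _parseDate_sketch(date_string):
    import datetime
    return datetime.datetime.strptime(date_string.strip(), '%d %b %Y')
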
def downloadOrSkip(URL, category):
    """Download and store the article at URL if it falls inside the date
    window; return (BREAK, date_object), where BREAK signals that the
    caller has paged back past DATE_FROM and can stop."""
    global HTML_SAVE_FOLDER, CSV_NAME
    soup = backend.scrapeLink(URL)
    date = backend.getDatePublished(URL, soup)
    BREAK = False
    try:
        date_object = parseDate(date).date()
    except Exception as exc:
        raise Exception("Error in making date_object for " + URL) from exc
    if DATE_FROM <= date_object <= DATE_END:
        # Inside the window: save the HTML and append a CSV row.
        infoHash = backend.run(URL)
        storeHTML(URL, infoHash['date'], infoHash['title'], category)
        formatter.formatInputsToCSV(infoHash, CSV_NAME, category)
    elif date_object > DATE_END:
        # Too recent: skip, but keep paging backward.
        pass
    else:
        # Older than DATE_FROM: tell the caller to stop paging.
        BREAK = True
    return BREAK, date_object

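# Hedged sketch of how downloadOrSkip's BREAK flag is meant to drive a paging
# loop: once an article older than DATE_FROM appears in the newest-first
# listing, stop entirely. getArticleURLsOnPage is a hypothetical helper here,
# standing in for whichever listing parser the real driver uses.
def _crawl_category_example(page_urls, category):
    for page in page_urls:
        for url in getArticleURLsOnPage(page):  # hypothetical helper
            stop, _ = downloadOrSkip(url, category)
            if stop:
                return  # paged back past DATE_FROM; nothing older is wanted
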
def getArticleTagsOnPage(link):
    """Return the <h2> tags for every article in the page's
    'archive-section' block."""
    soup = backend.scrapeLink(link)
    inner = soup.find('div', {'class': 'archive-section'})
    articles = inner.findAll('h2')
    return articles

def getArticleDivsOnPage(link):
    """Return the headline <h2> tags inside the page's 'inner' block."""
    soup = backend.scrapeLink(link)
    inner = soup.find('div', {'class': 'inner'})
    articles = inner.findAll('h2', {'class': ['node__title', 'node-title']})
    return articles

def getArticleTagsOnPage(link):
    """Return the headline <h2> tags inside the page's 'content-main' block."""
    soup = backend.scrapeLink(link)
    inner = soup.find('div', {'id': 'content-main'})
    articles = inner.findAll('h2', {'class': ['node__title', 'node-title']})
    return articles

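# Hedged sketch of consuming the <h2> tags returned by the helpers above:
# each headline's first <a> carries the article href, and relative links are
# prefixed with the site root, mirroring getFirstArticleDateOnPage. The
# function name and the startswith('http') check are illustrative choices,
# not part of the original code.
def _article_urls_from_tags(tags, base='http://www.todayonline.com'):
    urls = []
    for tag in tags:
        href = tag.a.get('href')
        urls.append(href if href.startswith('http') else base + href)
    return urls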