Code Example #1
def filterDateRange(articleURLs, DATE_FROM, DATE_END):
    """Trim a newest-first list of article URLs to those published
    between DATE_FROM and DATE_END (inclusive)."""
    beginIdx = 0
    endIdx = len(articleURLs)
    if endIdx > 30:
        delay(600)  # back off before scraping a large batch of pages
    # skip everything newer than DATE_END: the first article at or before
    # that date marks the start of the slice
    for i in range(len(articleURLs)):
        currArticle = articleURLs[i]
        soup = backend.scrapeLink(currArticle)
        date = backend.getDatePublished(currArticle, soup)
        date = parseDate(date)
        if date <= DATE_END:
            beginIdx = i
            break

    # walk backwards from the oldest article to find the DATE_FROM cut-off
    for i in range(len(articleURLs) - 1, -1, -1):
        currArticle = articleURLs[i]
        soup = backend.scrapeLink(currArticle)
        date = backend.getDatePublished(currArticle, soup)
        date = parseDate(date)
        if i == len(articleURLs) - 1:  # oldest article is already in range
            if date >= DATE_FROM:
                endIdx = i
                break
        if date <= DATE_FROM:
            endIdx = i
            break

    return articleURLs[beginIdx:endIdx + 1]
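
Every example on this page leans on a backend module (scrapeLink, getDatePublished) and a parseDate helper that are not shown here. The sketch below is a guess at what those helpers look like, assuming requests, BeautifulSoup, and python-dateutil; the names and signatures come from the calls above, but the bodies are assumptions, not the project's actual code.

# Hypothetical reconstruction of the shared helpers (would live in backend.py / main.py).
import requests
from bs4 import BeautifulSoup
from dateutil import parser as date_parser


def scrapeLink(url):
    # fetch the page and return it as a parsed BeautifulSoup document
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def getDatePublished(url, soup):
    # pull the publication date string from the article page; the exact
    # selector is site-specific, a <meta> tag is used here as a stand-in
    meta = soup.find('meta', {'property': 'article:published_time'})
    return meta['content'] if meta else None


def parseDate(date_string):
    # turn the scraped date string into a datetime object
    return date_parser.parse(date_string)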
Code Example #2
def getFirstArticleDateOnPage(page_URL):
    # parse the listing page and grab the first article entry on it
    articles = backend.scrapeLink(page_URL)
    first_article = articles.find('li', {'class': 'first-item'})
    first_article_date = first_article.find('span', {'class': 'date'}).text
    # dates are printed like "01 Jan 2016"; requires `import datetime`
    first_article_date_object = datetime.datetime.strptime(
        first_article_date, '%d %b %Y').date()
    return first_article_date_object
Code Example #3
def getFirstArticleDateOnPage(page_URL):
    # parse the listing page and locate the main content area
    articles = backend.scrapeLink(page_URL)
    article = articles.find('div', {'id': ['content-main']})
    first_article = article.find('h2',
                                 {'class': ['node__title', 'node-title']})
    if not first_article:
        return None
    # listing links can be relative, so prefix the site root when needed
    article_link = first_article.a.get('href')
    if 'todayonline.com' in article_link:
        first_article_url = article_link
    else:
        first_article_url = 'http://www.todayonline.com' + article_link
    # fetch the article itself and pull out its publication date
    first_article_soup = backend.scrapeLink(first_article_url)
    first_article_date = backend.getDatePublished(first_article_url,
                                                  first_article_soup)
    first_article_date_object = main.parseDate(first_article_date)
    return first_article_date_object
Code Example #4
File: main.py  Project: Waffleboy/web-scrapers
def downloadOrSkip(URL, category):
    """Download and store the article at URL if its publication date falls
    inside [DATE_FROM, DATE_END]; return (BREAK, date) so the caller can
    stop paging once articles get older than DATE_FROM."""
    global HTML_SAVE_FOLDER, CSV_NAME
    soup = backend.scrapeLink(URL)
    date = backend.getDatePublished(URL, soup)
    BREAK = False
    try:
        date_object = parseDate(date)
        date_object = date_object.date()
    except Exception:
        raise Exception("Error in making date_object for " + URL)
    if DATE_FROM <= date_object <= DATE_END:
        # in range: save the HTML and append a row to the CSV
        infoHash = backend.run(URL)
        storeHTML(URL, infoHash['date'], infoHash['title'], category)
        formatter.formatInputsToCSV(infoHash, CSV_NAME, category)
    elif date_object > DATE_END:
        pass  # too new: skip, but keep crawling
    elif date_object < DATE_FROM:
        BREAK = True  # too old: tell the caller to stop
    else:
        print('Possible Error for downloadOrSkip for URL ' + URL)

    return BREAK, date_object
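
A rough usage sketch of downloadOrSkip in a pagination loop; the listing URL, category label, and the getArticleLinksOnPage helper are made up for illustration. Because listings run newest-first, the BREAK flag lets the crawl stop as soon as it reaches an article older than DATE_FROM.

# Illustration only: getArticleLinksOnPage and the URLs below are hypothetical.
def crawlCategory(category, listing_url):
    page = 1
    while True:
        article_links = getArticleLinksOnPage(listing_url + '?page=' + str(page))
        if not article_links:
            return  # ran out of listing pages
        for url in article_links:
            should_stop, published = downloadOrSkip(url, category)
            if should_stop:
                return  # everything from here on is older than DATE_FROM
        page += 1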
Code Example #5
File: main.py  Project: Waffleboy/web-scrapers
def getArticleTagsOnPage(link):
    # the archive listing wraps its article headlines in <h2> tags
    soup = backend.scrapeLink(link)
    inner = soup.find('div', {'class': 'archive-section'})
    articles = inner.findAll('h2')
    return articles
Code Example #6
def getArticleDivsOnPage(link):
    # headlines on the listing page are <h2> tags inside the 'inner' div
    soup = backend.scrapeLink(link)
    inner = soup.find('div', {'class': 'inner'})
    articles = inner.findAll('h2', {'class': ['node__title', 'node-title']})
    return articles
Code Example #7
def getArticleTagsOnPage(link):
    global BASE_LINK
    # headlines live under the main content area of the listing page
    soup = backend.scrapeLink(link)
    inner = soup.find('div', {'id': 'content-main'})
    articles = inner.findAll('h2', {'class': ['node__title', 'node-title']})
    return articles
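
The last three helpers return <h2> tags rather than URLs. The sketch below shows how a caller might turn those tags into absolute article links, mirroring the relative-link handling in Code Example #3; the wrapper name is hypothetical and the site root is assumed from that example.

# Hypothetical caller built on getArticleTagsOnPage from Code Example #7.
def getArticleURLsOnPage(link):
    urls = []
    for tag in getArticleTagsOnPage(link):
        anchor = tag.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        # listing pages may use relative links, so prefix the site root
        if href.startswith('http'):
            urls.append(href)
        else:
            urls.append('http://www.todayonline.com' + href)
    return urls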