Code Example #1
from bs4 import BeautifulSoup


def get_HTML_article(url_opener, article_file, article_url):

    # Fetch the article HTML from the URL
    print("Getting HTML article from URL:   " + article_url)
    html_response = url_opener.open(article_url)

    # Build the HTML parser
    soup = BeautifulSoup(html_response, 'html.parser')

    # Get the author
    article_author_obj = soup.find('a', attrs={"rel": "author"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author = str(article_author[0])
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"

    article_file.write("<author>" + author_stripped + "</author>\n\n")

    # Get the article body
    article_body = soup.find_all('article')

    # Get all paragraphs and clean redundant characters
    article_file.write("<content>" + "\n")

    try:
        for article in article_body:
            for paragraph in article.find_all('p'):
                stripped_p = Scraper.string_cleaner(paragraph)
                article_file.write(stripped_p + "\n")
    except Exception:
        return False

    article_file.write("</content>" + "\n")

    return True
    # Get next page - currently disabled
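Note: this and the following examples rely on the project's Scraper.string_cleaner helper, whose implementation is not shown here. As a rough sketch only (an assumption, not the project's actual code), such a helper might reduce a BeautifulSoup tag or raw string to plain text and collapse redundant whitespace:

from bs4 import Tag

def string_cleaner(value):
    # Hypothetical stand-in for Scraper.string_cleaner; the real helper is not shown in the source.
    # The examples pass both BeautifulSoup tags and plain strings, so accept either.
    if isinstance(value, Tag):
        value = value.get_text()
    # Collapse runs of whitespace and strip leading/trailing blanks.
    return " ".join(str(value).split())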
Code Example #2
import re

from bs4 import BeautifulSoup


def get_HTML_article(url_opener, article_file, article_url):

    # Fetch the article HTML from the URL
    print("Getting HTML article from URL:   " + article_url)
    html_response = url_opener.open(article_url)

    # Build the HTML parser
    soup = BeautifulSoup(html_response, 'html.parser')

    # Get the author
    article_author_obj = soup.find('span', attrs={"itemprop": "name"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author_to_parse = article_author[0].split(",", 1)
        # Strip embedded newlines and surrounding whitespace from the author string
        author = re.sub(r'\n', '', str(author_to_parse[0])).strip()
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"

    article_file.write("<author>" + author_stripped + "</author>\n\n")

    # Get the article body
    article_body = soup.find(attrs={"itemprop": "articleBody"})

    # Get all paragraphs and clean redundant characters
    article_file.write("<content>" + "\n")
    try:
        for paragraph in article_body.find_all('p'):
            stripped_p = Scraper.string_cleaner(paragraph)
            article_file.write(stripped_p + "\n")
    except Exception:
        return False

    article_file.write("</content>" + "\n")

    return True

    # Get next page - currently disabled
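For context, here is a minimal driver sketch showing how one of these functions might be called, assuming a standard urllib opener and a plain output file; the output filename and article URL are illustrative assumptions (Code Example #4 below shows how the real code dispatches per news site):

import urllib.request

# Minimal driver sketch; the filename and URL below are illustrative only.
url_opener = urllib.request.build_opener()
url_opener.addheaders = [('User-Agent', 'Mozilla/5.0')]

with open("article_0001.xml", "w", newline="\n") as article_file:
    article_file.write("<article>\n")
    success = get_HTML_article(url_opener, article_file,
                               "https://www.nytimes.com/example-article")
    article_file.write("</article>\n")

print("Scrape succeeded:", success)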
Code Example #3
for result in results_obj:
    try:
        # Get the post ID
        try:
            res_id = result['data-fullname']
        except KeyError:
            continue

        # Get the entry block
        entry_obj = result.find('div', attrs={"class": "entry unvoted"})

        # Get the title and URL
        title_obj = (entry_obj.find('p', attrs={"class": "title"})).find('a', attrs={"class": "title "})
        title_parsed = title_obj.contents
        url = title_obj['href']
        title = Scraper.string_cleaner(str(title_parsed[0]))

        # Get the domain
        domain_obj = entry_obj.find('p', attrs={"class": "title"})
        span_obj = domain_obj.find('span', attrs={"class": "domain"})
        domain_parsed = span_obj.find('a').contents
        domain = Scraper.string_cleaner(str(domain_parsed[0]))

        # Get the subreddit
        tagline = entry_obj.find('p', attrs={"class": "tagline"})
        hover = tagline.find('a', attrs={"class": "subreddit hover"})
        subredd_parsed = hover.contents
        subredd = Scraper.string_cleaner(str(subredd_parsed[0]))
        subredd = subredd[:-1]  # drop the trailing character

        # Get the score
Code Example #4
 
# Iterate over the submissions
for sub in submissions:
    try:

        # Skip posts below the vote threshold
        if sub.score < VOTE_TRESHHOLD:
            continue

        # Open a file named after the article id
        article_file = open(str(sub_reddit) + "\\" + str(article_id), 'w+', newline="\n")
        article_file.write("<article>\n")
        article_file.write("<sub-reddit>" + sub_reddit + "</sub-reddit>\n")
        article_file.write("<news-paper>" + sub.domain + "</news-paper>\n")
        article_file.write("\n")
        stripped_title = Scraper.string_cleaner(sub.title)
        article_file.write("<title>" + stripped_title + "</title>\n")

        # Get the article content from the matching site scraper
        if SUPPORTED_NEWS_SITES[0] in sub.domain:
            success = Scraper.ny_times.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[1] in sub.domain:
            success = Scraper.usa_today.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[2] in sub.domain:
            success = Scraper.washington_post.get_HTML_article(url_opener, article_file, sub.url)
        else:
            success = False

        # Close the <article> element
        article_file.write("</article>\n")

        # Found articles counter
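Code Example #4 assumes each item in submissions exposes score, domain, url, and title attributes, which are exactly the fields Code Example #3 extracts from the Reddit listing. As an assumption about how those fields might be carried between the two stages (the source does not show this), a small record type would suffice:

from dataclasses import dataclass

# Hypothetical record tying together the fields scraped in Code Example #3
# and read in Code Example #4; the real project may organize this differently.
@dataclass
class Submission:
    id: str
    title: str
    url: str
    domain: str
    subreddit: str
    score: int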