def get_HTML_article(url_opener, article_file, article_url):
    # Get URL HTML
    print("Getting HTML article from URL: " + article_url)
    html_response = url_opener.open(article_url)

    # Build HTML parser
    soup = BeautifulSoup(html_response)

    # Get the author
    article_author_obj = soup.find('a', attrs={"rel": "author"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author = str(article_author[0])
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"
    article_file.write("<author>" + author_stripped + "</author>\n\n")

    # Get the article body
    article_body = soup.findAll('article')

    # Get all paragraphs + clean redundant chars
    article_file.write("<content>" + "\n")
    try:
        for article in article_body:
            for paragraph in article.findAll('p'):
                stripped_p = Scraper.string_cleaner(paragraph)
                article_file.write(stripped_p + "\n")
    except:
        # Any parsing failure aborts this article
        return False
    article_file.write("</content>" + "\n")
    return True

    # Get next page - currently disabled
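
# The parsers in this section rely on Scraper.string_cleaner, which is not shown
# here. Below is a minimal sketch of what such a cleaner might look like; it is
# an assumption for illustration, not the project's actual implementation. It
# accepts either a BeautifulSoup Tag or a plain string (both are passed above)
# and reduces the value to whitespace-normalized text.
def string_cleaner_sketch(value):
    from bs4.element import Tag   # bs4 is already used by the parsers above
    if isinstance(value, Tag):
        value = value.get_text()  # drop HTML markup, keep only the text
    # collapse newlines and repeated whitespace into single spaces
    return " ".join(str(value).split())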
def get_HTML_article(url_opener, article_file, article_url):
    # Get URL HTML
    print("Getting HTML article from URL: " + article_url)
    html_response = url_opener.open(article_url)

    # Build HTML parser
    soup = BeautifulSoup(html_response)

    # Get the author
    article_author_obj = soup.find('span', attrs={"itemprop": "name"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author_to_parse = article_author[0].split(",", 1)
        # Strip newlines from the author string before cleaning it
        author = re.sub(r'\n', '', str(author_to_parse[0])).strip()
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"
    article_file.write("<author>" + author_stripped + "</author>\n\n")

    # Get the article body
    article_body = soup.find(attrs={"itemprop": "articleBody"})

    # Get all paragraphs + clean redundant chars
    article_file.write("<content>" + "\n")
    try:
        for paragraph in article_body.findAll('p'):
            stripped_p = Scraper.string_cleaner(paragraph)
            article_file.write(stripped_p + "\n")
    except:
        # Any parsing failure aborts this article
        return False
    article_file.write("</content>" + "\n")
    return True

    # Get next page - currently disabled
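
# A minimal usage sketch for the parsers above, assuming they live in modules
# such as Scraper.usa_today (as referenced later in this section). The opener
# settings, output path, and URL below are placeholders for illustration, not
# values taken from the project.
import urllib.request

from Scraper import usa_today  # assumed module layout

url_opener = urllib.request.build_opener()
url_opener.addheaders = [('User-Agent', 'Mozilla/5.0')]  # avoid naive bot blocking

with open("example_article.xml", "w+", newline="\n") as article_file:
    ok = usa_today.get_HTML_article(
        url_opener, article_file,
        "https://www.usatoday.com/example-article")  # placeholder URL
    print("parsed" if ok else "failed")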
for result in results_obj:
    try:
        # Get ID
        try:
            res_id = result['data-fullname']
        except:
            continue

        # Get entry
        entry_obj = result.find('div', attrs={"class": "entry unvoted"})

        # Get title
        title_obj = (entry_obj.find('p', attrs={"class": "title"})).find('a', attrs={"class": "title "})
        title_parsed = title_obj.contents
        url = title_obj['href']
        title = Scraper.string_cleaner(str(title_parsed[0]))

        # Get domain
        domain_obj = entry_obj.find('p', attrs={"class": "title"})
        span_obj = domain_obj.find('span', attrs={"class": "domain"})
        domain_parsed = span_obj.find('a').contents
        domain = Scraper.string_cleaner(str(domain_parsed[0]))

        # Subreddit
        tagline = entry_obj.find('p', attrs={"class": "tagline"})
        hover = tagline.find('a', attrs={"class": "subreddit hover"})
        subredd_parsed = hover.contents
        subredd = Scraper.string_cleaner(str(subredd_parsed[0]))
        subredd = subredd[:-1]

        # Score
# Iterate the submissions
for sub in submissions:
    try:
        # Accept only posts above the vote threshold
        if sub.score < VOTE_TRESHHOLD:
            continue

        # Open a file with the article id as its name
        article_file = open(str(sub_reddit) + "\\" + str(article_id), 'w+', newline="\n")
        article_file.write("<article>\n")
        article_file.write("<sub-reddit>" + sub_reddit + "</sub-reddit>\n")
        article_file.write("<news-paper>" + sub.domain + "</news-paper>\n")
        article_file.write("\n")
        stripped_title = Scraper.string_cleaner(sub.title)
        article_file.write("<title>" + stripped_title + "</title>\n")

        # Get the article content with the parser matching the domain
        if SUPPORTED_NEWS_SITES[0] in sub.domain:
            success = Scraper.ny_times.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[1] in sub.domain:
            success = Scraper.usa_today.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[2] in sub.domain:
            success = Scraper.washington_post.get_HTML_article(url_opener, article_file, sub.url)
        else:
            success = False

        # Close the XML article element
        article_file.write("</article>\n")

        # Found-articles counter