def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print
    print title
    print
    print url
    print

    try:
        author = soup.findChild('span', {'class': 'author vcard'})
        print author.getText(" ")
    except:
        pass
    try:
        timestamp = soup.findChild('span', {'class': 'timestamp'})
        print timestamp.getText(" ")
    except:
        pass
    print

    # grab the text and print all the paragraphs
    text = soup.findChild(None, {'class': 'entry-content'})
    if None == text:
        text = soup.findChild(
            None, {'class': 'featured_story_right_content_no_image'})
    paras = text.findAll()
    for p in paras:
        if 'p' == p.name[0] or 'h' == p.name[0]:
            print cleanText(p.getText(" "))
            print
        elif 'td' == p.name:
            txt = cleanText(p.getText(" "))
            if len(txt) > 1:
                print txt
                print
        li = p.findChildren('li')
        if None != li:
            for anItem in li:
                print "o %s" % cleanText(anItem.getText(" "))
                print
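# ---------------------------------------------------------------------------
# The scraper functions in this file share some scaffolding that is defined
# in the original scripts but not shown here.  The block below is a minimal
# sketch of that scaffolding, inferred from how the names are used in these
# functions: the imports, the kCookieFile path, cleanText(), and
# printEndLine() given here are illustrative assumptions, not the original
# definitions.
# ---------------------------------------------------------------------------
import os
import re
import cookielib
import urllib2

# getText(" ") with a separator and findChild/findChildren match the
# bs4-style API, so a bs4 import is assumed here
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString as ns

kCookieFile = 'cookies.lwp'  # assumed path for the shared LWP cookie jar


def cleanText(text):
    ''' Collapse runs of whitespace to single spaces and trim (assumed sketch). '''
    return re.sub(r'\s+', ' ', text).strip()


def printEndLine():
    ''' Print a separator between sections of output (assumed sketch). '''
    print "=" * 40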
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print title
    print
    print url
    print

    items = soup.findChildren( None, { 'class' : 'wrapper content' })

    # grab the text and print all the paragraphs
    # a special tag called article has some stuff we might want at some point
    for anItem in items :
        paras = anItem.findAll()
        for p in paras :
            #if 'p' == p.name[0] or 'h' == p.name[0] :
            if 'p' == p.name[0] :
                print cleanText( p.getText( " " ))
                print
def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print
    print title
    print
    print url
    print

    #dumpDivsIdClass( soup )
    #dumpDivs( soup )

    meta = soup.findChild('div', {'id': 'story_meta'})
    try:
        author = meta.findChild('span', {'class': 'last'})
        print author.getText(" ")
    except:
        pass
    try:
        timestamp = meta.findChild('span', {'class': 'datestamp'})
        print timestamp.getText(" ")
    except:
        pass
    print

    # grab the text and print all the paragraphs
    text = soup.findChild(None, {'id': 'story_display'})
    if None == text:
        text = soup.findChild(None, {'id': 'story_body'})
    paras = text.findAll()
    for p in paras:
        if 'p' == p.name[0] or 'h' == p.name[0]:
            print cleanText(p.getText(" "))
            print
        li = p.findChildren('li')
        if None != li:
            for anItem in li:
                print "o %s" % cleanText(anItem.getText(" "))
                print
def download(url, skipFuture=True, skipFinished=True):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)
    print 'Soup!'  # debug

    scores = soup.findChildren('table', {"class": "scores"})
    print 'scores!'  # debug
    for i, aSection in enumerate(scores):
        print aSection
        scoresArray = []
        away = ""
        home = ""
        teams = aSection.findChildren(None, {"class": "yspscores team"})
        for i, aTeam in enumerate(teams):
            name = aTeam.findChild('a')
            if 0 == i:
                away = name.text
            else:
                home = name.text
        qtrScores = aSection.findChildren(None, {"class": "yspscores"})
        for i, qtr in enumerate(qtrScores):
            scoresArray.append(cleanText(qtr.text))
        description = ""
        if not isFuture(scoresArray) and not isFinal(scoresArray):
            #find the 2nd table and grab the content!
            tabs = aSection.parent.findChildren('table')
            descTable = tabs[1]
            desc = descTable.findChildren(None, {"class": "yspscores"})[0]
            description = cleanText(desc.getText(" "))
        printScores(away, home, scoresArray, description, skipFuture, skipFinished)
def fixOdds(awayOdds, homeOdds):
    ''' Normalize a pair of betting-line strings: keep the favorite's line
        (the side starting with '-') and derive the other side from it by
        dropping the leading sign. '''
    awayOdds = cleanText(fixOddsStr(awayOdds))
    homeOdds = cleanText(fixOddsStr(homeOdds))
    if '-' == awayOdds[0]:
        awayOdds = awayOdds.split(" ")[0]
        homeOdds = awayOdds[1:]
    else:
        homeOdds = homeOdds.split(" ")[0]
        awayOdds = homeOdds[1:]
    return awayOdds, homeOdds
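# Hypothetical illustration of fixOdds (fixOddsStr and cleanText are defined
# elsewhere, so the exact input format here is an assumption): when the away
# side carries the favorite's line, its first token is kept for the away team
# and the home line becomes the same number with the leading '-' stripped,
# and vice versa.
#
#   fixOdds("-3.5 -110", "")   # -> ("-3.5", "3.5")   (assumed behavior)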
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    scores = soup.findChildren( 'table', { "class" : "scores" } )
    for i, aSection in enumerate( scores ) :
        scoresArray = []
        away = ""
        home = ""
        teams = aSection.findChildren( None, { "class" : "yspscores team" } )
        for i, aTeam in enumerate( teams ) :
            name = aTeam.findChild( 'a' )
            if 0 == i :
                away = name.text
            else :
                home = name.text
        qtrScores = aSection.findChildren( None, { "class" : "yspscores" } )
        for i, qtr in enumerate( qtrScores ) :
            scoresArray.append( cleanText( qtr.text ))
        printScores( away, home, scoresArray )
def doThreadComments( soup ) :
    ''' Walk a forum thread page and print each post's author, date, and body text. '''
    commentBlock = soup.findChild( None, { "class" : "posts" })
    commentRows = commentBlock.findAll( None, { "class" : "postbit postbitim postcontainer old" })
    for i, commentRow in enumerate( commentRows ) :
        # print commentRow
        userObj = commentRow.findChild( None, { "class" : "popupmenu memberaction" })
        poster = userObj.findChild( None, { "class" : re.compile( 'username' ) } )
        poster = cleanMsg( poster )
        date = cleanMsg( commentRow.findChild( None, { "class" : "date" }))
        date = date.replace( u'\xa0', " " )  # collapse non-breaking spaces in the date
        print poster
        print date
        print
        # brute force strip all HTML data from message for now
        msgObj = commentRow.findChild( None, { "class" : "postcontent restore" })
        #msg = ''.join( bs( str( msgObj ) ).findAll( text=True )).strip()
        msg = cleanText( ''.join( bs( str( msgObj ) ).findAll( text=True )).strip() )
        print msg.encode( 'ascii', 'ignore' )
        print " =============================="
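# cleanMsg() is defined elsewhere in the original script; from the calls above
# it takes a soup tag (or None) and returns its cleaned text.  A minimal
# sketch under that assumption:
def cleanMsg( tag ) :
    ''' Return the cleaned text of a tag, or "" if the tag is missing (assumed sketch). '''
    if tag is None :
        return ""
    return cleanText( tag.getText( " " ))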
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print
    print title
    print
    print url
    print

    try :
        author = soup.findChild( 'div', { 'id' : 'ody-byline-written-by' })
        print cleanAuthor( author )
        timestamp = soup.findChild( 'div', { 'class' : 'ody-arttime' })
        print cleanTimeStamp( timestamp )
        print
    except :
        pass

    # grab the text and print all the paragraphs
    text = soup.findChild( None, { 'class' : 'gel-content' })
    paras = text.findAll()
    #paras = text.findAll( 'p' )
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            outText = cleanText( p.getText( " " )).rstrip()
            if len( outText ) > 0 :
                print outText
                print
def scoreSummary( soup ) :
    ''' Print the scoring-summary box: the heading text first, then each
        period's heading and its plays as a table. '''
    scores = soup.findChildren( None, { 'id' : 'ysp-reg-box-game_details-scoring_summary' } )
    scoreItems = scores[ 0 ].findChildren( recursive=False )
    for i, item in enumerate( scoreItems ) :
        if 0 == i :
            print item.getText( " " )
        else :
            children = item.findChildren( recursive=False )
            for child in children :
                if 'h' == child.name[ 0 ] :
                    print cleanText( child.getText( " " ))
                elif 'table' == child.name :
                    printTableArray( tableToArray( child ))
                    print
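# printTableArray() is defined elsewhere; from its use above it takes the
# row-lists produced by tableToArray() (further below) and prints them.  A
# minimal sketch, assuming simple pipe-separated output:
def printTableArray( table ) :
    ''' Print each row of a tableToArray()-style list of lists (assumed sketch). '''
    for row in table :
        print " | ".join( row )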
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    import re

    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print
    print title
    print
    print url
    print
    #print soup

    # grab the text and print all the paragraphs
    text = soup.findChild( None, { 'id' : 'article_story_body', 'class' : 'article story' })
    if None is text :
        text = soup.findChild( 'div', { 'data-module-name' : 'resp.module.article.articleBody' })
    paras = text.findAll()
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            outText = cleanText( p.getText( " " ))
            outText = outText.lstrip().rstrip()
            outText = re.sub( r'[\n]+', r' ', outText )
            outText = re.sub( r' [\s]+', r' ', outText )
            if len( outText ) > 0 :
                print outText
                print
def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print title
    print
    print url
    print

    # grab the text and print all the paragraphs
    top = soup.findChild(None, {'id': 'content'})
    text = soup.findChild(None, {'class': 'entry clearfix'})
    paras = text.findAll()
    for p in paras:
        if 'p' == p.name[0] or 'h' == p.name[0]:
            print cleanText(p.getText(" "))
            print
        li = p.findChildren('li')
        if None != li:
            for anItem in li:
                print "o %s" % cleanText(anItem.getText(" "))
                print

    printEndLine()

    # comments
    comments = top.findChildren(None, {'class': 'comment-body'})
    for aComment in comments:
        author = aComment.findChild(
            None, {'class': 'comment-author vcard'}).getText(" ").rstrip()
        author = author.lstrip()
        date = aComment.findChild(
            None, {'class': 'comment-meta commentmetadata'}).getText(" ").rstrip()
        print "On ", date
        print author
        print
        for p in aComment:
            if not isinstance(p, ns):
                if 'p' == p.name[0]:
                    print cleanText(p.getText(" "))
                    print
        printEndLine()
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print title
    print
    print url
    print

    # grab the text and print all the paragraphs
    text = soup.findChild( None, { 'class' : 'articleText' })
    paras = text.findAll()
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            childScript = p.findChildren( 'script' )
            if not childScript :
                print cleanText( p.getText( " " ))
                print
        li = p.findChildren( 'li' )
        if None != li :
            for anItem in li :
                print "o %s" % cleanText( anItem.getText( " " ))
                print
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    soup = loadPage( url )
    # print soup

    title = cleanText( soup.findChild( 'title' ).text )
    print title
    print
    print url
    print

    main = soup.findChild( None, { "class" : "articleContent" })
    paras = main.findAll()
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            print cleanText( p.getText( ' ' ))
            print
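# loadPage() is not shown in this file; judging from the other download()
# variants above, it presumably wraps the same cookie-jar fetch and returns a
# soup.  A minimal sketch under that assumption:
def loadPage( url ) :
    ''' Fetch a URL using the shared LWP cookie jar and return it as soup (assumed sketch). '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )
    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    return bs( opener.open( url ).read() )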
def tableToArray( tableSoup ) :
    ''' Convert an HTML table's first <tbody> into a list of rows, where each
        row is a list of cleaned cell strings. '''
    table = []
    rows = tableSoup.findChildren( 'tbody' )[ 0 ].findChildren( 'tr' )
    for row in rows :
        rowArray = []
        tds = row.findChildren( 'td' )
        for td in tds :
            rowArray.append( cleanText( td.getText( ' ' )))
        table.append( rowArray )
    return table
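# Quick illustration of tableToArray with made-up markup (relies on the bs /
# cleanText scaffolding sketched near the top of this file):
#
#   html = "<table><tbody><tr><td>1st</td><td>7</td></tr>" \
#          "<tr><td>2nd</td><td>10</td></tr></tbody></table>"
#   tableToArray( bs( html ) )
#   # -> [['1st', '7'], ['2nd', '10']]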
def dumpGameStats( rowsList, year, isPlayoffs = False ) :
    ''' Print one quoted, comma-separated line per game row, splitting the
        result column into W/L, points for, and points against.
        Note that we don't grab the last item in the list, where Yahoo sticks a total '''
    finalList = []
    for i, row in enumerate( rowsList[ : -1 ] ) :
        rowClass = None
        try :
            rowClass = row['class']
        except :
            pass
        if 'column' != rowClass :
            dataList = []
            datums = row.findChildren( 'td' )
            if len( datums ) > 0 :
                if isPlayoffs :
                    dataList.append( '"%s"' % str( i + 17 ))
                for aDatum in datums :
                    dataList.append( '"%s"' % cleanText( aDatum.text ))

                # now we need to repair data in various ways
                date = dataList[ 1 ]
                date = '%s %s"' % ( date[ : -1 ], year )
                dataList[ 1 ] = date

                # split the result, PF and PA into separate items
                # it may be empty if the game is unplayed
                result = dataList[ 3 ][ 1 : -1 ]
                if "" != result :
                    wlPoints = result.split( " " )
                    dataList[ 3 ] = '"%s"' % wlPoints[ 0 ]
                    points = wlPoints[ 1 ].split( '-' )
                    dataList.insert( 4, '"%s"' % points[ 0 ] )
                    dataList.insert( 5, '"%s"' % points[ 1 ] )
                else :
                    dataList.insert( 4, '""' )
                    dataList.insert( 4, '""' )
            finalList.append( dataList )

    for i, aWeek in enumerate( finalList ) :
        if len( aWeek ) > 0 :
            print ",".join( aWeek )
def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print title
    print
    print url
    print

    author = soup.findChildren(None, {'class': 'byline'})
    if author is not None:
        try:
            print cleanText(author[0].getText(" "))
        except:
            pass
    date = soup.findChildren(None, {'class': 'dateline'})
    if date is not None:
        print cleanText(date[0].getText(" "))
    print

    # should find a better starting point... past the end of the first group of items...
    moreItems = soup.findChildren(None, {'itemprop': 'articleBody'})
    if None != moreItems:
        for anItem in moreItems:
            if 'p' == anItem.name[0] or 'h' == anItem.name[0]:
                print cleanText(anItem.getText(" "))
                print