def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print
    print title
    print
    print url
    print

    try:
        author = soup.findChild('span', {'class': 'author vcard'})
        print author.getText(" ")
    except:
        pass
    try:
        timestamp = soup.findChild('span', {'class': 'timestamp'})
        print timestamp.getText(" ")
    except:
        pass
    print

    # grab the text and print all the paragraphs
    text = soup.findChild(None, {'class': 'entry-content'})
    if None == text:
        text = soup.findChild(
            None, {'class': 'featured_story_right_content_no_image'})
    paras = text.findAll()
    for p in paras:
        if 'p' == p.name[0] or 'h' == p.name[0]:
            print cleanText(p.getText(" "))
            print
        elif 'td' == p.name:
            txt = cleanText(p.getText(" "))
            if len(txt) > 1:
                print txt
                print
        li = p.findChildren('li')
        if None != li:
            for anItem in li:
                print "o %s" % cleanText(anItem.getText(" "))
                print
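# ---------------------------------------------------------------------------
# The scraper functions in this file share some scaffolding that is defined
# in the original scripts but not shown here.  The block below is a minimal
# sketch of that scaffolding, inferred from how the names are used in these
# functions: the imports, the kCookieFile path, cleanText(), and
# printEndLine() given here are illustrative assumptions, not the original
# definitions.
# ---------------------------------------------------------------------------
import os
import re
import cookielib
import urllib2

# getText(" ") with a separator and findChild/findChildren match the
# bs4-style API, so a bs4 import is assumed here
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString as ns

kCookieFile = 'cookies.lwp'  # assumed path for the shared LWP cookie jar


def cleanText(text):
    ''' Collapse runs of whitespace to single spaces and trim (assumed sketch). '''
    return re.sub(r'\s+', ' ', text).strip()


def printEndLine():
    ''' Print a separator between sections of output (assumed sketch). '''
    print "=" * 40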
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print title
    print
    print url
    print

    items = soup.findChildren( None, { 'class' : 'wrapper content' })

    # grab the text and print all the paragraphs
    # a special tag called article has some stuff we might want at some point
    for anItem in items :
        paras = anItem.findAll()
        for p in paras :
            #if 'p' == p.name[0] or 'h' == p.name[0] :
            if 'p' == p.name[0] :
                print cleanText( p.getText( " " ))
                print
def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print
    print title
    print
    print url
    print

    #dumpDivsIdClass( soup )
    #dumpDivs( soup )

    meta = soup.findChild('div', {'id': 'story_meta'})
    try:
        author = meta.findChild('span', {'class': 'last'})
        print author.getText(" ")
    except:
        pass
    try:
        timestamp = meta.findChild('span', {'class': 'datestamp'})
        print timestamp.getText(" ")
    except:
        pass
    print

    # grab the text and print all the paragraphs
    text = soup.findChild(None, {'id': 'story_display'})
    if None == text:
        text = soup.findChild(None, {'id': 'story_body'})
    paras = text.findAll()
    for p in paras:
        if 'p' == p.name[0] or 'h' == p.name[0]:
            print cleanText(p.getText(" "))
            print
        li = p.findChildren('li')
        if None != li:
            for anItem in li:
                print "o %s" % cleanText(anItem.getText(" "))
                print
def download(url, skipFuture=True, skipFinished=True):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)
    print 'Soup!'  # debug

    scores = soup.findChildren('table', {"class": "scores"})
    print 'scores!'  # debug
    for i, aSection in enumerate(scores):
        print aSection
        scoresArray = []
        away = ""
        home = ""
        teams = aSection.findChildren(None, {"class": "yspscores team"})
        for i, aTeam in enumerate(teams):
            name = aTeam.findChild('a')
            if 0 == i:
                away = name.text
            else:
                home = name.text
        qtrScores = aSection.findChildren(None, {"class": "yspscores"})
        for i, qtr in enumerate(qtrScores):
            scoresArray.append(cleanText(qtr.text))
        description = ""
        if not isFuture(scoresArray) and not isFinal(scoresArray):
            #find the 2nd table and grab the content!
            tabs = aSection.parent.findChildren('table')
            descTable = tabs[1]
            desc = descTable.findChildren(None, {"class": "yspscores"})[0]
            description = cleanText(desc.getText(" "))
        printScores(away, home, scoresArray, description, skipFuture, skipFinished)
def fixOdds(awayOdds, homeOdds):
    ''' Normalize a pair of betting-line strings: keep the favorite's line
        (the side starting with '-') and derive the other side from it by
        dropping the leading sign. '''
    awayOdds = cleanText(fixOddsStr(awayOdds))
    homeOdds = cleanText(fixOddsStr(homeOdds))
    if '-' == awayOdds[0]:
        awayOdds = awayOdds.split(" ")[0]
        homeOdds = awayOdds[1:]
    else:
        homeOdds = homeOdds.split(" ")[0]
        awayOdds = homeOdds[1:]
    return awayOdds, homeOdds
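# Hypothetical illustration of fixOdds (fixOddsStr and cleanText are defined
# elsewhere, so the exact input format here is an assumption): when the away
# side carries the favorite's line, its first token is kept for the away team
# and the home line becomes the same number with the leading '-' stripped,
# and vice versa.
#
#   fixOdds("-3.5 -110", "")   # -> ("-3.5", "3.5")   (assumed behavior)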
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    scores = soup.findChildren( 'table', { "class" : "scores" } )
    for i, aSection in enumerate( scores ) :
        scoresArray = []
        away = ""
        home = ""
        teams = aSection.findChildren( None, { "class" : "yspscores team" } )
        for i, aTeam in enumerate( teams ) :
            name = aTeam.findChild( 'a' )
            if 0 == i :
                away = name.text
            else :
                home = name.text
        qtrScores = aSection.findChildren( None, { "class" : "yspscores" } )
        for i, qtr in enumerate( qtrScores ) :
            scoresArray.append( cleanText( qtr.text ))
        printScores( away, home, scoresArray )
def doThreadComments( soup ) :
    ''' Walk a forum thread page and print each post's author, date, and body text. '''
    commentBlock = soup.findChild( None, { "class" : "posts" })
    commentRows = commentBlock.findAll( None, { "class" : "postbit postbitim postcontainer old" })
    for i, commentRow in enumerate( commentRows ) :
        # print commentRow
        userObj = commentRow.findChild( None, { "class" : "popupmenu memberaction" })
        poster = userObj.findChild( None, { "class" : re.compile( 'username' ) } )
        poster = cleanMsg( poster )
        date = cleanMsg( commentRow.findChild( None, { "class" : "date" }))
        date = date.replace( u'\xa0', " " )  # collapse non-breaking spaces in the date
        print poster
        print date
        print
        # brute force strip all HTML data from message for now
        msgObj = commentRow.findChild( None, { "class" : "postcontent restore" })
        #msg = ''.join( bs( str( msgObj ) ).findAll( text=True )).strip()
        msg = cleanText( ''.join( bs( str( msgObj ) ).findAll( text=True )).strip() )
        print msg.encode( 'ascii', 'ignore' )
        print " =============================="
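# cleanMsg() is defined elsewhere in the original script; from the calls above
# it takes a soup tag (or None) and returns its cleaned text.  A minimal
# sketch under that assumption:
def cleanMsg( tag ) :
    ''' Return the cleaned text of a tag, or "" if the tag is missing (assumed sketch). '''
    if tag is None :
        return ""
    return cleanText( tag.getText( " " ))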
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print
    print title
    print
    print url
    print

    try :
        author = soup.findChild( 'div', { 'id' : 'ody-byline-written-by' })
        print cleanAuthor( author )
        timestamp = soup.findChild( 'div', { 'class' : 'ody-arttime' })
        print cleanTimeStamp( timestamp )
        print
    except :
        pass

    # grab the text and print all the paragraphs
    text = soup.findChild( None, { 'class' : 'gel-content' })
    paras = text.findAll()
    #paras = text.findAll( 'p' )
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            outText = cleanText( p.getText( " " )).rstrip()
            if len( outText ) > 0 :
                print outText
                print
def scoreSummary( soup ) :
    ''' Print the scoring-summary box: the heading text first, then each
        period's heading and its plays as a table. '''
    scores = soup.findChildren( None, { 'id' : 'ysp-reg-box-game_details-scoring_summary' } )
    scoreItems = scores[ 0 ].findChildren( recursive=False )
    for i, item in enumerate( scoreItems ) :
        if 0 == i :
            print item.getText( " " )
        else :
            children = item.findChildren( recursive=False )
            for child in children :
                if 'h' == child.name[ 0 ] :
                    print cleanText( child.getText( " " ))
                elif 'table' == child.name :
                    printTableArray( tableToArray( child ))
                    print
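# printTableArray() is defined elsewhere; from its use above it takes the
# row-lists produced by tableToArray() (further below) and prints them.  A
# minimal sketch, assuming simple pipe-separated output:
def printTableArray( table ) :
    ''' Print each row of a tableToArray()-style list of lists (assumed sketch). '''
    for row in table :
        print " | ".join( row )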
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    import re

    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print
    print title
    print
    print url
    print
    #print soup

    # grab the text and print all the paragraphs
    text = soup.findChild( None, { 'id' : 'article_story_body', 'class' : 'article story' })
    if None is text :
        text = soup.findChild( 'div', { 'data-module-name' : 'resp.module.article.articleBody' })
    paras = text.findAll()
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            outText = cleanText( p.getText( " " ))
            outText = outText.lstrip().rstrip()
            outText = re.sub( r'[\n]+', r' ', outText )
            outText = re.sub( r' [\s]+', r' ', outText )
            if len( outText ) > 0 :
                print outText
                print
def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print title
    print
    print url
    print

    # grab the text and print all the paragraphs
    top = soup.findChild(None, {'id': 'content'})
    text = soup.findChild(None, {'class': 'entry clearfix'})
    paras = text.findAll()
    for p in paras:
        if 'p' == p.name[0] or 'h' == p.name[0]:
            print cleanText(p.getText(" "))
            print
        li = p.findChildren('li')
        if None != li:
            for anItem in li:
                print "o %s" % cleanText(anItem.getText(" "))
                print

    printEndLine()

    # comments
    comments = top.findChildren(None, {'class': 'comment-body'})
    for aComment in comments:
        author = aComment.findChild(
            None, {'class': 'comment-author vcard'}).getText(" ").rstrip()
        author = author.lstrip()
        date = aComment.findChild(
            None, {'class': 'comment-meta commentmetadata'}).getText(" ").rstrip()
        print "On ", date
        print author
        print
        for p in aComment:
            if not isinstance(p, ns):
                if 'p' == p.name[0]:
                    print cleanText(p.getText(" "))
                    print
        printEndLine()
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )

    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    link = opener.open( url )
    page = link.read()
    soup = bs( page )

    title = cleanText( soup.findChild( 'title' ).text )
    print title
    print
    print url
    print

    # grab the text and print all the paragraphs
    text = soup.findChild( None, { 'class' : 'articleText' })
    paras = text.findAll()
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            childScript = p.findChildren( 'script' )
            if not childScript :
                print cleanText( p.getText( " " ))
                print
        li = p.findChildren( 'li' )
        if None != li :
            for anItem in li :
                print "o %s" % cleanText( anItem.getText( " " ))
                print
def download( url ) :
    ''' Pull the page and parse it into the pieces we need. '''
    soup = loadPage( url )
    # print soup

    title = cleanText( soup.findChild( 'title' ).text )
    print title
    print
    print url
    print

    main = soup.findChild( None, { "class" : "articleContent" })
    paras = main.findAll()
    for p in paras :
        if 'p' == p.name[0] or 'h' == p.name[0] :
            print cleanText( p.getText( ' ' ))
            print
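# loadPage() is not shown in this file; judging from the other download()
# variants above, it presumably wraps the same cookie-jar fetch and returns a
# soup.  A minimal sketch under that assumption:
def loadPage( url ) :
    ''' Fetch a URL using the shared LWP cookie jar and return it as soup (assumed sketch). '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile( kCookieFile ) :
        cookieJar.load( kCookieFile )
    else :
        cookieJar.save( kCookieFile )
    opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))
    return bs( opener.open( url ).read() )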
def tableToArray( tableSoup ) :
    ''' Convert an HTML table's first <tbody> into a list of rows, where each
        row is a list of cleaned cell strings. '''
    table = []
    rows = tableSoup.findChildren( 'tbody' )[ 0 ].findChildren( 'tr' )
    for row in rows :
        rowArray = []
        tds = row.findChildren( 'td' )
        for td in tds :
            rowArray.append( cleanText( td.getText( ' ' )))
        table.append( rowArray )
    return table
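# Quick illustration of tableToArray with made-up markup (relies on the bs /
# cleanText scaffolding sketched near the top of this file):
#
#   html = "<table><tbody><tr><td>1st</td><td>7</td></tr>" \
#          "<tr><td>2nd</td><td>10</td></tr></tbody></table>"
#   tableToArray( bs( html ) )
#   # -> [['1st', '7'], ['2nd', '10']]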
def dumpGameStats( rowsList, year, isPlayoffs = False ) :
    ''' Print one quoted, comma-separated line per game row, splitting the
        result column into W/L, points for, and points against.
        Note that we don't grab the last item in the list, where Yahoo sticks a total '''
    finalList = []
    for i, row in enumerate( rowsList[ : -1 ] ) :
        rowClass = None
        try :
            rowClass = row['class']
        except :
            pass
        if 'column' != rowClass :
            dataList = []
            datums = row.findChildren( 'td' )
            if len( datums ) > 0 :
                if isPlayoffs :
                    dataList.append( '"%s"' % str( i + 17 ))
                for aDatum in datums :
                    dataList.append( '"%s"' % cleanText( aDatum.text ))

                # now we need to repair data in various ways
                date = dataList[ 1 ]
                date = '%s %s"' % ( date[ : -1 ], year )
                dataList[ 1 ] = date

                # split the result, PF and PA into separate items
                # it may be empty if the game is unplayed
                result = dataList[ 3 ][ 1 : -1 ]
                if "" != result :
                    wlPoints = result.split( " " )
                    dataList[ 3 ] = '"%s"' % wlPoints[ 0 ]
                    points = wlPoints[ 1 ].split( '-' )
                    dataList.insert( 4, '"%s"' % points[ 0 ] )
                    dataList.insert( 5, '"%s"' % points[ 1 ] )
                else :
                    dataList.insert( 4, '""' )
                    dataList.insert( 4, '""' )
            finalList.append( dataList )

    for i, aWeek in enumerate( finalList ) :
        if len( aWeek ) > 0 :
            print ",".join( aWeek )
def download(url):
    ''' Pull the page and parse it into the pieces we need. '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)

    title = cleanText(soup.findChild('title').text)
    print title
    print
    print url
    print

    author = soup.findChildren(None, {'class': 'byline'})
    if author is not None:
        try:
            print cleanText(author[0].getText(" "))
        except:
            pass
    date = soup.findChildren(None, {'class': 'dateline'})
    if date is not None:
        print cleanText(date[0].getText(" "))
    print

    # should find a better starting point... past the end of the first group of items...
    moreItems = soup.findChildren(None, {'itemprop': 'articleBody'})
    if None != moreItems:
        for anItem in moreItems:
            if 'p' == anItem.name[0] or 'h' == anItem.name[0]:
                print cleanText(anItem.getText(" "))
                print