def get_player_var_set(url, var_set):
    tmp_html = web.URL(url).download(cached=False)
    regM = re.search('href="(/Matches.{1,128}?)".{1,10}Match Centre', tmp_html,
                     re.DOTALL)
    match_centre_url = 'http://www.whoscored.com' + regM.group(1)
    player_stats_url = match_centre_url.replace('Live', 'LiveStatistics')

    html = web.URL(player_stats_url).download(cached=False)
    regM = re.search('var initialData = (.*?);', html, re.DOTALL)
    data = regM.group(1)

    while ',,' in data:
        data = data.replace(',,', ",' ',")

    data = ast.literal_eval(data)

    match_overview = data[0][0]
    match_details = data[0][1]

    print match_overview[2], match_overview[3]

    for team in match_details:
        player_stats = team[4]

        for p in player_stats:
            for var in p[3][0]:
                var_set.add(var[0])
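
# A minimal usage sketch, assuming `re`, `ast` and pattern's `web` module are
# imported (the snippet above relies on them); the match id in the url below is
# just a placeholder for any whoscored.com match report page.
import re
import ast
from pattern import web

var_set = set()
get_player_var_set('http://www.whoscored.com/Matches/829535/MatchReport', var_set)
print sorted(var_set)  # the distinct per-player stat names collected from this match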
Example #2
def get_links_from_page(number_of_pages): 
	# get initial url 
	url = web.URL('http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1990,2012')
	# create an empty list to populate with the urls
	pages = []
	# the loop to get the links
	for page_index in range(number_of_pages):
		if page_index == 0: # the first page only has a Next button, so the DOM is different
			dom = web.DOM(url.download(cached = False))
			# to see which part of the DOM to use right click in Chrome 
			# and use 'Inspect Element'
			entry = dom('span.pagination')[1].by_tag('a')
			href = 'http://www.imdb.com/' + entry[0].attributes.get('href')
			pages.append(href)
			print(pages)
			url = web.URL(href)
		else:  # after the first page there are both Previous and Next buttons, so select the Next link
			dom = web.DOM(url.download(cached = False))
			entry = dom('span.pagination')[1].by_tag('a')
			href = 'http://www.imdb.com/' + entry[1].attributes.get('href')
			pages.append(href)
			print(pages)
			url = web.URL(href)
	# return the list with duplicate urls removed
	return list(set(pages))
def scrape_1_year_data_of_1_league(seed_url, start_week_num, num_weeks):
    # seed_url = 'http://www.whoscored.com/Regions/252/Tournaments/2/Seasons/1849'
    # start_week_num = 30
    # num_weeks = 48

    html = web.URL(seed_url).download(cached=False, user_agent='Mozilla/5.0')

    regM = re.search('<h2 class="tournament-tables-header">(.*?)</h2>', html,
                     re.DOTALL)
    league_name = ''
    if regM is not None:
        league_name = regM.group(1).replace('Tables', '').strip()

    if league_name == 'Primera Division':
        league_name = 'LIGA BBVA'

    regM = re.search('<div id="sub-navigation".*?>.*?<a href="(.*?)"', html,
                     re.DOTALL)
    regM = re.search('Stages/(\d+)/', regM.group(1), re.DOTALL)
    league_season_id = regM.group(1)

    regM = re.search("min = new Date\((\d+),", html, re.DOTALL)
    start_year = int(regM.group(1))

    # outfile = open('premier-league-2013-2014.txt', 'wb')
    # var_set = Set()
    conn = sqlite3.connect('whoscored.db')
    db_cur = conn.cursor()
    for i in range(start_week_num, start_week_num + num_weeks):
        y = start_year + i / 52
        w = i % 52
        ajax_request_url = ('http://www.whoscored.com/tournamentsfeed/' + league_season_id +
                            '/Fixtures/?d=' + str(y) + 'W' + str(w) + '&isAggregate=false')
        ajax_return_str = web.URL(ajax_request_url).download(cached=False)
        matches_of_week = ast.literal_eval(ajax_return_str)

        if matches_of_week:  # skip weeks with no fixtures
            for m in matches_of_week:
                match_id = m[0]
                match_url = 'http://www.whoscored.com/Matches/' + str(match_id) + '/MatchReport'
                print str(y) + '-W' + str(w) + ': ', match_url
                scrape_single_match(
                    league_name,
                    str(start_year) + '-' + str(start_year + 1), match_url,
                    match_id, db_cur, conn)

                # outfile.write(url+os.linesep)
                # get_player_var_set(url, var_set)

    conn.close()
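
# A minimal invocation sketch, reusing the example values from the comments at
# the top of the function; it assumes whoscored.db and its player_stats table
# already exist (see scrape_single_match below).
scrape_1_year_data_of_1_league(
    'http://www.whoscored.com/Regions/252/Tournaments/2/Seasons/1849',
    start_week_num=30,
    num_weeks=48)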
Example #4
def get_data_from_pages(links): 
	# create an empty list for the scraped rows
	data = []
	# loop over the links produced by the previous function
	for urltext in links:
		# parse the url
		url = web.URL(urltext)
		# print each url for a "Matrix"-like effect (slower; comment this line out if you do not want it)
		print "Getting data from: ", url
		try:  # the main scraping loop; it is all about DOM manipulation
			# learn more about DOM at http://code.tutsplus.com/tutorials/javascript-and-the-dom-series-lesson-1--net-3134 
			dom = web.DOM(url.download(cached=False))
			for movie in dom.by_tag('td.title'):
				title = movie.by_tag('a')[0].content
				print title
				genres = movie.by_tag('span.genre')[0].by_tag('a')
				genres = [g.content for g in genres]
				print genres
				director = movie.by_tag('span.credit')[0].by_tag('a')[0].content
				print director
				first_actor = movie.by_tag('span.credit')[0].by_tag('a')[1].content
				print first_actor
				second_actor = movie.by_tag('span.credit')[0].by_tag('a')[2].content
				print second_actor
				runtime = movie.by_tag('span.runtime')[0].content
				print runtime
				rating = movie.by_tag('span.value')[0].content
				print rating
				data.append((title, genres, director, first_actor, second_actor, runtime, rating))	
		except KeyboardInterrupt:
			break  # allow Ctrl+C to stop the loop without losing the data collected so far
		except:
			pass  # skip movies with missing fields rather than stopping the whole run
	return data
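
# A short sketch of how the two IMDb helpers above could be chained; the page
# count here is an arbitrary choice, not a value from the original code.
links = get_links_from_page(5)       # urls of the first few result pages
movies = get_data_from_pages(links)  # list of (title, genres, director, ...) tuples
print '%d movies scraped' % len(movies)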
Example #5
 def test_url_query(self):
     # Assert URL.query and URL.querystring.
     v = web.URL(self.url)
     v.query["page"] = 10
     v.query["user"] = None
     self.assertEqual(v.query, {"q": 1, "page": 10, "user": None})
     self.assertEqual(v.querystring, "q=1&page=10&user="******"ünîcødé": 1.5}, "%C3%BCn%C3%AEc%C3%B8d%C3%A9=1.5")
     v.query = q[0]
     self.assertEqual(v.querystring, q[1])
     # Assert URL.query decodes unicode arguments.
     v = web.URL("http://domain.com?" + q[1])
     self.assertEqual(v.query, q[0])
     print "pattern.web.URL.query"
     print "pattern.web.URL.querystring"
Example #6
 def test_url_string(self):
     # Assert URL._set_string().
     v = web.URL("")
     v.string = "https://domain.com"
     self.assertEqual(v.parts[web.PROTOCOL], "https")
     self.assertEqual(v.parts[web.DOMAIN],   "domain.com")
     self.assertEqual(v.parts[web.PATH],     [])
     print "pattern.web.URL.string"
Example #7
 def _test_search_image_size(self, api, source, license, Engine):
     # Assert image URL's for different sizes actually exist.
     if api == "Yahoo" and license == ("",""): 
         return
     e = Engine(license, throttle=0.25)
     for size in (web.TINY, web.SMALL, web.MEDIUM, web.LARGE):
         v = e.search("cats", type=web.IMAGE, count=1, size=size, cached=False)
         self.assertEqual(web.URL(v[0].url).exists, True)
         print "pattern.web.%s.search(type=IMAGE, size=%s)" % (api, size.upper())
Example #8
 def test_url_open(self):
     # Assert URLError.
     v = web.URL(self.live.replace("http://", "htp://"))
     self.assertRaises(web.URLError, v.open)
     self.assertEqual(v.exists, False)
     # Assert HTTPError.
     v = web.URL(self.live + "iphone/android.html")
     self.assertRaises(web.HTTPError, v.open)
     self.assertRaises(web.HTTP404NotFound, v.open)
     self.assertEqual(v.exists, False)
     # Assert socket connection.
     v = web.URL(self.live)
     self.assertTrue(v.open() != None)
     self.assertEqual(v.exists, True)
     # Assert user-agent and referer.
     self.assertTrue(v.open(user_agent=web.MOZILLA, referrer=web.REFERRER) != None)
     print "pattern.web.URL.exists"
     print "pattern.web.URL.open()"
Example #9
 def test_url_download(self):
     t = time.time()
     v = web.URL(self.live).download(cached=False, throttle=0.25, unicode=True)
     t = time.time() - t
     # Assert unicode content.
     self.assertTrue(isinstance(v, unicode))
     # Assert download rate limiting.
     self.assertTrue(t >= 0.25)
     print "pattern.web.URL.download()"
Example #10
 def test_url_parts(self):
     # Assert URL._parse and URL.parts{}.
     v = web.URL(self.url)
     for a, b in ((web.PROTOCOL, self.parts["protocol"]),
                  (web.USERNAME, self.parts["username"]),
                  (web.PASSWORD, self.parts["password"]),
                  (web.DOMAIN,   self.parts["domain"]),
                  (web.PORT,     self.parts["port"]),
                  (web.PATH,     self.parts["path"]),
                  (web.PAGE,     self.parts["page"]),
                  (web.QUERY,    self.parts["query"]),
                  (web.ANCHOR,   self.parts["anchor"])):
         self.assertEqual(v.parts[a], b)
     print "pattern.web.URL.parts"
Example #11
 def test_url(self):
     # Assert URL.copy().
     v = web.URL(self.url)
     v = v.copy()
     # Assert URL.__setattr__().
     v.username = "******"
     v.password = "******"
     # Assert URL.__getattr__().
     self.assertEqual(v.method,   web.GET)
     self.assertEqual(v.protocol, self.parts["protocol"])
     self.assertEqual(v.username, "new-username")
     self.assertEqual(v.password, "new-password")
     self.assertEqual(v.domain,   self.parts["domain"])
     self.assertEqual(v.port,     self.parts["port"])
     self.assertEqual(v.path,     self.parts["path"])
     self.assertEqual(v.page,     self.parts["page"])
     self.assertEqual(v.query,    self.parts["query"])
     self.assertEqual(v.anchor,   self.parts["anchor"])
     print "pattern.web.URL"
Example #12
seed_url = 'http://www.whoscored.com/Regions/252/Tournaments/2/Seasons/3853'
start_week_num = 69
num_weeks = 10

html = web.URL(seed_url).download(cached=False)

regM = re.search("<h1>(.*?)</h1>", html, re.DOTALL)
league_name = regM.group(1)

regM = re.search("img/customstageheaders/(\d+)\.jpg", html, re.DOTALL)
league_season_id = regM.group(1)

regM = re.search("min = new Date\((\d+),", html, re.DOTALL)
start_year = int(regM.group(1))

# outfile = open('premier-league-2013-2014.txt', 'wb')
# var_set = Set()
conn = sqlite3.connect('whoscored.db')
db_cur = conn.cursor()
for i in range(start_week_num, start_week_num + num_weeks):
Example #13
 def test_url_redirect(self):
     # Assert URL redirected URL (this depends on where you are).
     # In Belgium, it yields "http://www.google.be/".
     v = web.URL(self.live).redirect
     print "pattern.web.URL.redirect: " + self.live + " => " + str(v)
Example #14
 def test_url_headers(self):
     # Assert URL headers.
     v = web.URL(self.live).headers["content-type"].split(";")[0]
     self.assertEqual(v, "text/html")
     print "pattern.web.URL.headers"
Example #15
 def test_url_mimetype(self):
     # Assert URL MIME-type.
     v = web.URL(self.live).mimetype
     self.assertTrue(v in web.MIMETYPE_WEBPAGE)
     print "pattern.web.URL.mimetype"
Example #16
def extract_data(package):
    (page, query) = package

    print "Checking %s" % page
    new_webpage = Webpage()
    new_webpage.url = page
    try:
        url = web.URL(page)
        mimetype = url.mimetype
        new_webpage.mimetype = mimetype
        print "Checking mimetype..."
        if mimetype == 'text/html':
            print "Mimetype ok (text/html)"  #only load Webpages!!!
            domain = url.domain  # u'domain.com'
            url_feed = ''
            redirected_page = url.redirect  # Actual URL after redirection, or None.
            path = url.path  # [u'path']
            # different options to open a webpage
            print "Opening %s" % page
            html = url.download(user_agent=choice(user_agents), cached=False)
            #html = urllib2.urlopen(page).read()
        else:
            print 'Wrong mimetype (not text/html)'
            new_webpage.successful_open = True
    except:
        print "Could not open page: %s" % page
        new_webpage.successful_open = False
    try:
        if check_query(query, str(html)):  # first make sure the query matches the full raw html
            new_webpage.successful_open = True

            dom = web.Document(html)
            try:
                title = dom.by_tag('title')[0]
                title = repr(web.plaintext(title.content))
                print "Setting page title to %s" % title
            except:
                print "No title found for %s" % page
                title = ''

            # two methods for charset detection:
            charset = None
            # option 1: detect the page encoding from the DOM structure
            # => does not seem to work; utf-8 is systematically retrieved
            # try:
            # 	metas=dom.by_tag('meta')
            # 	charset=looking4charset(metas)
            # 	print 'charset',charset, 'in page',page
            # except:
            # 	charset=None
            #

            # option 2: use the chardet library
            # if charset==None:
            # 	encoding = chardet.detect(html)
            # 	html=html.decode(encoding['encoding'])
            # else:
            # 	html=html.decode(charset)

            query_result, text_summary, html_summary = check_page_against_query(
                html, title, query)
            # charset guess can be used to decode results
            # if charset==None:
            # 	encoding = chardet.detect(html)
            # 	html=html.decode(encoding['encoding'])
            # else:
            # 	html=html.decode(charset)

            # save the textual summaries to an output directory
            #fileout=open('temp/'+page[7:20]+'.htm','w')
            #print 'temp/'+page+'.htm'
            #fileout.write(html_summary)
            #fileout.close()

            #if query_result:
            # dom = web.Document(html_summary)
            # try:
            # 	date = dom.by_tag('date')[0]
            # 	date = repr(plaintext(date.content))
            # except:
            # 	date=''
            # print '######date',date
            dateregexp = re.compile(r'(\d{4})[-/](\d{2})[-/](\d{2})')  # e.g. 2011-08-06 or 2011/08/06 in the url

            date = ''
            if not redirected_page == None:
                print 'redirected to: ', redirected_page
                try:
                    date = dateregexp.search(redirected_page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            else:
                try:
                    date = dateregexp.search(page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            #print '#############date',date

            if date == '':
                date_txt = pattern_date_fr.search(str(text_summary))
                if not date_txt == None:
                    date = date_txt.groups()
                    new_webpage.date = '-'.join(date)
            #date_txt=pattern_date_fr.search("Samedi 6 août 2011606/08/Août/201120:29")
            if query_result:
                try:
                    print 'page: ', new_webpage.url, ' with title: ', title, ' and date', new_webpage.date, 'was assessed as ', query_result
                except:
                    pass
            #print 'date_txt'
            #print 'date_txt:',str(date_txt)
            #feed webpage details with informations
            new_webpage.url_redirected = redirected_page
            new_webpage.html = html
            new_webpage.html_summary = html_summary
            new_webpage.text_summary = text_summary
            new_webpage.domain = domain
            new_webpage.query_result = query_result
            new_webpage.url_feed = url_feed
            new_webpage.path = path
            new_webpage.charset = charset
            new_webpage.title = title
            new_webpage.opened = new_webpage.opened + 1
            new_webpage.md5 = hashlib.sha224(text_summary).hexdigest()
            new_webpage.text_html = web.plaintext(html,
                                                  keep=[],
                                                  replace=web.blocks,
                                                  linebreaks=2,
                                                  indentation=False)
            #new_webpage.display_page()
            #new_webpage.links=None
        else:
            #the query is not even in the raw html
            new_webpage.successful_open = True
            new_webpage.query_result = False
    except:
        #print "*** Could not extract data from %s" % page
        pass
    return new_webpage
Example #17
		corpus_out = '/'.join(path.split('/')[:-1]) + '/'+query
		print corpus_out
		unzip_file_into_dir(path,corpus_out)
		path=corpus_out
		print 'Path: ',path

if seeks_search == 1:
	print "Seeks search enabled. Creating Seeks file in %s" % path
	make_seeds(query,path,nb_results=nb_results)

dirList=os.listdir(path)
print 'List of files in path: ',dirList
for fname in dirList[:]:
	pagelist =os.path.join(path,fname)
	try:
		url = web.URL(pagelist)
		chaine = url.download(cached=False)
		new_urls = map(lambda x: url_uniformer(x.split('">')[0]), web.find_urls(chaine, unique=True))
		if 'Google Search' in pagelist:
			new_urls = map(lambda x: x.split("&amp;")[0], new_urls)
		for new_url in new_urls[:]:
			print "Checking for forbidden URL..."
			if not check_forbidden((new_url,'')) and not new_url in pages:
				pages[new_url]=inlinks_min
	except:
		pass
print 'Pages init: ', len(pages)
print 'Pages: ', pages

print "Naming database..."
db_name=os.path.join(result_path,query+'_crawl.db')
def scrape_single_match(league_name, season, url, match_id, db_cur, conn):
    tmp_html = web.URL(url).download(cached=False)
    regM = re.search('href="(/Matches.{1,128}?)".{1,10}Match Centre', tmp_html,
                     re.DOTALL)
    match_centre_url = 'http://www.whoscored.com' + regM.group(1)
    player_stats_url = match_centre_url.replace('Live', 'LiveStatistics')

    html = web.URL(player_stats_url).download(cached=False)
    regM = re.search('var initialData = (.*?);', html, re.DOTALL)
    data = regM.group(1)

    while ',,' in data:
        data = data.replace(',,', ",' ',")

    data = ast.literal_eval(data)

    match_overview, match_details = data[0][0:2]
    match_time = match_overview[4]

    print match_overview[4], ':  ', match_overview[2], ' vs. ', \
        match_overview[3], ',  ', match_overview[12]

    for idx, team in enumerate(match_details):
        is_home_team = 'Y' if idx == 0 else 'N'
        team_id = team[0]
        team_name = team[1]

        player_stats = team[4]

        for player in player_stats:
            sql_stat = 'replace into player_stats(league_name,season,match_id,match_time,is_home_team,team_id,team_name,player_id,player_name,player_score,pos_category,pos_cur_match,substitution_flag,substitution_minute'

            player_id = player[0]
            player_name, player_score = player[1:3]

            pos_category, pos_cur_match = player[4:6]
            substitution_flag, substitution_minute = player[7:9]

            player_stats_detail = player[3][0]
            var_name_list = [var[0] for var in player_stats_detail]

            # check whether any variable is missing from the current table; if so, add a column for it
            var_name_set = set(var_name_list)
            table_schema = db_cur.execute(
                'PRAGMA table_info(player_stats);').fetchall()
            existing_var_list = [row[1] for row in table_schema]
            existing_var_set = set(existing_var_list)

            if not var_name_set.issubset(existing_var_set):
                new_vars = var_name_set.difference(existing_var_set)
                for v in new_vars:
                    db_cur.execute(
                        'alter table player_stats add column {0} float;'.format(v))
                    print 'Add New Variable: {0}'.format(v)

            var_name_str = ',' + ','.join(var_name_list)
            var_value_str = ',' + ','.join([
                "'" + var[1][0] + "'" if isinstance(var[1][0], str) else str(var[1][0])
                for var in player_stats_detail
            ])

            sql_stat = (sql_stat + var_name_str +
                        ") values('{0}','{1}',{2},'{3}','{4}',{5},'{6}',{7},'{8}',{9},{10},'{11}',{12},{13}".format(
                            league_name.replace("'", " "), season, match_id, match_time,
                            is_home_team, team_id, team_name.replace("'", " "), player_id,
                            player_name.replace("'", " "), player_score, pos_category, pos_cur_match,
                            substitution_flag, substitution_minute) +
                        var_value_str + ");")

            db_cur.execute(sql_stat)

    conn.commit()
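
# scrape_single_match issues "replace into player_stats(...)" and then alters
# the table to add per-stat float columns on the fly, so a base table has to
# exist beforehand. A plausible schema sketch: the column names come from the
# insert statement above, the types are inferred from how each value is quoted
# in the format string, and the primary key is an assumption (replace-into only
# de-duplicates rows if such a constraint exists).
import sqlite3

conn = sqlite3.connect('whoscored.db')
conn.execute('''
    create table if not exists player_stats (
        league_name         text,
        season              text,
        match_id            integer,
        match_time          text,
        is_home_team        text,
        team_id             integer,
        team_name           text,
        player_id           integer,
        player_name         text,
        player_score        float,
        pos_category        integer,
        pos_cur_match       text,
        substitution_flag   integer,
        substitution_minute integer,
        primary key (match_id, player_id)
    )''')
conn.commit()
conn.close()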