Example #1
    def extract(self, response):
        crawled = self.get_crawled()

        sel = Selector(response)
        trs = sel.xpath('/html/body/table[1]/tr/td[3]/table[2]/tr[1]/td/form/table[2]/table/table/tr')

        # Skip the header rows at the top and the footer rows at the bottom of the archive table.
        for tr in trs[3:-6]:
            item = ProblemItem()
            tds = tr.xpath('.//td[@class="statText"]')
            item['name'] = extract_text(tds[0].xpath('./a/text()'))
            item['href'] = urljoin(index_url, extract_text(tds[0].xpath('./a/@href')))
            # Pull the problem id ('pm') and round id ('rd') out of the link query strings.
            item['problem_id'] = {param.split('=')[0]: param.split('=')[1] for param in urlparse(item['href']).query.split('&')}['pm']
            match_href = extract_text(tds[1].xpath('./a/@href'))
            item['match_id'] = {param.split('=')[0]: param.split('=')[1] for param in urlparse(match_href).query.split('&')}['rd']
            item['date'] = extract_text(tds[2].xpath('./text()'))
            item['writer'] = extract_text(tds[3].xpath('./a/text()'))
            item['categary'] = extract_text(tds[4].xpath('./text()'))  # field name as defined in ProblemItem
            item['level_div1'] = extract_text(tds[5].xpath('./text()'))
            item['rate_div1'] = extract_text(tds[6].xpath('./text()'))
            item['level_div2'] = extract_text(tds[7].xpath('./text()'))
            item['rate_div2'] = extract_text(tds[8].xpath('./text()'))

            # Stop once a match that has already been crawled is reached.
            if item['match_id'] in crawled:
                break
            yield Request(url=item['href'], callback=self.crawl_content, meta={'item': item})
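The inline dict comprehensions above re-implement query-string parsing and assume every parameter contains exactly one '='. A more robust equivalent uses the standard library; this is only a sketch, and query_param is not a helper from the project:

from urllib.parse import urlparse, parse_qs  # the urlparse module on Python 2

def query_param(url, key):
    # Return the first value of `key` in the URL's query string, or None if absent.
    values = parse_qs(urlparse(url).query).get(key)
    return values[0] if values else None

# problem_id = query_param(item['href'], 'pm')
# match_id = query_param(match_href, 'rd')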
Example #2
    def crawl_content(self, response):
        problem_id = response.meta['problem_id']
        code_id = response.meta['code_id']
        solution_id = response.meta['solution_id']
        match_id = response.meta['match_id']

        languages = ['Java', 'C++', 'C#', 'VB', 'Python']

        item = SolutionItem()
        item['problem_id'] = problem_id
        item['match_id'] = match_id
        item['solution_id'] = solution_id
        item['language'] = languages[code_id]
        item['status'] = 'Success'

        sel = Selector(response)
        trs = sel.xpath('/html/body/table[1]/tr/td[3]/table[2]/tr[1]/td/table[2]').xpath('.//tr')

        # Find the row whose first cell carries the "problemText" class; it holds the source code.
        code_tr = None
        for tr in trs:
            td0 = tr.xpath('./td[1]/@class')
            if extract_text(td0) == 'problemText':
                code_tr = tr
                break
        if code_tr is None:
            log.msg('No code found! Please check match_id: %s, problem_id: %s, solution_id: %s!'
                    % (match_id, problem_id, solution_id), level=log.WARNING)
            item['status'] = 'Failed!1'
            return item

        content = extract_text(code_tr.xpath('./td'))

        # Write the solution source to disk, one file per problem/language pair.
        import io
        with io.open(join(code_path, problem_id + '_' + languages[code_id]), 'w', encoding='utf8') as f:
            f.write(content)
        return item
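SolutionItem (like ProblemItem above) is defined elsewhere in the project. A minimal sketch of how such an item class is typically declared in Scrapy, covering just the fields used here; the real items.py may differ:

import scrapy

class SolutionItem(scrapy.Item):
    # Sketch only: fields referenced by crawl_content above.
    problem_id = scrapy.Field()
    match_id = scrapy.Field()
    solution_id = scrapy.Field()
    language = scrapy.Field()
    status = scrapy.Field()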
Example #3
    def crawl_content(self, response):
        item = response.meta['item']
        sel = Selector(response)
        content = extract_text(sel.xpath('/html/body/table[1]/tr/td[3]/table[2]/tr[1]/td/table/tr[6]/td'))

        # Store the problem statement on disk and keep only the file path on the item.
        item['content'] = join(problem_content_path, str(item['problem_id']))
        import io
        with io.open(item['content'], 'w', encoding='utf8') as f:
            f.write(content)
        return item
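The scraping snippets lean on an extract_text helper that is not shown. One plausible minimal implementation, assuming it flattens whatever a selector (or selector list) matches into a single stripped string; the project's real helper may also strip tags or normalise whitespace:

def extract_text(selector):
    # Assumed behaviour: join every string the selector extracts and trim whitespace.
    return ''.join(selector.extract()).strip()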
Example #4
    def extract(self, response):
        items = []
        sel = Selector(response)
        trs = sel.xpath('/html/body/table[1]/tr/td[3]/table[3]/tr')
        for tr in trs[3:]:
            tds = tr.xpath('.//td')

            # Division 1 result occupies the left half of the row.
            item = MatchDetailItem()
            item['match_id'] = response.meta['match_id']
            item['division'] = 1
            item['username'] = extract_text(tds[1].xpath('./a/text()'))
            item['score'] = extract_text(tds[2].xpath('./text()'))
            item['place'] = extract_text(tds[3].xpath('./text()'))
            items.append(item)

            # Rows without a right half carry no Division 2 result.
            if len(tds) < 6:
                continue

            item = MatchDetailItem()
            item['match_id'] = response.meta['match_id']
            item['division'] = 2
            item['username'] = extract_text(tds[6].xpath('./a/text()'))
            item['score'] = extract_text(tds[7].xpath('./text()'))
            item['place'] = extract_text(tds[8].xpath('./text()'))
            items.append(item)
        return items
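This extract() expects match_id to be present in response.meta, so the results page has to be scheduled with that key. A minimal sketch of the scheduling side; the schedule_match name, URL variable, and match id are placeholders, not part of the project:

from scrapy import Request

def schedule_match(self, match_detail_url, match_id):
    # Hypothetical: match_detail_url and match_id come from wherever the
    # spider discovers match links.
    yield Request(url=match_detail_url, callback=self.extract,
                  meta={'match_id': match_id})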
Example #5
    def extract(self, response):
        sel = Selector(response)
        trs = sel.xpath('/html/body/table[1]/tr/td[3]/div/table[3]').xpath('.//tr')

        # Locate the "Top Submission" row, which links to the best solution per language.
        code_tr = None
        for tr in trs:
            row_name = extract_text(tr.xpath('./td[1]/text()'))
            if row_name == 'Top Submission':
                code_tr = tr
                break

        if code_tr is None:
            log.msg('No Top Submission row found! Please check match_id: %s problem_id: %s!'
                    % (response.meta['match_id'], response.meta['problem_id']), level=log.WARNING)
            return

        tds = code_tr.xpath('.//td')
        # Each inner cell links to one language's top solution; the cell index doubles as code_id.
        for (i, td) in enumerate(tds[1:-1]):
            href = urljoin(index_url, extract_text(td.xpath('./a/@href')))
            solution_id = get_param(href, 'cr')
            if solution_id is None:
                continue
            href = problem_solution_url % (solution_id, response.meta['match_id'], response.meta['problem_id'])
            d = {'match_id': response.meta['match_id'],
                 'problem_id': response.meta['problem_id'],
                 'solution_id': solution_id,
                 'code_id': i}
            yield Request(url=href, callback=self.crawl_content, meta=d)
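get_param is another helper the snippets do not include; presumably it performs the same query-string lookup as the query_param sketch after Example #1. A sketch of the assumed behaviour:

from urllib.parse import urlparse, parse_qs  # the urlparse module on Python 2

def get_param(url, key):
    # Assumed behaviour: value of `key` in the URL's query string, or None if absent.
    values = parse_qs(urlparse(url).query).get(key)
    return values[0] if values else None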
Example #6
    def classify(self, html):
        # Strip markup from the page, vectorise the plain text, and return the predicted label.
        text = tools.extract_text(html)
        X = self._vocabulary.transform([text])
        y = self._classifier.predict(X)[0]
        return y
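This snippet assumes a fitted self._vocabulary (a vectorizer with a transform method), a fitted self._classifier (an estimator with predict), and a tools.extract_text that converts HTML to plain text. A minimal sketch of how the two models could be prepared with scikit-learn; the vectorizer, estimator, and training data below are assumptions, not the project's actual setup:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Hypothetical training step: `texts` and `labels` stand in for whatever
# labelled pages the project actually uses.
texts = ['first training page text', 'second training page text']
labels = [0, 1]

vocabulary = TfidfVectorizer()
X_train = vocabulary.fit_transform(texts)            # learn the vocabulary and vectorise
classifier = LogisticRegression().fit(X_train, labels)

# The spider would then keep these as self._vocabulary and self._classifier.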