Example #1
0
	def audit(self, origin, response):
		"""
			Crawl every URL waiting in self.QUEUES until the queue is empty.

			Each URL taken from the queue is appended to self.RESULTS and
			passed to self.connect_getdata; the links that self.get_links
			extracts from the returned data are queued unless
			self.is_in_results or self.is_in_queues already reports them.

			:origin: original url; not read by this method.
			:response: not read by this method (downloaded data is kept in
			a local variable instead).
			:return: self.RESULTS, the list of url objects crawled so far.
		"""
		while self.QUEUES:
			url_ = self.QUEUES.pop()
			self.debug("       [*] Crawling URL: " + url_.get_url())  # print debug
			self.RESULTS.append(url_)
			# Only the second value is used for link extraction; the header is discarded.
			_, body = self.connect_getdata(url_.domain, url_.port, url_.get_module())
			for link in self.get_links(body, self.domain, self.port, url_.folder):
				url = URL(link)
				if self.is_in_results(url) or self.is_in_queues(url):
					continue
				# New links go to the front while pop() takes from the back,
				# so (assuming QUEUES is a list) URLs are crawled first-in, first-out.
				self.QUEUES.insert(0, url)
				self.debug(url.get_url())
				self.debug_socket(url.get_url())
			# Drop falsy entries but keep RESULTS a real list: on Python 3
			# filter() returns a lazy iterator, which would break the next append().
			self.RESULTS = [entry for entry in self.RESULTS if entry]
		return self.RESULTS
Example #2
0
import sys
from DB import DB
from URL import URL

# Database handle for 'citeseerx.db'; create_tables() runs on every start,
# before the command line is inspected.
db = DB('citeseerx.db')
db.create_tables()
# db.del_all()

# Expects exactly one command-line argument, the URL of the page to record,
# e.g. http://citeseerx.ist.psu.edu/viewdoc/summary?cid=16057
if len(sys.argv) == 2:
    url = URL(sys.argv[1])
    url.open()
    # Record the DOI and URL reported by the URL object in the 'link' table.
    db.insert('link', {'doi': url.get_doi(), 'url': url.get_url()})
else:
    # Parenthesised so the line is a valid print statement on Python 2 and a
    # function call on Python 3; the printed text is the same on both.
    print('Please supply proper URL.')
Example #3
0
            })

    # add citations
    cit_html = url.get_citations()
    soup = BeautifulSoup(cit_html, "html.parser")
    trs = soup.findAll('tr', {'class': None, 'id': None})
    for tr in trs:
        td = tr.findAll('td')[1]
        a = td.find('a')
        href = a['href']
        if (href.find('viewdoc') >= 0):
            urlt = 'http://citeseerx.ist.psu.edu/viewdoc/summary' + href[
                href.find('?'):]
            urlt = URL(urlt)
            urlt.open()
            print ' -> ', urlt.get_url()
            if (urlt.status_ok()):
                # print tr.find('p', {'class': 'citationContext'})
                if tr.find('p', {'class': 'citationContext'}):
                    context = tr.find('p', {
                        'class': 'citationContext'
                    }).findAll(text=True)[0]
                else:
                    context = ''
                if not db.exists('citations', {
                        'doi_f': url.get_doi(),
                        'doi_t': urlt.get_doi()
                }):
                    db.insert(
                        'citations', {
                            'doi_f': url.get_doi(),