Example #1
def cleanUp():
    '''Delete the existing session and clean up the database.'''
    global conn
    print("\nAre you SURE you want to perform a clean up?")
    print("This will STOP and DELETE any running session.")
    s = input("Type 'YES' to confirm: ")
    if s.strip() == "YES":
        # connect to the DB if not already connected
        connect()
        print(utils.clean_message())
        sql.clean(conn)
        end_session(2)
    else:
        print("\nNo changes made. Exiting.")
        end_session(2)
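# A tiny, generic sketch of the confirm-before-destructive-action pattern used
# in cleanUp(); the helper name is hypothetical and it does not touch the database.
def confirm(prompt="Type 'YES' to confirm: "):
    # Returns True only on an exact, case-sensitive 'YES'.
    return input(prompt).strip() == "YES"

# if confirm():
#     ...run the destructive step...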
def get_links(soup, url, regex=r'^.*$'):
    """Return the set of cleaned, absolute URLs linked from `soup`."""
    # `regex` is currently unused here; link filtering happens in crawl().
    if not soup:
        return set()
    return {
        clean(urllib.parse.urljoin(url, a.get('href')))
        for a in soup.find_all('a')
        if a.get('href')
    }
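# A self-contained usage sketch of the link-extraction idea in get_links(),
# with an inline HTML snippet instead of a live page. It assumes the bs4
# package is installed; the URLs are illustrative only.
from urllib.parse import urljoin
from bs4 import BeautifulSoup

_html = '<a href="/about">About</a> <a href="http://example.org/x">X</a>'
_soup = BeautifulSoup(_html, 'html.parser')
_links = {urljoin('http://example.com/', a.get('href')) for a in _soup.find_all('a')}
# _links == {'http://example.com/about', 'http://example.org/x'}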
def stem(p):
    """Strip surrounding punctuation, Porter-stem, uppercase, and truncate a token."""
    m = re.match(r"^\s*[<(]*(.*?)[.,?!:)>/]*\s*$", p)
    if m:
        p = m.group(1)
    p = porter.stem(p, 0, len(p) - 1).upper()[:25]
    p = clean(p)
    return p
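# A stand-alone sketch of the same token normalisation, using NLTK's
# PorterStemmer in place of the project's local `porter` helper (whose
# stem(word, start, end) signature is assumed above). Purely illustrative.
import re
from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()

def stem_token(token):
    # Strip surrounding punctuation, stem, uppercase, and truncate to 25 chars.
    m = re.match(r"^\s*[<(]*(.*?)[.,?!:)>/]*\s*$", token)
    if m:
        token = m.group(1)
    return _stemmer.stem(token).upper()[:25]

# stem_token('(running)') == 'RUN'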
def crawl(root, regex=r'^.*$', level=1, quiet=False):
    cursor = db.cursor()

    RootPageID = select_or_insert(db, 'Webpage', url=root,
                                  quiet=quiet)[0]['WID']
    CrawlID = insert1(db, 'Crawl', rootwid=RootPageID, nLevels=level)[0]['CID']

    soup = mine(root, cid=CrawlID, wid=RootPageID, quiet=quiet, regex=regex)

    discovered = get_links(soup, root)
    discovered = filter(lambda link: re.match(regex, link), discovered)
    discovered_wids = set()
    for url in discovered:
        curl = clean(url)
        discovered_wids.add(
            select_or_insert(db, 'Webpage', url=str(curl))[0]['WID'])
    discovered_wids = list(discovered_wids)
    discovered_wids.sort()
    discovered_wids = [str(wid) for wid in discovered_wids]
    if discovered_wids:
        query = 'UPDATE Webpage SET newCID={cid} WHERE wid IN ({wids});'.format(
            cid=CrawlID, wids=','.join(discovered_wids))
        if not quiet:
            print(query)
        cursor.execute(query)
    else:
        print('-- No links')
    insert(db, 'Link', [{
        'fromWID': RootPageID,
        'toWID': wid
    } for wid in discovered_wids])

    while level > 0:
        query = 'SELECT wid, url FROM Webpage WHERE newCID = {cid} AND mined=False;'.format(
            cid=CrawlID)
        if not quiet:
            print(query)
        cursor = db.cursor()
        cursor.execute(query)
        for wid, url in cursor:
            # print url
            mine(url, cid=CrawlID, wid=wid, quiet=quiet, regex=regex)
            # crawl(url,level-1,CrawlID,quiet=quiet)
        level -= 1
        print('-- Level:', level)
    query = 'UPDATE Crawl SET endtime="{now}" WHERE cid = {cid};'.format(
        cid=CrawlID, now=datetime.datetime.now())
    if not quiet:
        print(query)
    cursor.execute(query)
    db.close()
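# The UPDATE above is built with str.format; a minimal sketch of the same
# statement using DB-API placeholders instead, with sqlite3 standing in for
# the project's (undefined here) `db` connection. Table and column names are
# taken from the queries above; the helper name is hypothetical.
import sqlite3

def mark_discovered(conn, crawl_id, wids):
    # Tag every discovered page with the crawl that found it.
    if not wids:
        return
    placeholders = ','.join('?' for _ in wids)
    query = 'UPDATE Webpage SET newCID=? WHERE wid IN ({})'.format(placeholders)
    cur = conn.cursor()
    cur.execute(query, [crawl_id] + list(wids))
    conn.commit()
    cur.close()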
def crawl(root, regex=r'^.*$', level=1, quiet=False):

	RootPageID = Webpage.select_or_insert(url=root)['WID']
	CrawlID = Crawl.insert1(rootwid=RootPageID, nLevels=level, access=True)['CID']
	# exit()

	soup = mine(root, cid=CrawlID, wid=RootPageID, quiet=quiet, regex=regex)

	discovered = get_links(soup, root)
	discovered = filter(lambda link: re.match(regex, link), discovered)
	discovered_wids = set()
	for url in discovered:
		curl = clean(url)
		discovered_wids.add(Webpage.select_or_insert(url=str(curl))['WID'])
	discovered_wids = list(discovered_wids)
	discovered_wids.sort()
	discovered_wids = [str(wid) for wid in discovered_wids]
	if discovered_wids:
		UpdateWebpage = db.cursor()
		query = 'UPDATE Webpage SET newCID={cid} WHERE wid IN ({wids});'.format(cid=CrawlID, wids=','.join(discovered_wids))
		if not quiet:
			print(query)
		UpdateWebpage.execute(query)
		UpdateWebpage.close()
		consolidate_all_webpages()
	else:
		print('-- No links')
	Link.insertlod([{'fromWID': RootPageID, 'toWID': wid} for wid in discovered_wids])

	while level > 0:
		breakpoint(CrawlID)
		SelectURL = db.cursor()
		query = 'SELECT wid, url FROM Webpage WHERE newCID = {cid} AND mined=False;'.format(cid=CrawlID)
		if not quiet:
			print(query)
		SelectURL.execute(query)
		for wid, url in SelectURL:
			mine(url, cid=CrawlID, wid=wid, quiet=quiet, regex=regex)
			breakpoint(CrawlID)
		level -= 1
		print('-- Level:', level)
		SelectURL.close()
	FinalUpdate = db.cursor()
	query = 'UPDATE Crawl SET endtime="{now}" WHERE cid = {cid};'.format(cid=CrawlID, now=datetime.datetime.now())
	if not quiet:
		print(query)
	FinalUpdate.execute(query)
	FinalUpdate.close()
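# The while-loop above widens the crawl one level at a time; this is the same
# breadth-first idea on an in-memory link graph, kept here only to make the
# control flow explicit. The graph literal below is illustrative.
def bfs_levels(graph, root, levels):
    frontier = {root}
    seen = {root}
    for _ in range(levels):
        next_frontier = set()
        for page in frontier:
            for linked in graph.get(page, ()):
                if linked not in seen:
                    seen.add(linked)
                    next_frontier.add(linked)
        frontier = next_frontier
    return seen

# bfs_levels({'a': ['b'], 'b': ['c']}, 'a', 1) == {'a', 'b'}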
def stem(p):
	"""Extract the first alphanumeric run, Porter-stem, uppercase, and drop non-ASCII tokens."""
	m = re.match(r"^.*?([A-Za-z0-9']+).*$", p)
	# m = re.match(r"^\s*[<(]*(\w*?)[.,?!:)>/]*\s*$", p)
	if m:
		# print m.groups()
		p = m.groups()[0]
	p = porter.stem(p, 0, len(p)-1).upper()
	p = clean(p)
	if '\\' in p or max([ ord(c) for c in p ] + [0]) > 127:
		return ''
	# if '.' in p:
	# 	return stem(re.match(r"^(.*?)\.", p).groups()[0].lower())
	# # 	return stem(re.match(r"^(.*?)\xe2", p).groups()[0].lower())
	# ultraascii = re.match(r"^(.*?)[^[:ascii:]]", p)
	# if ultraascii:
	# 	return stem(ultraascii.groups()[0].lower())
	p = p.replace('"','')
	return p[:25]
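# The backslash / ord() guard above rejects tokens containing non-ASCII bytes;
# on Python 3.7+ the same check can be written with str.isascii(). A tiny,
# illustrative equivalent:
def is_clean_ascii(token):
    return '\\' not in token and token.isascii()

# is_clean_ascii('RUN')   -> True
# is_clean_ascii('CAFÉ')  -> False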