def test_one(url):
    """Fetch *url*, parse it, and print every styled object found.

    Smoke-test helper: downloads the page with HTMLObject, runs the HTML
    parser and the style parser over it, then dumps the resulting
    styled-object mapping to stdout, one ``key ==> value`` line each.
    """
    my_html = HTMLObject()
    my_html.getHTML(url)
    parser = myParser(myurl=url, domain=getDomain(url))
    parser.parse(my_html.page)
    styled_objs, no_style_objs = main_style_parse(
        parser.styled_objects, parser.full_style, True)
    # Single-argument print(...) is valid in both Python 2 and Python 3;
    # iterating items() avoids the redundant per-key dict lookup.
    for key, obj in styled_objs.items():
        print(key + ' ==> ' + str(obj))
class MyThread(threading.Thread):
    """Worker thread that downloads a single page via HTMLObject.

    Usage: construct, assign ``url``, then ``start()``.  The fetched page
    is available afterwards as ``self.myHTML.page``.
    """

    def __init__(self):
        # Public attributes callers set/read around run().
        self.status = ''
        self.url = ''
        self.myHTML = HTMLObject()
        threading.Thread.__init__(self)

    def run(self):
        """Fetch ``self.url`` into ``self.myHTML``.

        NOTE(review): threading discards ``run()``'s return value; callers
        must read ``self.myHTML.page`` directly.  The ``return`` is kept
        for direct (non-threaded) invocation.
        """
        self.myHTML.getHTML(self.url)
        return self.myHTML.page

    def status_to(self, string):
        """Record a status message on this thread."""
        self.status = string
#! /usr/local/bin/python
# Webcrawler / spider helper: command-line interface for HTMLObject.
# Reads a URL from standard input and prints every link on that page,
# prefixing relative links with the input URL.
from HTMLObject import HTMLObject
from sys import stdin, stdout

args = stdin.readline()
html = HTMLObject(args)
links = html.get_links()
args = args[:-1]  # drop the trailing newline read from stdin
for a, b, link, d in links:
    # Absolute links print as-is; relative links are resolved naively by
    # prefixing the base URL.  Single-arg print(...) works on Py2 and Py3.
    if link[0:4] == "http":
        print(link)
    else:
        print(args + link)
def __init__(self):
    """Set up thread state: empty url/status and a fresh HTMLObject."""
    self.url = ''
    self.status = ''
    self.myHTML = HTMLObject()
    # Initialize the Thread machinery after our own attributes exist.
    threading.Thread.__init__(self)
# NOTE(review): whitespace-mangled fragment of a crawler loop, collapsed onto
# one physical line.  The `break`/`continue` statements imply an enclosing
# loop outside this view, and the inline comment starting at
# `#cursor.excute("` comments out the ENTIRE remainder of the line, so the
# trailing `continue; except ...; conn.commit();` text is dead as written.
# Needs manual reconstruction against the original source; code left
# byte-identical pending that.
result = cursor.fetchall(); try: site.url = result[0][0]; dont_follow = False; for link in dont_follow_links: if(re.search(r'^'+link, site.url)): print 'not going to:', site.url; cursor.execute("UPDATE sites_sitequeue set crawled = 1, status=401 WHERE id = %d;" %result[0][1]); dont_follow = True; continue; if(dont_follow): continue; except Exception, e: print 'error 60:', e; break; myHTML = HTMLObject(); links = []; doc_id = result[0][1]; site.domain = result[0][2]; try: if(not myHTML.getHTML(site.url)): cursor.execute('UPDATE sites_sitequeue set status=400 WHERE id = %d;' %doc_id); conn.commit(); conn.close(); #cursor.excute(" continue; except Exception, e: cursor.execute('UPDATE sites_sitequeue set status=400 WHERE id = %d;' %doc_id); conn.commit();
# TestFile
# Minimal smoke test: fetch one known page and print its link list.
from __future__ import print_function
from HTMLObject import HTMLObject

html = HTMLObject("http://lxml.de/lxmlhtml.html#working-with-links")
links = html.get_links()
# Print the list we already extracted instead of calling get_links()
# a second time (the original re-ran the extraction just to print it).
print(links)
# NOTE(review): whitespace-mangled fragment from the middle of a crawler
# loop, collapsed onto one physical line.  It opens with an `if` that
# belonged inside a `for` over dont_follow_links, contains `except` clauses
# whose matching `try:` blocks are outside this view, and uses
# `continue`/`break` whose enclosing loop is not visible.  Too incomplete
# to reconstruct safely here; code left byte-identical pending recovery of
# the original source layout.
if(re.search(r'^'+link, site.url)): print 'not going to:', site.url; myDBConn.cursor.execute("UPDATE sites_sitequeue set crawled = 1, status=401 WHERE id = %d;" %result[0][1]); dont_follow = True; continue; if(dont_follow): continue; except Exception, e: logger.info('sleeping for 10 seconds to find if there are more sites coming.'); sleep(10); myDBConn.commit(); myDBConn.close(); myDBConn = databaseConn(); continue; #break; myHTML = HTMLObject(); links = []; doc_id = result[0][1]; site.domain = result[0][2]; try: if(not myHTML.getHTML(site.url)): myDBConn.cursor.execute('UPDATE sites_sitequeue set status=400 WHERE id = %d;' %doc_id); continue; except Exception, e: myDBConn.cursor.execute('UPDATE sites_sitequeue set status=400 WHERE id = %d;' %doc_id); continue; ##if we get here we are not blocked....yet.....reset the blocked counter if applicable## am_i_blocked_counter = 0;