Code Example #1
File: uummuuObjects.py Project: DavidBrear/UUMMUU
from HTMLObject import HTMLObject
# Assumed imports: the snippet does not show where myParser, getDomain and
# main_style_parse come from; they are presumed to live in the project's
# myParser module.
from myParser import myParser, getDomain, main_style_parse

def test_one(url):
    # Fetch the page, parse it, and print every styled object that was found.
    myHTML = HTMLObject()
    myHTML.getHTML(url)
    parser = myParser(myurl=url, domain=getDomain(url))
    parser.parse(myHTML.page)
    styled_objs, no_style_objs = main_style_parse(parser.styled_objects, parser.full_style, True)

    for key in styled_objs:
        print key + ' ==> ' + str(styled_objs[key])
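A minimal invocation of the helper above; the URL is an arbitrary placeholder:

test_one('http://example.com/')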
Code Example #2
File: myParser.py Project: DavidBrear/UUMMUU
import threading

from HTMLObject import HTMLObject

class MyThread(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)
        self.status = ''
        self.url = ''
        # Each thread carries its own HTMLObject; the caller sets self.url
        # before starting the thread.
        self.myHTML = HTMLObject()

    def run(self):
        # The fetched page ends up on self.myHTML.page; the value returned
        # here is discarded by the threading machinery.
        self.myHTML.getHTML(self.url)
        return self.myHTML.page

    def status_to(self, string):
        self.status = string
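A minimal usage sketch for the thread above, assuming the caller assigns url before starting it (the URL is a placeholder):

t = MyThread()
t.url = 'http://example.com/'
t.start()
t.join()
print t.myHTML.page  # the HTML fetched by run()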
Code Example #3
#! /usr/local/bin/python
# webcrawler and spider.add, a* algos
# Command line interface for HTMLObject; reads a URL from standard input.

from HTMLObject import HTMLObject
from sys import stdin

# Strip the trailing newline before the URL is used, not after it has
# already been passed to HTMLObject.
url = stdin.readline().strip()

html = HTMLObject(url)
links = html.get_links()

# The 4-tuples appear to follow lxml's iterlinks convention:
# (element, attribute, link, pos).
for element, attribute, link, pos in links:
    if link.startswith("http"):
        print link          # already absolute
    else:
        print url + link    # naive join of a relative link onto the input URL
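Prefixing relative links with the raw input URL only works when the link happens to be relative to it. A more robust alternative (not the project's code) is the standard library's urljoin, sketched here:

from urlparse import urljoin  # Python 2 location of urljoin

# urljoin handles absolute links, root-relative paths and ../ segments.
print urljoin('http://lxml.de/lxmlhtml.html', 'api.html')  # http://lxml.de/api.html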
Code Example #4
File: urlReader.py Project: DavidBrear/UUMMUU
result = cursor.fetchall()
try:
    site.url = result[0][0]
    dont_follow = False
    # Skip any URL whose prefix appears on the do-not-follow list.
    for link in dont_follow_links:
        if re.search(r'^' + link, site.url):
            print 'not going to:', site.url
            cursor.execute("UPDATE sites_sitequeue SET crawled = 1, status = 401 WHERE id = %d;" % result[0][1])
            dont_follow = True
            break  # one match is enough
    if dont_follow:
        continue
except Exception, e:
    print 'error 60:', e
    break

myHTML = HTMLObject()
links = []

doc_id = result[0][1]
site.domain = result[0][2]
try:
    # A failed fetch marks the queue row with status 400 and moves on.
    if not myHTML.getHTML(site.url):
        cursor.execute('UPDATE sites_sitequeue SET status = 400 WHERE id = %d;' % doc_id)
        conn.commit()
        conn.close()
        continue

except Exception, e:
    cursor.execute('UPDATE sites_sitequeue SET status = 400 WHERE id = %d;' % doc_id)
    conn.commit()
Code Example #5
# TestFile
from __future__ import print_function
from HTMLObject import HTMLObject

html = HTMLObject("http://lxml.de/lxmlhtml.html#working-with-links")
links = html.get_links()
print(links)  # reuse the result instead of fetching the links a second time
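Judging from the unpacking in Code Example #3, get_links() appears to yield 4-tuples in lxml's iterlinks style, (element, attribute, link, pos). Under that assumption, keeping only the URLs looks like this:

urls = [link for element, attribute, link, pos in links]
print(urls)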
Code Example #6
File: urlReader.py Project: DavidBrear/UUMMUU
        if re.search(r'^' + link, site.url):
            print 'not going to:', site.url
            myDBConn.cursor.execute("UPDATE sites_sitequeue SET crawled = 1, status = 401 WHERE id = %d;" % result[0][1])
            dont_follow = True
            break  # one match is enough
    if dont_follow:
        continue
except Exception, e:
    # The queue looked empty; wait, then reconnect and poll again.
    logger.info('sleeping for 10 seconds to see if more sites are coming.')
    sleep(10)
    myDBConn.commit()
    myDBConn.close()
    myDBConn = databaseConn()
    continue

myHTML = HTMLObject()
links = []

doc_id = result[0][1]
site.domain = result[0][2]
try:
    # A failed fetch marks the queue row with status 400 and moves on.
    if not myHTML.getHTML(site.url):
        myDBConn.cursor.execute('UPDATE sites_sitequeue SET status = 400 WHERE id = %d;' % doc_id)
        continue

except Exception, e:
    myDBConn.cursor.execute('UPDATE sites_sitequeue SET status = 400 WHERE id = %d;' % doc_id)
    continue

# If execution reaches here the crawler is not blocked (yet); reset the counter.
am_i_blocked_counter = 0
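Code Examples #4 and #6 are both bodies of a crawl-queue polling loop that the snippets do not show. A minimal sketch of the surrounding structure: the table and column names come from the UPDATE statements above, while the loop and the SELECT statement are assumptions.

while True:
    # Hypothetical query; the fragments imply result rows of (url, id, domain).
    cursor.execute("SELECT url, id, domain FROM sites_sitequeue WHERE crawled = 0 LIMIT 1;")
    result = cursor.fetchall()
    # ... the fragment above then filters, fetches, and updates the row's status ...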