Beispiel #1
0
def create_stop_words():
    """Build the stop-word list from the ``new_stop_words`` file.

    Opens the file named ``new_stop_words`` (a relative path, so it is
    resolved against the current working directory), removes newline
    characters from every line, and hands each resulting string to
    ``uummuuWord.append_word`` together with the list being built.

    Returns:
        list: the list that was passed to ``uummuuWord.append_word`` for
        every line of the file.

    Raises:
        IOError / OSError: if ``new_stop_words`` cannot be opened.
    """
    stop_list = []
    # The context manager guarantees the file handle is closed, even if
    # append_word raises part-way through the file.
    with open('new_stop_words', 'r') as stop_file:
        for line in stop_file:
            # Only '\n' is removed; any other whitespace is kept, and blank
            # lines are forwarded as empty strings.
            word = line.replace('\n', '')
            uummuuWord.append_word(stop_list, word)
    return stop_list
Beispiel #2
0
            print "found something other than text. Found:", myHTML.content_type.split("/")[0].lower()
            cursor.execute("UPDATE sites_sitequeue set status=501 WHERE id = %d;" % doc_id)
            continue

        try:
            if myHTML.redirected_url != "":
                print "got a redirected url:", myHTML.redirected_url
                redirected_domain = getDomain(myHTML.redirected_url)
                cursor.execute(
                    "INSERT INTO sites_sitequeue(url,crawled,domain,date_submitted, last_crawl, status) VALUES('%s',0,'%s',now(),now(),200);"
                    % (myHTML.redirected_url, redirected_domain)
                )
                cursor.execute("UPDATE sites_sitequeue set last_crawl = now() where id = %d;" % doc_id)
                cursor.execute("SELECT id FROM sites_sitequeue WHERE url = '%s';" % myHTML.redirected_url)
                redirected_to = cursor.fetchall()
                uummuuWord.append_word(link_list, redirected_to[0][0])
            if getDomain(site.url) != site.domain:
                site.domain = getDomain(site.url)
                cursor.execute("UPDATE sites_sitequeue SET domain = '%s' where id = %d;" % (site.domain, doc_id))
        except Exception, e:
            print "error 216:", e

            cursor.execute("UPDATE sites_sitequeue set status=300, crawled=1 where id = %d;" % (doc_id))
            conn.commit()
            continue
        ##initialize the parser ##
        parser = myParser(myurl=site.url, domain=site.domain)

        try:
            ## run the parser on the html from this page.##
            parser.parse(myHTML.page)