# NOTE(review): this chunk was flattened onto one physical line. The
# statements before the __main__ guard appear to be the tail of the
# crawler's entry-point function (`doit`, whose `def` header is outside
# this view) -- indentation reconstructed; confirm against the full file.

# Have psycopg2 hand back text columns as unicode objects rather than raw bytes.
psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
conn = psycopg2.connect(cp.get("search", "db"))
curs = conn.cursor()

# Start by indexing the main website
log("Starting indexing of main website")
# Site id 1 is www.postgresql.org, crawled from its sitemap via the
# frontend IP taken from the [search] section of the config.
SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip"), True).crawl()
conn.commit()

# Skip id=1, which is the main site..
curs.execute("SELECT id, hostname FROM sites WHERE id>1")
for siteid, hostname in curs.fetchall():
    log("Starting indexing of %s" % hostname)
    GenericSiteCrawler(hostname, conn, siteid).crawl()
    # Commit after each site so a later failure does not roll back this one.
    conn.commit()

# Refresh the cached per-site page counts from the pages actually stored.
curs.execute(
    "WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site"
)
conn.commit()

# Brief pause before returning -- presumably to let worker threads /
# buffered log output settle; TODO confirm why this is needed.
time.sleep(1)

if __name__ == "__main__":
    cp = ConfigParser()
    cp.read("search.ini")
    threadwrapper(doit)
# NOTE(review): this chunk was flattened onto one physical line. The
# statements before the __main__ guard appear to be the tail of the
# mailing-list crawler's entry-point function (`doit`, whose `def` header
# is outside this view) -- indentation reconstructed; confirm against the
# full file.

# Refresh the cached per-list message counts from the messages actually stored.
curs.execute(
    "WITH t AS (SELECT list,count(*) AS c FROM messages GROUP BY list) UPDATE lists SET pagecount=t.c FROM t WHERE id=t.list"
)
# Indicate when we crawled
curs.execute("UPDATE lastcrawl SET lastcrawl=CURRENT_TIMESTAMP")
conn.commit()
log("Indexed %s messages" % n)
# Brief pause before returning -- presumably to let worker threads /
# buffered log output settle; TODO confirm why this is needed.
time.sleep(1)

if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-l", "--list", dest="list", help="Crawl only this list")
    parser.add_option("-m", "--month", dest="month", help="Crawl only this month")
    parser.add_option("-f", "--full", dest="full", action="store_true", help="Make a full crawl")
    parser.add_option("-t", "--status-interval", dest="status_interval", help="Seconds between status updates")
    parser.add_option("-c", "--commit-interval", dest="commit_interval", help="Messages between each commit")
    (opt, args) = parser.parse_args()

    if opt.full and opt.month:
        # Parenthesized single-argument print behaves identically on
        # Python 2 and 3; the original `print "..."` statement is a
        # syntax error under Python 3.
        print("Can't use both full and specific month!")
        sys.exit(1)

    # Assign default values. A conditional expression replaces the original
    # `x and int(x) or default` idiom, which silently turned an explicit
    # "0" into the default (int("0") is falsy, so the `or` arm fired).
    opt.status_interval = int(opt.status_interval) if opt.status_interval else 30
    opt.commit_interval = int(opt.commit_interval) if opt.commit_interval else 500

    threadwrapper(doit, opt)
# NOTE(review): this chunk was flattened onto one physical line. The
# statements before the __main__ guard appear to be the tail of the
# crawler's entry-point function (`doit`, whose `def` header is outside
# this view) -- indentation reconstructed; confirm against the full file.
# This variant of the script also carries a per-site `https` flag.

conn = psycopg2.connect(cp.get("search", "db"))
curs = conn.cursor()

# Start by indexing the main website
log("Starting indexing of main website")
# Site id 1 is www.postgresql.org, crawled from its sitemap via the
# frontend IP taken from the [search] section of the config.
SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip"), True).crawl()
conn.commit()

# Skip id=1, which is the main site..
curs.execute("SELECT id, hostname, https FROM sites WHERE id>1")
for siteid, hostname, https in curs.fetchall():
    log("Starting indexing of %s" % hostname)
    # `https` is passed through to the crawler; presumably it selects the
    # URL scheme used when fetching -- confirm against GenericSiteCrawler.
    GenericSiteCrawler(hostname, conn, siteid, https).crawl()
    # Commit after each site so a later failure does not roll back this one.
    conn.commit()

# Refresh the cached per-site page counts from the pages actually stored.
curs.execute(
    "WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site"
)
conn.commit()

# Brief pause before returning -- presumably to let worker threads /
# buffered log output settle; TODO confirm why this is needed.
time.sleep(1)

if __name__ == "__main__":
    cp = ConfigParser()
    cp.read("search.ini")
    threadwrapper(doit)
# NOTE(review): this chunk begins mid-statement (inside a
# parser.add_option(...) call whose opening is outside this view) and was
# flattened onto one line, so it cannot be reformatted without guessing
# the missing prefix -- left byte-identical. It duplicates the
# option-parsing tail of the mailing-list crawler and inherits the same
# Python 2 `print` statement and the fragile `x and int(x) or default`
# idiom (an explicit "0" becomes the default); fix when the full file is
# in view.
"--month", dest='month', help="Crawl only this month") parser.add_option("-f", "--full", dest='full', action="store_true", help="Make a full crawl") parser.add_option("-t", "--status-interval", dest='status_interval', help="Seconds between status updates") parser.add_option("-c", "--commit-interval", dest='commit_interval', help="Messages between each commit") (opt, args) = parser.parse_args() if opt.full and opt.month: print "Can't use both full and specific month!" sys.exit(1) # assign default values opt.status_interval = opt.status_interval and int( opt.status_interval) or 30 opt.commit_interval = opt.commit_interval and int( opt.commit_interval) or 500 threadwrapper(doit, opt)