def run_linkspider(db, meta):
    source = 'douban'
    baseurls = [
        'http://www.douban.com/group/beijingzufang/discussion',
        'http://www.douban.com/group/fangzi/discussion',
        'http://www.douban.com/group/262626/discussion',
        'http://www.douban.com/group/276176/discussion',
        'http://www.douban.com/group/26926/discussion',
        'http://www.douban.com/group/sweethome/discussion',
        'http://www.douban.com/group/242806/discussion',
        'http://www.douban.com/group/257523/discussion',
        'http://www.douban.com/group/279962/discussion',
        'http://www.douban.com/group/334449/discussion',
    ]
    for baseurl in baseurls:
        Logger.info('start ' + baseurl)
        # Derive the group id from the discussion URL.
        groupid = baseurl\
            .replace('http://www.douban.com/group/', '')\
            .replace('/discussion', '')
        # Resume from the last recorded reply time for this group, if any.
        reply_time = 0
        if meta.has(source, groupid)\
                and meta.get(source, groupid).has_key('reply_time'):
            reply_time = meta.get(source, groupid)['reply_time']
        linkspider = LinkSpider(
            baseurl=baseurl,
            db=db,
        )
        reply_time = linkspider.crawl(source=source,
                                      reply_time=reply_time,
                                      ext={'groupid': groupid})
        meta.set(source, groupid, {'reply_time': reply_time})
        meta.write()
def diff_task(linkdb, output, pagelist):
    page_filenames = []
    # trick :)
    if os.path.exists(pagelist):
        Logger.info('Use pagelist instead of page files!')
        with open(pagelist, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    break
                filename = line
                page_filenames.append(filename)
    else:
        Logger.info('CANNOT find pagelist file: %s' % pagelist)
    tasks = {}
    sources = []
    with open(linkdb, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    hashurl, url, reply_time, source = line.split('\t')[:4]
                except ValueError:
                    continue
                filename = '%s' % (hashurl)
                tasks.update({
                    filename: {
                        'url': url,
                        'source': source
                    }
                })
                sources.append(source)
    if not pagelist:
        # Drop tasks whose pages were already fetched into the output dirs.
        for source in set(sources):
            source = os.path.join(output, source)
            if os.path.exists(source):
                filenames = os.listdir(source)
                for filename in filenames:
                    tasks.pop(filename)
            else:
                os.mkdir(source)
    else:
        for source in set(sources):
            source = os.path.join(output, source)
            if not os.path.exists(source):
                os.mkdir(source)
        for filename in page_filenames:
            try:
                tasks.pop(filename)
            except KeyError:
                print 'Skip', filename
    return tasks
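# A hypothetical driver for diff_task, shown only to illustrate how the
# returned task dict might be consumed; the paths and the helper name are
# assumptions based on the surrounding modules, not code from the source.
def run_diff_task_example(modulepath):
    linkdb = os.path.join(modulepath, '../output/link.db')
    output = os.path.join(modulepath, '../output/page')
    pagelist = os.path.join(modulepath, '../output/page.list')
    tasks = diff_task(linkdb, output, pagelist)
    for filename, task in tasks.items():
        # Each remaining task still needs its page fetched.
        Logger.info('todo %s %s %s' % (filename, task['source'], task['url']))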
def open(self, url, delay=0.1):
    response = None
    try:
        response = self.br.open(url, timeout=20.0)
    except urllib2.HTTPError, e:
        # Keep retrying on non-404 errors (e.g. 403 throttling), backing off
        # for an interval between attempts.
        while e.code != 404:
            interval = Interval.val()
            time.sleep(interval)
            Logger.info('sleep %ds error %d %s' % (interval, e.code, url))
            try:
                response = self.br.open(url, timeout=20.0)
                Logger.info('skip 403 ' + url)
                break
            except urllib2.HTTPError, e:
                if e.code != 404:
                    continue
            except:
                Logger.info('cannot handle browser error!')
                break
        Interval.reset()
    except:
        pass
    time.sleep(delay)
    if response:
        page = response.read()
    else:
        page = ''
    return page
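# The Interval helper used above is not included in these snippets; the class
# below is a hypothetical sketch of a growing backoff, written only to match
# how Interval.val() and Interval.reset() are called here.
import random

class Interval(object):
    _step = 0

    @classmethod
    def val(cls):
        # Each retry waits a little longer, capped at one minute.
        cls._step += 1
        return min(60, cls._step * 5) + random.randint(0, 4)

    @classmethod
    def reset(cls):
        # Called after a successful request to start over.
        cls._step = 0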
def parse(self, **args):
    page = args['page']
    source = args['source']
    hashurl = args['hashurl']
    ret = PageParser.parse(page, source)
    if ret.has_key('error'):
        Logger.info(hashurl + ' ' + ret['error'])
        return
    record = '\t'.join([
        hashurl,
        ret['title2'] if ret['title2'] else ret['title'],
        json.dumps(ret['author']),
        json.dumps(ret['images']),
        json.dumps(ret['links']),
        ret['text'],
        ret['pub_time'],
    ]).encode('utf-8')
    self._db.insert(record)
def parse(self, **args):
    hashurl = args['hashurl']
    title = args['title']
    text = args['text']
    ret = TextParser.parse(title + ' ' + text)
    if ret.has_key('error'):
        Logger.info(hashurl + ' ' + ret['error'])
        return
    record = '\t'.join([
        hashurl,
        title,
        text,
        ret['jushi'],   # room layout
        ret['shouji'],  # mobile phone
        ret['zujin'],   # rent
        ret['dizhi'],   # address
        ret['ditie'],   # subway line
    ])
    self._db.insert(record)
def crawl(self, **args):
    source = args['source']
    ext = args['ext']
    reply_time = args['reply_time']
    br = Browser()
    page = br.open(self.baseurl)
    new_reply_time = reply_time
    while True:
        links = PageParser.parse(page, source)
        for i, link in enumerate(links):
            if reply_time < link.reply_time:
                # The newest link on the page marks the new high-water mark.
                if i == 0:
                    new_reply_time = link.reply_time
                self.db.insert('\t'.join([str(link), source, json.dumps(ext)]))
            else:
                # Links are ordered by reply time; stop at the first stale one.
                return new_reply_time
        try:
            page = br.follow_link(text='后页>')  # "next page"
        except:
            Logger.info('finished!')
            break
    return new_reply_time
def close(self):
    return self.br.close()

def follow_link(self, **args):
# Fork - create daemon process
try:
    pid = os.fork()
    if pid > 0:
        # I am the parent
        self.isDaemon = False
        pidFile.close()
        sys.exit(0)
    else:
        os.setsid()
        pid = os.fork()
        if pid > 0:
            # write the daemon pid to the pid file and leave the method
            pidFile.write(str(pid) + " ")
            LOGGER.info("A daemon with pidfile " + self.pidFile +
                        " launched successfully, pid:" + str(pid))
            self.isDaemon = False
            pidFile.close()
            return
        else:
            # I am the child of a child - the daemon process, so continue
            self.isDaemon = True
            pidFile.close()
except OSError, e:
    raise RuntimeError("Fork for runnable with pidfile " + self.pidFile +
                       " failed: " + str(e))

LOGGER.info("Running process with pidfile: " + self.pidFile +
            " [" + str(os.getpid()) + "]")
sys.stdout.flush()

# run the daemon tasks
runnable.run()
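# For reference, the conventional double-fork daemonization also detaches the
# working directory and standard streams; the sketch below is a generic,
# standalone version of that pattern, independent of the pidFile/runnable
# plumbing above.
import os
import sys

def daemonize():
    # First fork: let the original parent return to the shell.
    if os.fork() > 0:
        sys.exit(0)
    os.setsid()  # become session leader, drop the controlling terminal
    # Second fork: the session leader exits so the daemon can never
    # reacquire a controlling terminal.
    if os.fork() > 0:
        sys.exit(0)
    os.chdir('/')  # do not keep any mount point busy
    os.umask(0)
    # Redirect stdin/stdout/stderr to /dev/null.
    devnull = os.open(os.devnull, os.O_RDWR)
    for fd in (0, 1, 2):
        os.dup2(devnull, fd)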
def main(**args):
    modulepath = args['modulepath']
    linkdb = os.path.join(modulepath, '../output/link.db')
    backup(linkdb)
    linkmeta = os.path.join(modulepath, '../output/link.meta')
    backup(linkmeta)
    run_linkspider(DB(linkdb), Meta(linkmeta))

if __name__ == '__main__':
    starttime = datetime.datetime.now()
    run = sys.argv[0]
    modulepath = os.path.dirname(run)
    main(modulepath=modulepath)
    endtime = datetime.datetime.now()
    Logger.info('done! %lds' % (endtime - starttime).seconds)
                merged[h] = (title2, url2, pub_time2)
            else:
                title2, url2, pub_time2 = merged[h]
        else:
            title2, url2, pub_time2 = title, url, pub_time
        buf_list.append((h, title2, url2, pub_time2))
    if len(buf_list) > 1:
        # Keep the most recent post as the key and store the rest as duplicates.
        buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
        simdb.insert('\t'.join(
            [buf_list[0][0], json.dumps(buf_list[1:])]
        ))

def main(**args):
    modulepath = args['modulepath']
    finaldb_cut = os.path.join(modulepath, '../output/final.db.2')
    simdb = os.path.join(modulepath, '../output/sim.db')
    if os.path.exists(simdb):
        backup(simdb)
        os.remove(simdb)
    sim_merge(finaldb_cut, DB(simdb))

if __name__ == '__main__':
    starttime = datetime.datetime.now()
    run = sys.argv[0]
    modulepath = os.path.dirname(run)
    main(modulepath=modulepath)
    endtime = datetime.datetime.now()
    Logger.info('done! %lds' % (endtime - starttime).seconds)
def __init__(self):
    super().__init__()
    self.curse = CurseAPI()
    self.updatet = None
    self.child_wins = list()
    data_sent = False

    if not self.curse.baseDir:
        self.curse.baseDir = directoryBox(self, translate("prompt.mmc"))
        if not self.curse.baseDir:
            exit(1)
        self.curse.db["baseDir"] = self.curse.baseDir

    Logger.info("MultiMC folder is {}".format(self.curse.baseDir))

    if "analytics" not in self.curse.db:
        self.curse.db["analytics"] = confirmBox(
            self, QMessageBox.Question,
            translate("prompt.analytics"), QMessageBox.Yes)
        if self.curse.db["analytics"]:
            send_data(self.curse)
            data_sent = True

    if "ver" not in self.curse.db:
        if self.curse.db["analytics"] and not data_sent:
            send_data(self.curse)
        self.curse.db["ver"] = self.curse.version

    self.analytics = self.curse.db["analytics"]
    Logger.info("Analytics are {}".format(["Disabled", "Enabled"][self.analytics]))

    self.mmc = MultiMC(self.curse.baseDir)

    self.setWindowTitle("OpenMineMods v{}".format(CurseAPI.version))
    self.layout = QVBoxLayout(self)

    # Start Buttons
    self.buttonGroup = QGroupBox()
    self.layoutButtons = QHBoxLayout()
    self.buttonGroup.setLayout(self.layoutButtons)
    self.buttonGroup.setStyleSheet("QGroupBox { border:0; } ")

    refreshInstances = makeIconButton(
        self, "view-refresh", translate("tooltip.refresh.instances"))
    refreshInstances.clicked.connect(self.refresh_instances)

    brButton = makeIconButton(self, "search", "Browse Modpacks")
    brButton.clicked.connect(self.browse_clicked)

    settingsButton = makeIconButton(self, "configure", translate("tooltip.configure.omm"))
    settingsButton.clicked.connect(self.settings_clicked)

    self.layoutButtons.setAlignment(Qt.AlignTop)
    self.layoutButtons.addWidget(refreshInstances)
    self.layoutButtons.addWidget(brButton)
    self.layoutButtons.addWidget(settingsButton)
    self.layoutButtons.addStretch(1)
    self.layoutButtons.setContentsMargins(0, 0, 0, 0)
    self.layout.addWidget(self.buttonGroup)
    # End Buttons

    self.hGroupBox = QGroupBox(translate("label.instances"))
    self.layout.addWidget(self.hGroupBox)
    self.instanceTable = QVBoxLayout()
    self.init_instances()
    self.hGroupBox.setLayout(self.instanceTable)

    scroll = QScrollArea()
    scroll.setWidget(self.hGroupBox)
    scroll.setWidgetResizable(True)
    self.layout.addWidget(scroll)

    self.show()

    self.updatecheck = UpdateCheckThread(self.curse)
    self.updatecheck.done.connect(self.update_checked)
    self.update_thread = QThread()
    self.updatecheck.moveToThread(self.update_thread)
    self.update_thread.started.connect(self.updatecheck.check_updates)
    if isfile(join(getInstallDir(), 'AutoUpdate')):
        self.update_thread.start()
try:
    f = open(dip, "r")
    cacheImgFile = open(cacheImg, "w")
    # buffsize = 52428800  # read 50 MB at a time
    buffsize = (1024 ** 3) / 2  # read/write 512 MB at a time
    buff = f.read(buffsize)
    while buff:
        cacheImgFile.write(buff)
        buff = f.read(buffsize)
    f.close()
    cacheImgFile.close()
except IOError, e:
    msg = "Can't open %s: %s" % (dip, e)
    raise ClusterManagerException(msg)
else:
    LOGGER.info(("Disk image %s (for %s) copied to cache file %s") %
                (dip, hostObj.name, cacheImg))

def defineHost(self, host):
    """
    Defines a virtual host in a cluster using the given host object, without
    starting it. A host with the given name may be defined only once in the
    system. Stores host objects in the class property self.hosts.
    @param host: ClusterManager.Host object
    @raise ClusterManagerException: when it fails
    @return: host object from the libvirt lib
    """
    if self.hosts.has_key(host.uname):
        return self.hosts[host.uname][0]

    for h in self.hosts.itervalues():
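# The manual read/write loop above can also be expressed with
# shutil.copyfileobj; a minimal sketch of that alternative, assuming the same
# disk-image source and cache-file destination paths and the same 512 MB
# chunk size (the function name is illustrative only).
import shutil

def copy_disk_image(dip, cacheImg):
    # Buffered copy equivalent to the loop above, 512 MB at a time.
    with open(dip, "rb") as src, open(cacheImg, "wb") as dst:
        shutil.copyfileobj(src, dst, 512 * 1024 * 1024)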