Example #1
def run_linkspider(db, meta):
    source = 'douban'
    baseurls = [
        'http://www.douban.com/group/beijingzufang/discussion',
        'http://www.douban.com/group/fangzi/discussion',
        'http://www.douban.com/group/262626/discussion',
        'http://www.douban.com/group/276176/discussion',
        'http://www.douban.com/group/26926/discussion',
        'http://www.douban.com/group/sweethome/discussion',
        'http://www.douban.com/group/242806/discussion',
        'http://www.douban.com/group/257523/discussion',
        'http://www.douban.com/group/279962/discussion',
        'http://www.douban.com/group/334449/discussion',
    ]

    for baseurl in baseurls:
        Logger.info('start ' + baseurl)
        groupid = baseurl\
            .replace('http://www.douban.com/group/', '')\
            .replace('/discussion', '')
        reply_time = 0
        if meta.has(source, groupid)\
            and 'reply_time' in meta.get(source, groupid):
            reply_time = meta.get(source, groupid)['reply_time']
        linkspider = LinkSpider(
            baseurl=baseurl,
            db=db,
        )
        reply_time = linkspider.crawl(source=source,
                                      reply_time=reply_time,
                                      ext={'groupid': groupid})
        meta.set(source, groupid, {'reply_time': reply_time})
    meta.write()
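The Meta object consulted above is not defined in any of these examples; judging from the calls, it is a small persistent map keyed by source and group id, with has/get/set/write methods. A minimal sketch of that assumed interface, using a JSON file as the backing store (an assumption, not the project's actual code):

import json
import os

class Meta(object):
    # Assumed interface: a tiny persistent map (source, key) -> dict.
    def __init__(self, path):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            with open(path, 'r') as f:
                self.data = json.load(f)

    def has(self, source, key):
        return source in self.data and key in self.data[source]

    def get(self, source, key):
        return self.data[source][key]

    def set(self, source, key, value):
        self.data.setdefault(source, {})[key] = value

    def write(self):
        with open(self.path, 'w') as f:
            json.dump(self.data, f)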
Example #2
def diff_task(linkdb, output, pagelist):
    page_filenames = []
    # trick: an existing pagelist file wins over scanning the page
    # directories on disk
    if os.path.exists(pagelist):
        Logger.info('Use pagelist instead of page files!')
        with open(pagelist, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    break
                filename = line
                page_filenames.append(filename)
    else:
        Logger.info('CANNOT find pagelist file: %s' % pagelist)

    tasks = {}
    sources = []
    with open(linkdb, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    hashurl, url, reply_time, source = line.split('\t')[:4]
                except ValueError:
                    continue
                filename = hashurl
                tasks[filename] = {
                    'url': url,
                    'source': source,
                }
                sources.append(source)

    if not pagelist:
        for source in set(sources):
            source = os.path.join(output, source)
            if os.path.exists(source):
                filenames = os.listdir(source)
                for filename in filenames:
                    tasks.pop(filename, None)
            else:
                os.mkdir(source)
    else:
        for source in set(sources):
            source = os.path.join(output, source)
            if not os.path.exists(source):
                os.mkdir(source)
        for filename in page_filenames:
            try:
                tasks.pop(filename)
            except KeyError:
                print 'Skip', filename
    return tasks
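For context, diff_task expects linkdb to be a tab-separated file whose first four columns are hashurl, url, reply_time and source, and expects output to hold one directory per source containing already-fetched pages named by hashurl. A hedged usage sketch (the paths are invented for illustration):

tasks = diff_task(
    linkdb='output/link.db',      # TSV: hashurl <tab> url <tab> reply_time <tab> source
    output='output/pages',        # one sub-directory per source
    pagelist='output/page.list',  # optional: one fetched filename per line
)
for filename, task in tasks.items():
    print('todo: %s from %s' % (task['url'], task['source']))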
Example #3
def open(self, url, delay=0.1):
    response = None
    try:
        response = self.br.open(url, timeout=20.0)
    except urllib2.HTTPError, e:
        # retry with a growing interval until the request succeeds
        # or the error settles into a 404
        while e.code != 404:
            interval = Interval.val()
            time.sleep(interval)
            Logger.info('sleep %ds error %d %s' % (interval, e.code, url))
            try:
                response = self.br.open(url, timeout=20.0)
                Logger.info('skip 403 ' + url)
                break
            except urllib2.HTTPError, e:
                if e.code != 404:
                    continue
            except:
                Logger.info('cannot handle browser error!')
                break
        Interval.reset()
    except:
        pass
    time.sleep(delay)
    if response:
        page = response.read()
    else:
        page = ''
    return page
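The retry loop depends on an Interval helper (Interval.val() and Interval.reset()) that none of these examples define. A minimal back-off sketch consistent with how it is called; the doubling and the five-minute cap are assumptions for illustration:

class Interval(object):
    # Assumed back-off helper: val() returns a growing sleep time,
    # reset() starts over once a request finally succeeds.
    _current = 1

    @classmethod
    def val(cls):
        interval = cls._current
        cls._current = min(cls._current * 2, 300)  # cap assumed, not from source
        return interval

    @classmethod
    def reset(cls):
        cls._current = 1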
Example #4
def parse(self, **args):
    page = args['page']
    source = args['source']
    hashurl = args['hashurl']
    ret = PageParser.parse(page, source)
    if 'error' in ret:
        Logger.info(hashurl + ' ' + ret['error'])
        return
    record = '\t'.join([
        hashurl,
        ret['title2'] if ret['title2'] else ret['title'],
        json.dumps(ret['author']),
        json.dumps(ret['images']),
        json.dumps(ret['links']),
        ret['text'],
        ret['pub_time'],
    ]).encode('utf-8')
    self._db.insert(record)
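Several fields above are JSON-encoded before being joined into the tab-separated record, so a consumer has to decode them again. A sketch of reading one record back, with the field order taken from the snippet:

import json

def decode_record(line):
    fields = line.rstrip('\n').split('\t')
    hashurl, title = fields[0], fields[1]
    author = json.loads(fields[2])
    images = json.loads(fields[3])
    links = json.loads(fields[4])
    text, pub_time = fields[5], fields[6]
    return hashurl, title, author, images, links, text, pub_time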
Example #5
def parse(self, **args):
    hashurl = args['hashurl']
    title = args['title']
    text = args['text']
    ret = TextParser.parse(title + ' ' + text)
    if 'error' in ret:
        Logger.info(hashurl + ' ' + ret['error'])
        return
    record = '\t'.join([
        hashurl,
        title,
        text,
        ret['jushi'],    # room layout
        ret['shouji'],   # mobile number
        ret['zujin'],    # rent
        ret['dizhi'],    # address
        ret['ditie'],    # subway line
    ])
    self._db.insert(record)
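The pinyin field names map to rental attributes: jushi is the room layout, shouji the mobile number, zujin the rent, dizhi the address, and ditie the nearest subway line. TextParser itself never appears in these examples; a toy regex sketch of one such extraction, purely as an assumed illustration:

import re

# Toy illustration only: pull a monthly rent like '3000元' out of a post.
# \u5143 is the yuan character; the real TextParser is surely more involved.
ZUJIN_RE = re.compile(u'(\\d{3,5})\\s*\u5143')

def parse_zujin(text):
    m = ZUJIN_RE.search(text)
    return m.group(1) if m else ''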
Example #6
def crawl(self, **args):
    source = args['source']
    ext = args['ext']
    reply_time = args['reply_time']
    br = Browser()
    page = br.open(self.baseurl)
    new_reply_time = reply_time
    while True:
        links = PageParser.parse(page, source)
        for i, link in enumerate(links):
            if reply_time < link.reply_time:
                if i == 0:
                    new_reply_time = link.reply_time
                self.db.insert('\t'.join([str(link), source, json.dumps(ext)]))
            else:
                # links come newest first, so anything older is already stored
                return new_reply_time
        try:
            page = br.follow_link(text='后页>')  # '后页' means "next page"
        except:
            Logger.info('finished!')
            break
    return new_reply_time
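crawl assumes PageParser.parse yields link objects newest first, each with a reply_time attribute and a tab-separated str() form; that ordering is what lets the loop return as soon as it meets an already-stored reply. A minimal sketch of such an object (attribute names other than reply_time are guesses):

class Link(object):
    # Assumed shape of the objects PageParser.parse() yields.
    def __init__(self, hashurl, url, reply_time):
        self.hashurl = hashurl
        self.url = url
        self.reply_time = reply_time

    def __str__(self):
        # matches the '\t'.join([str(link), source, ...]) usage above
        return '\t'.join([self.hashurl, self.url, str(self.reply_time)])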
Example #7
        try:
            response = self.br.open(url, timeout=20.0)
        except urllib2.HTTPError, e:
            while e.code != 404:
                interval = Interval.val()
                time.sleep(interval)
                Logger.info('sleep %ds error %d %s' % (interval, e.code, url))
                try:
                    response = self.br.open(url, timeout=20.0)
                    Logger.info('skip 403 ' + url)
                    break
                except urllib2.HTTPError, e:
                    if e.code != 404:
                        continue
                except:
                    Logger.info('cannot handle browser error!')
                    break
            Interval.reset()
        except:
            pass
        time.sleep(delay)
        if response:
            page = response.read()
        else:
            page = ''
        return page

    def close(self):
        return self.br.close()

    def follow_link(self, **args):
Example #8
        # Fork - create daemon process
        try:
            pid = os.fork()
            if pid > 0:
                # I am the parent
                self.isDaemon = False
                pidFile.close()
                sys.exit(0)
            else:
                os.setsid()
                pid = os.fork()
                if pid > 0:
                    # write a pid to a file and leave the method
                    pidFile.write(str(pid) + " ")
                    LOGGER.info("A daemon with pidfile " + self.pidFile + " launched successfully, pid:" + str(pid))
                    self.isDaemon = False
                    pidFile.close()
                    return
                else:
                    # I'm the child of a child - the daemon process, so continue
                    self.isDaemon = True
                    pidFile.close()
        except OSError, e:
            raise RuntimeError("Fork for runnable with pidfile " + self.pidFile + " failed: " + str(e))

        LOGGER.info("Running process with pidfile: " + self.pidFile + " [" + str(os.getpid()) + "]")
        sys.stdout.flush()
        # run the daemon tasks
        runnable.run()
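The double fork plus os.setsid() detaches the daemon from its controlling terminal and session; the pid written to the pidfile is what a stop script signals later. A hedged sketch of that teardown, assuming the space-separated pidfile format used above:

import os
import signal

def stop_daemon(pid_file_path):
    # The launcher writes '<pid> ' entries; read them back and signal each.
    with open(pid_file_path, 'r') as f:
        pids = [int(p) for p in f.read().split()]
    for pid in pids:
        os.kill(pid, signal.SIGTERM)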
Example #9
            and 'reply_time' in meta.get(source, groupid):
            reply_time = meta.get(source, groupid)['reply_time']
        linkspider = LinkSpider(
            baseurl=baseurl,
            db=db,
        )
        reply_time = linkspider.crawl(source=source,
                                      reply_time=reply_time,
                                      ext={'groupid': groupid})
        meta.set(source, groupid, {'reply_time': reply_time})
    meta.write()


def main(**args):
    modulepath = args['modulepath']
    linkdb = os.path.join(modulepath, '../output/link.db')
    backup(linkdb)

    linkmeta = os.path.join(modulepath, '../output/link.meta')
    backup(linkmeta)
    run_linkspider(DB(linkdb), Meta(linkmeta))


if __name__ == '__main__':
    starttime = datetime.datetime.now()
    run = sys.argv[0]
    modulepath = os.path.dirname(run)
    main(modulepath=modulepath)
    endtime = datetime.datetime.now()
    Logger.info('done! %lds' % (endtime - starttime).seconds)
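backup is called on link.db and link.meta before every run but is never shown. A plausible minimal version, assuming it simply keeps a timestamped copy next to the original (the suffix format is invented):

import os
import shutil
import time

def backup(path):
    # Assumed helper: copy the file aside before the new run overwrites it.
    if os.path.exists(path):
        suffix = time.strftime('%Y%m%d%H%M%S')
        shutil.copy2(path, '%s.%s' % (path, suffix))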
Example #10
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join(
                [buf_list[0][0], json.dumps(buf_list[1:])]
            ))

def main(**args):
    modulepath = args['modulepath']
    finaldb_cut = os.path.join(modulepath, '../output/final.db.2')
    simdb = os.path.join(modulepath, '../output/sim.db')
    if os.path.exists(simdb):
        backup(simdb)
        os.remove(simdb)
    sim_merge(finaldb_cut, DB(simdb))

if __name__ == '__main__':
    starttime = datetime.datetime.now()
    run = sys.argv[0]
    modulepath = os.path.dirname(run)
    main(
        modulepath=modulepath
    )
    endtime = datetime.datetime.now()
    Logger.info('done! %lds' % (endtime - starttime).seconds)
Example #11
    def __init__(self):
        super().__init__()

        self.curse = CurseAPI()
        self.updatet = None
        self.child_wins = list()

        data_sent = False

        if not self.curse.baseDir:
            self.curse.baseDir = directoryBox(self, translate("prompt.mmc"))
            if not self.curse.baseDir:
                exit(1)
            self.curse.db["baseDir"] = self.curse.baseDir

        Logger.info("MultiMC folder is {}".format(self.curse.baseDir))

        if "analytics" not in self.curse.db:
            self.curse.db["analytics"] = confirmBox(
                self, QMessageBox.Question, translate("prompt.analytics"),
                QMessageBox.Yes)
            if self.curse.db["analytics"]:
                send_data(self.curse)
                data_sent = True

        if "ver" not in self.curse.db:
            if self.curse.db["analytics"] and not data_sent:
                send_data(self.curse)
            self.curse.db["ver"] = self.curse.version

        self.analytics = self.curse.db["analytics"]

        Logger.info("Analytics are {}".format(["Disabled",
                                               "Enabled"][self.analytics]))

        self.mmc = MultiMC(self.curse.baseDir)

        self.setWindowTitle("OpenMineMods v{}".format(CurseAPI.version))

        self.layout = QVBoxLayout(self)
        # Start Buttons
        self.buttonGroup = QGroupBox()
        self.layoutButtons = QHBoxLayout()
        self.buttonGroup.setLayout(self.layoutButtons)
        self.buttonGroup.setStyleSheet("QGroupBox { border:0; } ")

        refreshInstances = makeIconButton(
            self, "view-refresh", translate("tooltip.refresh.instances"))
        refreshInstances.clicked.connect(self.refresh_instances)

        brButton = makeIconButton(self, "search", "Browse Modpacks")
        brButton.clicked.connect(self.browse_clicked)

        settingsButton = makeIconButton(self, "configure",
                                        translate("tooltip.configure.omm"))
        settingsButton.clicked.connect(self.settings_clicked)

        self.layoutButtons.setAlignment(Qt.AlignTop)
        self.layoutButtons.addWidget(refreshInstances)
        self.layoutButtons.addWidget(brButton)
        self.layoutButtons.addWidget(settingsButton)
        self.layoutButtons.addStretch(1)
        self.layoutButtons.setContentsMargins(0, 0, 0, 0)
        self.layout.addWidget(self.buttonGroup)
        # End Buttons
        self.hGroupBox = QGroupBox(translate("label.instances"))
        self.layout.addWidget(self.hGroupBox)

        self.instanceTable = QVBoxLayout()

        self.init_instances()

        self.hGroupBox.setLayout(self.instanceTable)

        scroll = QScrollArea()
        scroll.setWidget(self.hGroupBox)
        scroll.setWidgetResizable(True)
        self.layout.addWidget(scroll)

        self.show()

        self.updatecheck = UpdateCheckThread(self.curse)
        self.updatecheck.done.connect(self.update_checked)

        self.update_thread = QThread()

        self.updatecheck.moveToThread(self.update_thread)

        self.update_thread.started.connect(self.updatecheck.check_updates)
        if isfile(join(getInstallDir(), 'AutoUpdate')):
            self.update_thread.start()
Example #12
        try:
            f = open(dip, "r")
            cacheImgFile = open(cacheImg, "w")
            # buffsize = 52428800 #read 50 MB at a time
            buffsize = (1024 ** 3) / 2  # read/write 512 MB at a time
            buff = f.read(buffsize)
            while buff:
                cacheImgFile.write(buff)
                buff = f.read(buffsize)
            f.close()
            cacheImgFile.close()
        except IOError, e:
            msg = "Can't open %s: %s" % (dip, e)
            raise ClusterManagerException(msg)
        else:
            LOGGER.info("Disk image %s (for %s) copied to cache file %s" % (dip, hostObj.name, cacheImg))

    def defineHost(self, host):
        """
        Defines virtual host in a cluster using given host object,
        not starting it. Host with the given name may be defined once
        in the system. Stores hosts objects in class property self.hosts.
        
        @param host: ClusterManager.Host object
        @raise ClusterManagerException: when fails
        @return: host object from libvirt lib
        """
        if host.uname in self.hosts:
            return self.hosts[host.uname][0]

        for h in self.hosts.itervalues():