コード例 #1
0
ファイル: cmt.py プロジェクト: asciimoo/le-n-x
    def __init__(self, docid=None, *args, **kwargs):
        """Create a co-ment document, importing it over HTTPS when unknown.

        The instance ``type`` is always set to ``'co-ment'``. When ``docid``
        is truthy and matches ``CMTRE``:

        * if any of match groups 1, 3 or 5 is present, ``docid`` is rewritten
          to ``group(2) + group(4)`` (UTF-8 encoded);
        * if ``Docs.find_one`` finds no document with that ``docid``, the
          view page is downloaded for its title, the comments page for its
          ``textcontainer`` element, and both are combined into an HTML
          string passed to the parent constructor as ``raw``. Afterwards,
          when the instance has no (or empty) ``stems``, its term keys are
          registered with ``models.tfidf`` and the document is saved.

        In every other case the parent constructor is called with
        ``kwargs['docid']`` set to ``docid`` (only when ``docid`` is truthy).

        Args:
            docid: Identifier or address of the text; tested against
                ``CMTRE`` (presumably a co-ment URL -- confirm against the
                pattern definition).
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor; ``docid`` and
                ``raw`` may be overwritten here.

        Raises:
            Any error raised by ``urllib2.urlopen`` / ``read`` propagates to
            the caller; nothing is caught here.
        """
        # Written through __dict__ rather than plain attribute assignment,
        # presumably to bypass attribute hooks on the class -- TODO confirm.
        self.__dict__['type'] = 'co-ment'
        if docid:
            hostValidator = CMTRE.search(docid)
            if hostValidator:
                host = hostValidator.group(2)
                textkey = hostValidator.group(4)
                if hostValidator.group(1) or hostValidator.group(3) or hostValidator.group(5):
                    docid = ("%s%s" % (host, textkey)).encode('utf8')
                    kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (host, textkey)
                if not Docs.find_one({"docid": docid}):

                    def _fetch(address):
                        # Read the full body and always close the response so
                        # the underlying connection is not leaked.
                        response = urllib2.urlopen(address)
                        try:
                            return response.read()
                        finally:
                            response.close()

                    soup = BeautifulSoup(_fetch(url))
                    # NOTE(review): soup.title is dereferenced unconditionally;
                    # a page without a <title> would presumably fail here.
                    self.__dict__['title'] = unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    # NOTE(review): the view URL is built with "text/%s" but
                    # this one with "text%s" (no slash) -- confirm which form
                    # matches what CMTRE captures in group 4.
                    dataurl = "https://%s/text%s/comments/" % (host, textkey)
                    soup = BeautifulSoup(_fetch(dataurl))

                    body = unescape(unicode(soup.find(attrs={'id': 'textcontainer'}))).encode('utf8')
                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (self.title, body)
                    kwargs['docid'] = docid
                    super(Coment, self).__init__(*args, **kwargs)
                    if 'stems' not in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    return
            kwargs['docid'] = docid
        super(Coment, self).__init__(*args, **kwargs)
コード例 #2
0
    def __init__(self, docid=None, *args, **kwargs):
        """Create an etherpad document, importing its HTML export if unknown.

        The instance ``type`` is always set to ``'etherpad'``. When ``docid``
        is truthy and matches ``PADRE``:

        * if match groups 2 and 3 are both present, ``docid`` is rewritten to
          ``"<group 2>/<group 3>"`` (UTF-8 encoded);
        * if ``Docs.find_one`` finds no document with that ``docid``, the
          pad's ``latest?format=html`` export is downloaded, its title and
          body are wrapped in a fresh HTML skeleton, the result is run
          through ``tidy.parseString`` and passed to the parent constructor
          as ``raw``. Afterwards, when the instance has no (or empty)
          ``stems``, its term keys are registered with ``models.tfidf`` and
          the document is saved.

        In every other case the parent constructor is called with
        ``kwargs['docid']`` set to ``docid`` (only when ``docid`` is truthy).

        Args:
            docid: Identifier or address of the pad; tested against
                ``PADRE`` (presumably a pad URL -- confirm against the
                pattern definition).
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor; ``docid`` and
                ``raw`` may be overwritten here.

        Raises:
            Any error raised by ``urllib2.urlopen`` / ``read`` propagates to
            the caller; nothing is caught here.
        """
        # Written through __dict__ rather than plain attribute assignment,
        # presumably to bypass attribute hooks on the class -- TODO confirm.
        self.__dict__['type'] = 'etherpad'
        if docid:
            hostValidator = PADRE.search(docid)
            if hostValidator:
                if hostValidator.group(2) and hostValidator.group(3):
                    docid = ("%s/%s" % (hostValidator.group(2), hostValidator.group(3))).encode('utf8')
                    kwargs['docid'] = docid
                # Group 1 is used as the URL prefix; fall back to plain http
                # when it did not match.
                url = "%s%s/ep/pad/export/%s/latest?format=html" % (hostValidator.group(1) or 'http://', hostValidator.group(2), hostValidator.group(3))
                if not Docs.find_one({"docid": docid}):
                    # Close the response explicitly so the connection is
                    # released even if read() fails.
                    response = urllib2.urlopen(url)
                    try:
                        context = response.read()
                    finally:
                        response.close()
                    soup = BeautifulSoup(context)
                    # NOTE(review): soup.title is dereferenced unconditionally;
                    # an export without a <title> would presumably fail here.
                    self.__dict__['title'] = unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    doc = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (self.title, unescape(unicode(soup.body)).encode('utf8'))
                    tidy_options = {'output_xhtml': 1,
                                    'add_xml_decl': 0,
                                    'indent': 0,
                                    'tidy_mark': 0,
                                    'doctype': "strict",
                                    'wrap': 0}
                    raw = str(tidy.parseString(doc, **tidy_options))
                    kwargs['raw'] = raw
                    kwargs['docid'] = docid
                    super(Etherpad, self).__init__(*args, **kwargs)
                    if 'stems' not in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    return
            kwargs['docid'] = docid
        super(Etherpad, self).__init__(*args, **kwargs)
コード例 #3
0
    def __init__(self, docid=None, *args, **kwargs):
        """Create a co-ment document, importing it over HTTPS when unknown.

        The instance ``type`` is always set to ``'co-ment'``. When ``docid``
        is truthy and matches ``CMTRE``:

        * if any of match groups 1, 3 or 5 is present, ``docid`` is rewritten
          to ``group(2) + group(4)`` (UTF-8 encoded);
        * if ``Docs.find_one`` finds no document with that ``docid``, the
          view page is downloaded for its title, the comments page for its
          ``textcontainer`` element, and both are combined into an HTML
          string passed to the parent constructor as ``raw``. Afterwards,
          when the instance has no (or empty) ``stems``, its term keys are
          registered with ``models.tfidf`` and the document is saved.

        In every other case the parent constructor is called with
        ``kwargs['docid']`` set to ``docid`` (only when ``docid`` is truthy).

        Args:
            docid: Identifier or address of the text; tested against
                ``CMTRE`` (presumably a co-ment URL -- confirm against the
                pattern definition).
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor; ``docid`` and
                ``raw`` may be overwritten here.

        Raises:
            Any error raised by ``urllib2.urlopen`` / ``read`` propagates to
            the caller; nothing is caught here.
        """
        # Written through __dict__ rather than plain attribute assignment,
        # presumably to bypass attribute hooks on the class -- TODO confirm.
        self.__dict__['type'] = 'co-ment'
        if docid:
            hostValidator = CMTRE.search(docid)
            if hostValidator:
                site = hostValidator.group(2)
                text_id = hostValidator.group(4)
                has_extra_parts = (hostValidator.group(1)
                                   or hostValidator.group(3)
                                   or hostValidator.group(5))
                if has_extra_parts:
                    docid = ("%s%s" % (site, text_id)).encode('utf8')
                    kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (site, text_id)
                if not Docs.find_one({"docid": docid}):

                    def _download(address):
                        # Read the full body and always close the response so
                        # the underlying connection is not leaked.
                        response = urllib2.urlopen(address)
                        try:
                            return response.read()
                        finally:
                            response.close()

                    soup = BeautifulSoup(_download(url))
                    # NOTE(review): soup.title is dereferenced unconditionally;
                    # a page without a <title> would presumably fail here.
                    title_text = ''.join(soup.title.findAll(text=True))
                    self.__dict__['title'] = unescape(
                        unicode(title_text)).strip().encode('utf8')

                    # NOTE(review): the view URL is built with "text/%s" but
                    # this one with "text%s" (no slash) -- confirm which form
                    # matches what CMTRE captures in group 4.
                    dataurl = "https://%s/text%s/comments/" % (site, text_id)
                    soup = BeautifulSoup(_download(dataurl))

                    container = soup.find(attrs={'id': 'textcontainer'})
                    body = unescape(unicode(container)).encode('utf8')
                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (
                        self.title, body)
                    kwargs['docid'] = docid
                    super(Coment, self).__init__(*args, **kwargs)
                    if 'stems' not in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    return
            kwargs['docid'] = docid
        super(Coment, self).__init__(*args, **kwargs)