def __init__(self, docid=None, *args, **kwargs):
    """Initialise a co-ment document, downloading it when it is not stored yet.

    The instance type is always recorded as 'co-ment'. When `docid` is given
    and CMTRE matches it:

    * if any of match groups 1, 3 or 5 is present, `docid` is rewritten to
      the UTF-8 encoded concatenation of groups 2 and 4;
    * if Docs holds no record with that docid, the view page is fetched to
      obtain the title and the comments page is fetched to obtain the element
      with id 'textcontainer'; both are combined into kwargs['raw'];
    * the parent constructor is called, and when the instance ends up with no
      stems its term-count keys are handed to models.tfidf and the instance
      is saved.

    In every other case the parent constructor is called with the docid
    passed through unchanged.
    """
    # NOTE(review): block nesting was rebuilt from a whitespace-collapsed
    # source line -- confirm against the project's version history.
    self.__dict__['type'] = 'co-ment'

    match = CMTRE.search(docid) if docid else None
    if not match:
        # Nothing co-ment specific to do: defer entirely to the parent class.
        kwargs['docid'] = docid
        super(Coment, self).__init__(*args, **kwargs)
        return

    host = match.group(2)
    text_ref = match.group(4)
    if match.group(1) or match.group(3) or match.group(5):
        docid = ("%s%s" % (host, text_ref)).encode('utf8')
        kwargs['docid'] = docid
    url = "https://%s/text/%s/view/" % (host, text_ref)

    if not Docs.find_one({"docid": docid}):
        # First request: the view page, used only for its <title>.
        view_page = BeautifulSoup(urllib2.urlopen(url).read())
        title_text = ''.join(view_page.title.findAll(text=True))
        self.__dict__['title'] = unescape(
            unicode(title_text)).strip().encode('utf8')

        # Second request: the comments page, which carries the text itself.
        dataurl = "https://%s/text%s/comments/" % (host, text_ref)
        comments_page = BeautifulSoup(urllib2.urlopen(dataurl).read())
        template = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>'
        kwargs['raw'] = template % (
            self.title,
            unescape(unicode(
                comments_page.find(attrs={'id': 'textcontainer'})
            )).encode('utf8'),
        )

    kwargs['docid'] = docid
    super(Coment, self).__init__(*args, **kwargs)
    if 'stems' not in self.__dict__ or not self.stems:
        # let's calculate and cache the results
        models.tfidf.add_input_document(self.termcnt.keys())
        self.save()
def __init__(self, docid=None, *args, **kwargs):
    """Initialise an etherpad document, downloading it when it is not stored yet.

    The instance type is always recorded as 'etherpad'. When `docid` is given
    and PADRE matches it:

    * if match groups 2 and 3 are both present, `docid` is rewritten to the
      UTF-8 encoded string "<group 2>/<group 3>";
    * if Docs holds no record with that docid, the pad's HTML export is
      fetched (using match group 1 as the URL prefix, or 'http://' when that
      group is empty), its title is stored on the instance, and the body is
      wrapped in a minimal HTML document, run through tidy and placed in
      kwargs['raw'];
    * the parent constructor is called, and when the instance ends up with no
      stems its term-count keys are handed to models.tfidf and the instance
      is saved.

    In every other case the parent constructor is called with the docid
    passed through unchanged.
    """
    # NOTE(review): block nesting was rebuilt from a whitespace-collapsed
    # source line -- confirm against the project's version history.
    self.__dict__['type'] = 'etherpad'

    match = PADRE.search(docid) if docid else None
    if not match:
        # Nothing etherpad specific to do: defer entirely to the parent class.
        kwargs['docid'] = docid
        super(Etherpad, self).__init__(*args, **kwargs)
        return

    host = match.group(2)
    pad_name = match.group(3)
    if host and pad_name:
        docid = ("%s/%s" % (host, pad_name)).encode('utf8')
        kwargs['docid'] = docid
    url = "%s%s/ep/pad/export/%s/latest?format=html" % (
        match.group(1) or 'http://', host, pad_name)

    if not Docs.find_one({"docid": docid}):
        export_page = BeautifulSoup(urllib2.urlopen(url).read())
        title_text = ''.join(export_page.title.findAll(text=True))
        self.__dict__['title'] = unescape(
            unicode(title_text)).strip().encode('utf8')

        template = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>'
        doc = template % (
            self.title,
            unescape(unicode(export_page.body)).encode('utf8'),
        )
        # Normalise the export through tidy before handing it on as raw text.
        tidied = tidy.parseString(
            doc,
            output_xhtml=1,
            add_xml_decl=0,
            indent=0,
            tidy_mark=0,
            doctype="strict",
            wrap=0,
        )
        kwargs['raw'] = str(tidied)

    kwargs['docid'] = docid
    super(Etherpad, self).__init__(*args, **kwargs)
    if 'stems' not in self.__dict__ or not self.stems:
        # let's calculate and cache the results
        models.tfidf.add_input_document(self.termcnt.keys())
        self.save()
def __init__(self, docid=None, *args, **kwargs):
    """Initialise a co-ment document, downloading it when it is not stored yet.

    The instance type is always recorded as 'co-ment'. When `docid` is given
    and CMTRE matches it:

    * if any of match groups 1, 3 or 5 is present, `docid` is rewritten to
      the UTF-8 encoded concatenation of groups 2 and 4;
    * if Docs holds no record with that docid, the view page is fetched to
      obtain the title and the comments page is fetched to obtain the element
      with id 'textcontainer'; both are combined into kwargs['raw'];
    * the parent constructor is called, and when the instance ends up with no
      stems its term-count keys are handed to models.tfidf and the instance
      is saved.

    In every other case the parent constructor is called with the docid
    passed through unchanged.
    """
    # NOTE(review): block nesting was rebuilt from a whitespace-collapsed
    # source line -- confirm against the project's version history.
    self.__dict__['type'] = 'co-ment'

    match = CMTRE.search(docid) if docid else None
    if not match:
        # Nothing co-ment specific to do: defer entirely to the parent class.
        kwargs['docid'] = docid
        super(Coment, self).__init__(*args, **kwargs)
        return

    host = match.group(2)
    text_ref = match.group(4)
    if match.group(1) or match.group(3) or match.group(5):
        docid = ("%s%s" % (host, text_ref)).encode('utf8')
        kwargs['docid'] = docid
    url = "https://%s/text/%s/view/" % (host, text_ref)

    if not Docs.find_one({"docid": docid}):
        # First request: the view page, used only for its <title>.
        view_page = BeautifulSoup(urllib2.urlopen(url).read())
        title_text = ''.join(view_page.title.findAll(text=True))
        self.__dict__['title'] = unescape(
            unicode(title_text)).strip().encode('utf8')

        # Second request: the comments page, which carries the text itself.
        dataurl = "https://%s/text%s/comments/" % (host, text_ref)
        comments_page = BeautifulSoup(urllib2.urlopen(dataurl).read())
        template = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>'
        kwargs['raw'] = template % (
            self.title,
            unescape(unicode(
                comments_page.find(attrs={'id': 'textcontainer'})
            )).encode('utf8'),
        )

    kwargs['docid'] = docid
    super(Coment, self).__init__(*args, **kwargs)
    if 'stems' not in self.__dict__ or not self.stems:
        # let's calculate and cache the results
        models.tfidf.add_input_document(self.termcnt.keys())
        self.save()