def updateinfo():
    x = 0
    while x < 25:
        sleep(1)
        # BTC to USD: first file
        File = open("fileBtcUsd.txt", "a")
        myurl = urllib.urlopen(URL_BTC_USD)
        html_string = myurl.read()
        text = html2text(html_string).strip("{}")
        splitText = text.split(":")
        splitText[0] = 1 + x
        splitText[1] = float(splitText[1])
        File.write("\n%s" % str(splitText).strip("[]"))
        File.close()
        # EOS to BTC: second file
        File = open("fileEosBtc.txt", "a")
        myurl = urllib.urlopen(URL_EOS_BTC)
        html_string = myurl.read()
        text = html2text(html_string).strip("{}")
        splitText = text.split(":")
        splitText[0] = 1 + x
        splitText[1] = float(splitText[1])
        File.write("\n%s" % str(splitText).strip("[]"))
        File.close()
        x += 1
        print x
def number_words(self, page):
    text = None
    try:
        text = html2text(page.encode('utf8', 'ignore'))
    except UnicodeDecodeError:
        text = html2text(page)
    return len(re.findall(r'\w+', text))
def _parse_body(self, msg):
    content_type = msg.get_content_type()
    self.log.debug('Content-Type: ' + content_type)
    if content_type == 'text/html':
        body = msg.get_payload(decode=True)
        charset = msg.get_content_charset()  # need try:
        if charset is not None:
            self.log.debug("charset: " + charset)
            body = self._to_unicode(body, charset)
        body = unicode(body)
        from stripogram import html2text, html2safehtml
        body = html2text(body)
    else:
        # self.log.debug(msg.get_content_type())
        body = msg.get_payload(decode=1)
        charset = msg.get_content_charset()  # need try:
        if charset is not None:
            self.log.debug("charset: " + charset)
            body = self._to_unicode(body, charset)
    self.body = body
def index(urls):
    """
    Goal: download a list of webpages.

    Parameters:
        urls: list of strings, the address of each webpage.
    """
    if not os.path.isdir('files'):
        os.makedirs('files')
    for webpage in urls:
        name = webpage.split('/')[-1]
        os.system("wget " + webpage + " -q -O files/" + name)
        logging.info("Downloaded: " + name)
    b_o_w = {}
    for web_file in os.listdir('files'):
        try:
            text_html = open('files/' + web_file, 'r').read()
            text = [stem(word.lower()) for word in html2text(text_html).split()]
            b_o_w[web_file] = text
            logging.info("Tokenized: " + web_file)
        except:
            # Something strange happened with the webpage of New_York_City
            print("There is a problem with " + web_file)
    index_file = open("index_file.pck", "w")
    pickle.dump(b_o_w, index_file)
    index_file.close()
def convert(self, doc, encoding, mimetype, logError=False, raiseException=False):
    """Convert a PowerPoint document to raw text."""
    tmp_name = self.saveFile(doc)
    err = TmpFile('')
    if sys.platform == 'win32':
        html = self.execute('ppthtml "%s" 2> "%s"' % (tmp_name, str(err)))
    else:
        html = self.execute('ppthtml "%s" 2> "%s"' % (tmp_name, str(err)))
    try:
        errors = open(str(err), 'r+').read()
    except OSError:
        errors = ""
    if errors:
        if logError:
            LOG.warn('Converter %s experienced an error %s' % (
                self.content_description, errors))
        if raiseException:
            raise ConversionError(errors)
    return html2text(html, ignore_tags=('img',), indent_width=4,
                     page_width=80), 'iso-8859-15'
def textread(content):
    strtext = html2text(str(content))
    strtext = strtext.replace('\n', " ")
    strtext = strtext.replace('\"', "")
    strtext = strtext.replace('\'', "")
    strtext = strtext.lower()
    return strtext
def convert(self, html):
    """Convert HTML data to raw text."""
    return html2text(html, ignore_tags=('img',), indent_width=0,
                     page_width=256)
def _processPageBody(self, page_body):
    """Process the link body with the strip-o-gram library,
    catching only the page content.
    """
    ignored_tags = ('img', 'style')
    page_content = html2text(page_body, ignore_tags=ignored_tags)
    return page_content
def htmlToText(original_html):
    # clean_html = html2safehtml(original_html, valid_tags=("b", "a", "i", "br", "p"))
    # Don't process <img> tags, just strip them out. Use an indent of 4 spaces
    # and a page that's 80 characters wide.
    text = html2text(original_html, ignore_tags=("img",), indent_width=4,
                     page_width=80)
    # text = html2text(original_html)
    return text
def oneshot_upload(request, entry_id):
    entry = models.Entry.objects.filter(name__exact=entry_id)
    if not entry:
        return HttpResponse('Invalid entry *short* name')
    entry = entry[0]
    challenge = entry.challenge

    version = int(request.POST.get('version', '1'))
    if version < 2:
        return HttpResponse('Please update your pyweek-upload.py script')

    data = request.POST
    user = request.POST.get('user', '')
    if not user:
        return HttpResponse('Invalid login')
    user = models.User.objects.filter(username__exact=user)
    if not user:
        return HttpResponse('Invalid login')
    user = user[0]
    password = request.POST.get('password', '')
    if not user.check_password(password):
        return HttpResponse('Invalid login')

    # check authorisation
    if user not in entry.users.all() or not entry.isUploadOpen():
        return HttpResponse("You're not allowed to upload files!")

    # make sure the user isn't sneaky
    is_final = bool(request.POST.get('is_final', False))
    if is_final and not challenge.isFinalUploadOpen():
        return HttpResponse('Final uploads are not allowed now')

    # avoid dupes
    if os.path.exists(os.path.join(MEDIA_ROOT, str(challenge.number),
                                   entry.name,
                                   request.FILES['content_file'].name)):
        return HttpResponse('File with that filename already exists.')

    upload_file = request.FILES['content_file']
    file = models.File(
        challenge=challenge,
        entry=entry,
        user=user,
        created=datetime.datetime.now(models.UTC),
        content=upload_file,
        description=html2text(data.get('description', '')),
        is_final=bool(data.get('is_final', False)),
        is_screenshot=bool(data.get('is_screenshot', False)),
        thumb_width=0,
    )
    file.save()

    if file.is_final:
        entry.has_final = True
        entry.save()

    if data['is_screenshot']:
        try:
            _make_thumbnail(upload_file)
        except IOError as e:
            return HttpResponse('Error uploading screenshot: {}'.format(e))

    return HttpResponse('File added!')
def analyze_results(self):
    # print "Put the code here to analyze the reviews"
    try:
        self.reset_data()
        if self.RatingSummary is not None:
            self.RatingSummary.pack_forget()
        self.dataText.pack(fill=Y)
        rdata = requests.get(self.URLtext.get() + self.requestCriteria)
        # url = self.URLtext.get()
        soup = BeautifulSoup(rdata.content)
        reviewSections = soup.findAll("div", {"class": "review-wrapper"})
        if len(reviewSections) != 0:
            reviewSections.pop(0)
        for reviewSection in reviewSections:
            reviewContent = reviewSection.findAll('p')[0]
            self.reviewList.append(html2text(reviewContent.text))
        # tkMessageBox.showinfo("URL entered", self.reviewList[0])
        self.dataText.delete('1.0', END)
        reviewText = ""
        for review in self.reviewList:
            reviewText = reviewText + review + ("\n" * 3)
        self.dataText.insert(END, reviewText)
        reviewSentiments = classifyReviews(self.reviewList)
        for reviewSentiment in reviewSentiments:
            if reviewSentiment == "pos":
                self.positiveRatings += 1
            if reviewSentiment == "neg":
                self.negativeRatings += 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        raise
def post(self):
    category = html2text(self.get_argument('data_name', ''))
    result = self.db.category.remove({'name': category}, safe=True)
    if result['n'] == 0:
        return self.response_json(0, 'Delete Fail')
    self.db.rss.update({'category': category},
                       {'$set': {'category': 'Other'}})
    return self.response_json(1, 'Success')
def _strip_html(self, html):
    """Remove HTML for use in RSS."""
    if html2text is not None:
        it = ('img', 'a')
        text = html2text(html, ignore_tags=it, indent_width=4, page_width=80)
        return text
    return html
def _processPageBody(self, page_body, content_type):
    """Process the link body with the strip-o-gram library,
    catching only the page content.
    """
    # XXX Improve by extracting text from other content types
    if content_type and 'html' not in content_type:
        return ''
    ignored_tags = ('img', 'style')
    page_content = html2text(page_body, ignore_tags=ignored_tags)
    return page_content
def singlePageScrape(no):
    global places
    h = HTMLParser.HTMLParser()
    page = requests.get('https://www.list.co.uk/places/location:Glasgow(55.8621,-4.2465)/distance:10/page:' + no + "/#results'")
    tree = html.fromstring(page.text)
    buyers = tree.xpath('//h2[@class="head"]/text()')
    prices = tree.xpath('//span[@class="postal-code"]/text()')
    count = 0
    for item in buyers:
        places[str(item.encode('ascii', 'ignore'))] = \
            str(html2text(prices[count]).encode('ascii', 'ignore'))
        count += 1
def post(self):
    category = html2text(self.get_argument('category', ''))
    if len(category) == 0:
        return self.response_json(0, 'Category Name Required')
    if self.db.category.find_one({'name': category}):
        return self.response_json(0, 'Category Exists')
    if self.db.category.insert({'name': category}):
        return self.response_json(1, 'Success')
    return self.response_json(0, 'Failed')
def handle(self, *args, **options):
    compiled_pattern = re.compile(PATTERN)

    # metro:
    print "metro"
    consolidated_text = ""
    for line in urllib2.urlopen(URL % METRO):
        consolidated_text += line
    print html2text(consolidated_text)[72:75]
    for result in compiled_pattern.findall(consolidated_text):
        (str_ligne, raison) = result
        incident = Incident()
        incident.line = Line.objects.get_or_create(name=str_ligne.strip())[0]
        incident.reason = raison.strip()
        incident.contributors = 'RATP'

    # rer:
    print "rer"
    consolidated_text = ""
    for line in urllib2.urlopen(URL % RER):
        consolidated_text += line
    for result in compiled_pattern.findall(consolidated_text):
        print result
def obtemHTML(self):
    try:
        httpconn = httplib.HTTPConnection(self.site)
        httpconn.request("GET", self.link)
        resp = httpconn.getresponse()
        html = resp.read()
        self.cache = self.cache + html2text(html)
    except:
        html = ""
    self.feed(html)
    if self.proximoLink() != '':
        self.obtemHTML()
    return html
def convert(self, doc, encoding=None, mimetype=None,
            logError=False, raiseException=False):
    # convert to unicode
    if not isinstance(doc, unicode):
        if not encoding:
            mo = charset_reg.search(doc)
            if mo is not None:
                encoding = mo.group(1)
            else:
                encoding = 'ascii'  # guess
        doc = unicode(doc, encoding, 'replace')
    doc = convert_entities(doc)
    result = html2text(doc)
    # convert back to utf-8
    return result.encode('utf-8'), 'utf-8'
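# The convert() above references a charset_reg pattern that is not defined in
# the snippet. A minimal sketch of a plausible definition (an assumption, not
# the original source): a regex whose group(1) captures the charset name from
# an HTML meta content-type declaration, matching how mo.group(1) is used.
import re

charset_reg = re.compile(r'charset=([\w\-]+)', re.IGNORECASE)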
def get_message_body(self, message):
    """
    Get the body of an email.

    Recursively look for the body across the different mimetypes and
    return it as text/plain.
    """
    if 'payload' in message:
        return self.get_message_body(message['payload'])
    elif 'parts' in message:
        return self.get_message_body(message['parts'][0])
    else:
        data = base64.urlsafe_b64decode(message['body']['data'].encode('ASCII'))
        markdown_data = html2text(data)
        data = data.replace("\n", "<br/>")
        # return {markdown, html}
        if markdown_data:
            return {'markdown': unicode(markdown_data, "ISO-8859-1"),
                    'html': unicode(data, "ISO-8859-1")}
        return {'html': unicode(data, "ISO-8859-1")}
def send_html_mail_nt(subject, sender=settings.DEFAULT_FROM_EMAIL, recip="",
                      context=None, html_template="", text_template="",
                      sender_name="", html_content="", text_content="",
                      recip_list=None, sender_formatted=""):
    from stripogram import html2text
    from feedparser import _sanitizeHTML

    if not context:
        context = {}
    if html_template:
        html = render(context, html_template)
    else:
        html = html_content
    if text_template:
        text = render(context, text_template)
    else:
        text = text_content
    if not text:
        text = html2text(_sanitizeHTML(html, charset))

    if not recip_list:
        recip_list = []
    if recip:
        recip_list.append(recip)

    try:
        if getattr(settings, "EMAIL_USE_SSL", False):
            server = SMTP_SSL(settings.EMAIL_HOST, settings.EMAIL_PORT)
        else:
            server = SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        if settings.EMAIL_USE_TLS:
            server.ehlo()
            server.starttls()
            server.ehlo()
        if settings.EMAIL_HOST_USER and settings.EMAIL_HOST_PASSWORD:
            server.login(settings.EMAIL_HOST_USER, settings.EMAIL_HOST_PASSWORD)
    except Exception, e:
        print e
        return
def htmlmail(sbj, recip, msg, template='', texttemplate='', textmsg='',
             images=(), recip_name='', sender=settings.DEFAULT_FROM_EMAIL,
             sender_name='', charset=charset):
    """
    If you want to use the Django template system, pass `msg` (and optionally
    `textmsg`) as the template context (a dict) and set `template` (and
    optionally `texttemplate`). Otherwise `msg` and `textmsg` are used
    directly as the HTML and text message sources.

    If you want to use images in the HTML message, pass physical paths and
    content ids as tuples (image paths are relative to MEDIA_ROOT), e.g.:

        images=(('email_images/logo.gif', 'img1'),
                ('email_images/footer.gif', 'img2'))

    and reference them in the HTML like this:

        <img src="cid:img1"> ... <img src="cid:img2">
    """
    html = render(msg, template)
    if texttemplate or textmsg:
        text = render((textmsg or msg), texttemplate)
    else:
        text = html2text(_sanitizeHTML(html, charset))

    msgRoot = MIMEMultipart('related')
    msgRoot['Subject'] = sbj
    msgRoot['From'] = named(sender, sender_name)
    msgRoot['To'] = named(recip, recip_name)
    msgRoot.preamble = 'This is a multi-part message in MIME format.'

    msgAlternative = MIMEMultipart('alternative')
    msgRoot.attach(msgAlternative)
    msgAlternative.attach(MIMEText(text, _charset=charset))
    msgAlternative.attach(MIMEText(html, 'html', _charset=charset))

    for img in images:
        fp = open(settings.MEDIA_ROOT + img[0], 'rb')
        msgImage = MIMEImage(fp.read())
        fp.close()
        msgImage.add_header('Content-ID', '<' + img[1] + '>')
        msgRoot.attach(msgImage)

    smtp = SMTP()
    smtp.connect(smtp_server)
    if smtp_user:
        smtp.login(smtp_user, smtp_pass)
    smtp.sendmail(sender, recip, msgRoot.as_string())
    smtp.quit()
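# A minimal usage sketch for htmlmail() above. The subject, recipient,
# context values, and template name are hypothetical; the image path and
# "cid:" convention come from the snippet's own docstring.
htmlmail(
    'Weekly digest',                       # subject
    'reader@example.com',                  # recipient (hypothetical address)
    {'name': 'Reader'},                    # template context dict
    template='email/digest.html',          # hypothetical Django template
    images=(('email_images/logo.gif', 'img1'),),  # referenced as <img src="cid:img1">
)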
def quote(self, result):
    # TODO: only return quotes when the search term is not in the title
    search = self.request.SearchableText
    if result.portal_type in ('JournalPaper',):
        result_text = html2text(result.getAbstract,
                                ignore_tags=('a', 'span', 'br', 'p'))
    else:
        # Due to the catalog being out of sync, result.SearchableText
        # sometimes returns 'Missing.Value', while
        # result.getObject().SearchableText returns as expected. So in that
        # case we must get the object. We test this with basestring (True
        # for str and unicode).
        if hasattr(result, 'SearchableText'):
            if isinstance(result.SearchableText, basestring):
                result_text = result.SearchableText
            else:
                result_text = result.getObject().SearchableText()
        else:
            result_text = result.getObject().SearchableText()
    # TODO: catch errors
    quotes = set()
    search_text = self.request.SearchableText.split()
    for t in search_text:
        t = t.lower()
        t = t.replace('"', '')
        if t in result_text.lower():
            lines = re.split(r'\s*[!?.]\s*', result_text)
            for line in lines:
                if t in line.lower() and len(quotes) < 3:
                    quotes.add(line)
    return quotes
def scolar_news_summary_rss(context, title, sco_url, n=5):
    """RSS feed for scolar news."""
    news = scolar_news_summary(context, n=n)
    items = []
    for n in news:
        text = html2text(n['text'])
        items.append(PyRSS2Gen.RSSItem(
            title=unicode('%s %s' % (n['rssdate'], text), SCO_ENCODING),
            link=sco_url + '/' + n['url'],
            pubDate=n['date822']))
    rss = PyRSS2Gen.RSS2(
        title=unicode(title, SCO_ENCODING),
        link=sco_url,
        description=unicode(title, SCO_ENCODING),
        lastBuildDate=datetime.datetime.now(),
        items=items)
    f = StringIO()
    rss.write_xml(f)
    f.seek(0)
    data = f.read()
    f.close()
    return data
from stripogram import html2text
import sys
import httplib2

if __name__ == '__main__':
    http = httplib2.Http()
    headers, body = http.request(sys.argv[1])
    ctype = headers['content-type']
    charset = ctype[ctype.index('charset=') + 8:]
    body = body.decode('UTF-8')
    try:
        text = html2text(body.encode('utf8', 'ignore'))
    except UnicodeDecodeError:
        text = html2text(body)
    print text
messages.append(source_message)
print messages
for item in messages:
    for destination in sync.destination.all():
        print destination.sn_type
        if destination.sn_type.code == 'vk' and destination.enabled:
            vk = VK(vk_settings)
            attachments = []
            message = ""
            if 'title' in item:
                message = "%s" % (item['title'])  # , html2safehtml(item['text'])
            if 'text' in item:
                try:
                    message += " %s" % html2text(item['text'])
                except Exception:
                    message += " %s" % strip_tags(item['text'])
            if item['attachements']:
                for attach in item['attachements']:
                    attachments.append(attach['src'])
            if message:
                res = vk.VKPost(destination, message, attachments)
                try:
                    js = json.loads(res)
                    if 'error' in js:
                        raise Exception, 'error found in vk response'
                    else:
                        print res
                except Exception:
                    pass
def html_to_text(url):
    myurl = urllib.urlopen(url)
    html_string = myurl.read()
    text = html2text(html_string)
    return text
def entry_upload(request, entry_id):
    if request.user.is_anonymous():
        return HttpResponseRedirect('/login/')
    entry = get_object_or_404(models.Entry, pk=entry_id)
    challenge = entry.challenge
    is_member = request.user in entry.users.all()
    if not is_member or not entry.isUploadOpen():
        messages.error(request, "You're not allowed to upload files!")
        return HttpResponseRedirect('/e/%s/' % entry_id)

    if request.method == 'POST':
        f = FileForm(request.POST, request.FILES)
    else:
        f = FileForm()

    info = {
        'challenge': challenge,
        'entry': entry,
        'files': entry.file_set.all(),
        'is_member': True,
        'is_owner': True,
        'form': f,
    }

    # just display the form?
    if not f.is_valid():
        return render_to_response('challenge/entry_file.html', info,
                                  context_instance=RequestContext(request))

    # make sure the user isn't sneaky
    if f.cleaned_data['is_final'] and not challenge.isFinalUploadOpen():
        f._errors['is_final'] = f.error_class(
            ["Final uploads are not allowed now."])
        return render_to_response('challenge/entry_file.html', info,
                                  context_instance=RequestContext(request))

    # avoid dupes
    if os.path.exists(os.path.join(MEDIA_ROOT, str(challenge.number),
                                   entry.name, request.FILES['content'].name)):
        f._errors['content'] = f.error_class(
            ["File with that filename already exists."])
        return render_to_response('challenge/entry_file.html', info,
                                  context_instance=RequestContext(request))

    file = models.File(
        challenge=challenge,
        entry=entry,
        user=request.user,
        created=datetime.datetime.utcnow(),
        content=request.FILES['content'],
        description=html2text(f.cleaned_data['description']),
        is_final=f.cleaned_data['is_final'],
        is_screenshot=f.cleaned_data['is_screenshot'],
        thumb_width=0,
    )
    file.save()

    if file.is_final:
        entry.has_final = True
        entry.save()

    if file.is_screenshot:
        try:
            _make_thumbnail(file)
        except:
            # XXX need feedback with custom error "file is not an image"
            messages.error(request, 'File is not an image')
            return render_to_response('challenge/entry_file.html', info,
                                      context_instance=RequestContext(request))

    messages.success(request, 'File added!')
    return HttpResponseRedirect('/e/%s/' % entry_id)
def get_context(lines, i, nmax):
    c = html2text(string.join(lines[i:i + 5]))
    c = c.replace("\n", " ")
    c = ' '.join(c.split())
    c = c[:nmax]
    return c
def upload_award(request, entry_id):
    creator = request.user
    if creator.is_anonymous():
        return HttpResponseRedirect('/login/')
    entry = get_object_or_404(models.Entry, pk=entry_id)
    challenge = entry.challenge
    is_member = creator in entry.users.all()
    if is_member:
        messages.error(request, 'You cannot give an award to your own entry!')
        return HttpResponseRedirect('/e/%s/' % entry_id)

    info = dict(
        challenge=challenge,
        entry=entry,
        awards=creator.award_set.all(),
        give_form=GiveAwardForm(creator),
    )

    errors = None
    if request.method == 'POST':
        f = UploadAwardForm(request.POST, request.FILES)
    else:
        f = UploadAwardForm()
    info['upload_form'] = f

    # display the form
    if not f.is_valid():
        return render(request, 'challenge/upload_award.html', info)

    # make sure the filename is unique
    # if os.path.exists(fspath):
    #     error = 'You have already uploaded an award image with that filename.'

    # check the dimensions of the image
    ok = False
    try:
        image = Image.open(request.FILES['content'])
        if image.size == (64, 64):
            ok = True
    except:
        pass
    if not ok:
        messages.error(request, 'The image could not be read or is not 64x64')
        return render(request, 'challenge/upload_award.html', info)

    # write the award image to disk
    award = models.Award(
        creator=creator,
        content=request.FILES['content'],
        description=html2text(f.cleaned_data['description']),
    )
    award.save()

    if _give_award(challenge, creator, entry, award):
        messages.success(request, 'Award given!')
    else:
        messages.error(request, 'This entry already has that award.')
    return HttpResponseRedirect('/e/%s/' % entry_id)
opfilepath = opdirpath + allNames[len(allNames) - 1]
soup = None
try:
    soup = bs(open(filepath))
    soup = soup.body
except Exception:
    print("cannot open:" + filepath, file=error)
    continue
if not soup:
    print("contains no body:" + filepath, file=error)
    continue
heading = soup.find('h1', class_='firstHeading')
if heading:
    heading = html2text(str(heading)).strip()
paragraphlist = soup.findAll('p')
if paragraphlist or heading:
    try:
        opfile = open(opfilepath, 'w+')
    except Exception:
        print("cannot create o/p filename:" + opfilepath, file=error)
        continue
else:
    continue
if heading:
    print(heading, file=opfile)
    print(heading, file=opfile)
def get_text2(url):
    from stripogram import html2text
    r = requests.get(url)
    text = html2text(r.content).encode('utf-8')
    # print text
    return text
import urllib
from stripogram import html2text

myurl = urllib.urlopen("http://tuxworld.wordpress.com")
html_string = myurl.read()
text = html2text(html_string)
print text
def entry_display(request, entry_id):
    entry = get_object_or_404(models.Entry, pk=entry_id)
    challenge = entry.challenge
    user_list = entry.users.all()
    is_member = request.user in list(user_list)

    files = entry.file_set.filter(
        is_screenshot__exact=True).order_by("-created")[:1]
    thumb = None
    if files:
        thumb = files[0]

    # handle adding the ratings form and accepting ratings submissions
    f = None
    if entry.may_rate(request.user, challenge) and challenge.isRatingOpen():
        errors = {}

        # get existing scores
        rating = None
        for rating in entry.rating_set.filter(user__id__exact=request.user.id):
            break

        # fields for rating editing
        if request.method == 'POST':
            f = RatingForm(request.POST)
            if f.is_valid():
                if rating is not None:
                    # edit existing
                    rating.disqualify = f.cleaned_data['disqualify']
                    rating.nonworking = f.cleaned_data['nonworking']
                    rating.fun = f.cleaned_data['fun']
                    rating.innovation = f.cleaned_data['innovation']
                    rating.production = f.cleaned_data['production']
                    rating.comment = html2text(f.cleaned_data['comment'])
                else:
                    # create new
                    rating = models.Rating(
                        entry=entry,
                        user=request.user,
                        disqualify=f.cleaned_data['disqualify'],
                        nonworking=f.cleaned_data['nonworking'],
                        fun=f.cleaned_data['fun'],
                        innovation=f.cleaned_data['innovation'],
                        production=f.cleaned_data['production'],
                        comment=html2text(f.cleaned_data['comment']),
                    )
                rating.save()
                messages.info(request, 'Ratings saved!')
                return HttpResponseRedirect("/e/%s/" % entry.name)
        elif rating is not None:
            data = dict(disqualify=rating.disqualify,
                        nonworking=rating.nonworking,
                        fun=rating.fun,
                        innovation=rating.innovation,
                        production=rating.production,
                        comment=rating.comment)
            f = RatingForm(data)
        else:
            f = RatingForm()

    rating_results = False
    if challenge.isAllDone() and entry.has_final:
        # display ratings
        d = rating_results = entry.tally_ratings()
        d['dp'] = '%d%%' % (d.get('disqualify', 0) * 100)
        d['dnwp'] = '%d%%' % (d.get('nonworking', 0) * 100)

    return render_to_response('challenge/entry.html', {
        'challenge': challenge,
        'entry': entry,
        'files': entry.file_set.all(),
        'thumb': thumb,
        'diary_entries': entry.diary_entries(),
        'is_user': not request.user.is_anonymous(),
        'is_member': is_member,
        'is_team': len(user_list) > 1,
        'is_owner': entry.user == request.user,
        'form': f,
        'rating': rating_results,
        'awards': entry.entryaward_set.all(),
    }, context_instance=RequestContext(request))
# webExtractor
# Created by JKChang
# 30/05/2017, 21:39
# Tag: web extractor
# Description:

import urllib.request, urllib.error, urllib.parse
import re

import requests
from bs4 import BeautifulSoup
from stripogram import html2text

url = 'http://www.w3resource.com/python-exercises/python-basic-exercises.php'
html = requests.get(url).text
text = html2text(html)

l = text.encode('utf-8').split('\n')
index = 1
pat = r'(\d+)\. '
for line in l:
    # if len(line) == 0:
    #     continue
    # elif line[0].isdigit():
    print(line)

# if line[0].isdigit():
#     print line.encode('utf-8')

# print html.encode('utf-8')

# soup = BeautifulSoup(html, 'lxml')
# res = soup.findAll("article", {"class": "listingItem"})
n = re.sub('<(\w+@\w+(?:\.\w+)+)>', '', m)
remitente = n
## print (remitente)
## print ("------------------------------------------")

typ, asunto = M.fetch(num, '(BODY[HEADER.FIELDS (SUBJECT)])')
# Strip the leading "Subject: " header name
m = re.sub('^Subject: ', '', asunto[0][1])
textoasunto = m
## print (textoasunto)
## print ("------------------------------------------")

typ, cuerpo = M.fetch(num, '(BODY[TEXT])')
texto = cuerpo[0][1]

# fields for the insert
propietarioI = remitente
emailpropietarioI = direccion
asuntoI = textoasunto
# This makes the body insert correctly...
# textoI = unicode(texto, "latin-1")
textoI2 = unicode(html2text(texto, ignore_tags=("img",), indent_width=4,
                            page_width=80), "latin-1")
fechaI = datetime.datetime.now()

# Insert the incident as pending; relestado_id '1' must always match the
# state id used for pending incidents.
relestado_idI = '1'
c.execute("INSERT INTO gestorincidencias_incidencias "
          "(propietario,emailpropietario,asunto,texto,fecha,relestado_id) "
          "VALUES (%s,%s,%s,%s,%s,%s)",
          (propietarioI, emailpropietarioI, asuntoI, textoI2, fechaI,
           relestado_idI))
conn.commit()

M.store(num, '+FLAGS', '\\Deleted')
M.expunge()
M.close()
M.logout()
payload = msgpart.get_payload(decode=True)
if msgparttype == 'text/plain':
    emailTEXT = payload
elif msgparttype == 'text/html':
    emailHTML = payload
else:
    payload = msg.get_payload(decode=True)
    if contenttype[0] == 'text/plain':
        emailTEXT = payload
    elif contenttype[0] == 'text/html':
        emailHTML = payload

# If no TEXT version exists, convert the HTML version to TEXT
if not emailTEXT:
    if emailHTML:
        emailTEXT = stripogram.html2text(
            emailHTML.decode("utf-8", "replace").encode("utf-8")).lstrip()

# Get Yahoo Term Extractor generated terms.
# Results in UnicodeDecodeError on some emails, which can be ignored similar to
# http://code.djangoproject.com/attachment/ticket/1086/feeds.py.2.diff
# or replaced similar to https://bugs.launchpad.net/gpodder/+bug/252506
keywords = None
if emailTEXT:
    keywords = termExtraction(appid, emailTEXT)[-5:]

# Encode emailTEXT and emailHTML for insertion into newer sqlite versions
if emailTEXT:
    emailTEXT = emailTEXT.decode('utf-8', 'replace')
if emailHTML:
    emailHTML = emailHTML.decode('utf-8', 'replace')
def test():
    # NB: scratch/debug endpoint. Each block below was toggled by moving the
    # early return; everything after the first return is unreachable code
    # kept for reference.
    gp = {}
    for d in mdb.deputes.find({}, {'stats.election': 1, 'groupe_abrev': 1}):
        g = d['groupe_abrev']
        if not g in gp.keys():
            gp[g] = []
        gp[g].append(d['stats']['election']['inscrits'])
    moy = {}
    import numpy
    for g, v in gp.iteritems():
        moy[g] = numpy.median(numpy.array(v))
    return json_response(moy)

    stats = dict(groupe=0, dissidence=0, abstention=0)
    for s in mdb.scrutins.find({'scrutin_num': {'$nin': [404, 405, 406]}},
                               {'scrutin_positions': 1}):
        spos = s['scrutin_positions']['REM']
        for pos in ['pour', 'contre', 'abstention']:
            if pos in ['pour', 'contre']:
                if spos['position'] != pos:
                    stats['dissidence'] += spos.get(pos, 0)
                else:
                    stats['groupe'] += spos.get(pos, 0)
            else:
                stats[pos] += spos.get(pos, 0)
    return json_response(stats)

    from obsapis.tools import parse_content
    import requests
    from lxml import etree
    url = "http://www.assemblee-nationale.fr/15/dossiers/dons_jours_repos_aidants_familiaux.asp"
    # url = "http://www.assemblee-nationale.fr/15/dossiers/soutien_collectivites_accueil_gens_voyage.asp"
    r = requests.get(url)
    xml = parse_content(r.content)
    print xml.xpath('//a[text()[contains(.,"Proposition de loi")]]/@href')

    from stripogram import html2text, html2safehtml
    doc = html2text(r.content, page_width=10000).decode('iso8859-1').split(u'\n\n')
    start = False
    bloc = ""
    done = False
    for i, l in enumerate(doc):
        l = l.replace(u'1ère', u'première')
        if 'Proposition de loi' in l:
            start = True
            done = False
        elif len(l) < 4:
            if start == True:
                start = False
                done = True
        if start:
            bloc += l
        if done:
            print bloc
            done = False
            start = False
            m1 = re.search(r'n. *([0-9]+).*d\xe9pos\xe9e? le ([0-9]+ [^ ]+ [0-9]+).*mis en ligne le ([0-9]+ [^ ]+ [0-9]+).*renvoy\xe9e? \xe0 (.*)', bloc)
            # m1 = re.search(r'n° *([0-9]+).*d\xe9pos\xe9e? le ([0-9]+ [^ ]+ [0-9]+).*mis en ligne le ([0-9]+ [^ ]+ [0-9]+).*renvoy\xe9e? \xe0 (.*)', bloc)
            # print m1
            if m1:
                print m1.groups()
            bloc = ""
    return "ok"

    gps = {}
    import datetime
    return json_response(mdb.amendements.find_one({'auteurs': None}))
    for d in mdb.deputes.find({'depute_actif': True},
                              {'depute_ddn': 1, 'groupe_abrev': 1, 'groupe_libelle': 1}):
        age = (datetime.datetime.now() - datetime.datetime.strptime(
            d['depute_ddn'], '%d/%m/%Y')).days / 365.25
        gps[d['groupe_libelle']] = gps.get(d['groupe_libelle'], []) + [age]
    from numpy import median, average
    for k, v in gps.iteritems():
        print v
        print "%s - moyenne : %.2f, mediane : %.2f" % (k, average(v), median(v))
    # for i, d in enumerate(mdb.documentsan.find()):
    #     d['contenu'] = d['titre'] + d.get('contenu', '')
    #     mdbrw.documentsan.update_one({'id': d['id']}, {'$set': {'contenu': d['contenu']}})
    #     print i
    # import_amendements()
    return json_response(gps)

    # mdbrw.travaux.remove({'sort': '44'})
    # update_travaux()
    # return json_response(list(mdb.travaux.find({'sort': '44'})))
    # return json_response(mdb.questions.find_one({}))
    # import_qag()
    return json_response(mdb.travaux.find_one())
    # return json_response(mdb.interventions.find({'itv_rapporteur': None}))
    # return json_response(mdb.interventions.find({'itv_rapporteur': None}).distinct('itv_date'))
    # return json_response(mdb.interventions.find({'$and': [{'itv_rapporteur': True}, {'depute_shortid': 'ericcoquerel'}]}))

    from obsapis.controllers.admin.updates.interventions import update_stats_interventions
    deppdp = {}
    # return json_response(update_stats_interventions())
    for pdp in update_stats_interventions():
        dep = pdp['_id'].get('depute', None)
        if dep:
            if not dep in deppdp.keys():
                deppdp[dep] = dict(n=0, rap=0)
            deppdp[dep]['rap' if pdp['_id']['rapporteur'] else 'n'] += pdp['n']
    return json_response(', '.join('%d. %s (%d)' % (i + 1, d[0], d[1]['n'] + d[1]['rap'])
                                   for i, d in enumerate(sorted(deppdp.items(),
                                                                key=lambda x: x[1]['n'] + x[1]['rap'],
                                                                reverse=True))))

    counts = {}
    nbmembres = dict((g['groupe_abrev'], g['groupe_nbmembres'])
                     for g in mdb.groupes.find({}, {'groupe_abrev': 1, 'groupe_nbmembres': 1}))
    for q in mdb.questions.find({'groupe': {'$ne': None}}, {'groupe': 1}):
        g = q['groupe']
        if not g in counts.keys():
            counts[g] = 0
        counts[g] += 1
    return json_response(["%s (%d)" % (g, n / nbmembres[g])
                          for g, n in sorted(counts.items(),
                                             key=lambda x: x[1] / nbmembres[x[0]],
                                             reverse=True)])

    col = []
    for d in mdb.deputes.find({}, {'depute_collaborateurs_hatvp': 1, '_id': None, 'depute_shortid': 1}):
        col.append((d['depute_shortid'], len(d.get('depute_collaborateurs_hatvp', []))))
    return json_response(sorted(col, key=lambda x: x[1], reverse=True)[:20])

    import datetime
    # mdbrw.deputes.update_one({'depute_shortid': 'michelevictory'}, {'$unset': {'stats.commissions': ""}})
    return json_response(mdb.deputes.find_one({}, {'depute_hatvp': 1}))
    return json_response([d['depute_shortid'] for d in mdb.deputes.find({'stats.commissions.present': 0.0})])
    # {'$and': [{'depute_actif': True}, ]}
    # [('stats.nonclasse', 1), ('stats.ranks.down.exprimes', 1)]
    return json_response(list(d['depute_shortid']
                              for d in mdb.deputes.find({'depute_mandat_debut': {'$gte': datetime.datetime(2017, 5, 21)}},
                                                        {'depute_shortid': 1})))
    return json_response([d['depute_shortid']
                          for d in mdb.deputes.find({'$and': [{'$or': [{'depute_actif': True},
                                                                       {'depute_shortid': 'michelevictory'}]},
                                                              {u'stats.positions.exprimes': {'$ne': None}}]})
                          .sort([('stats.nonclasse', 1), ('stats.ranks.down.exprimes', 1)]).limit(5)])

    for d in mdb.deputes.find({'depute_election': None}):
        circo = d['depute_circo_id']
        titulaire = mdb.deputes.find_one({'$and': [{'depute_circo_id': circo},
                                                   {'depute_election': {'$ne': None}}]})
        mdbrw.deputes.update_one({'depute_shortid': d['depute_shortid']},
                                 {'$set': {'depute_election': titulaire['depute_election']}})
    return "oj"

    # mdbrw.questions.update_many({'legislature': None}, {'$set': {'legislature': 15}})
    # update_travaux()
    # return json_response(mdb.interventions.find_one({}))
    return json_response(list(q['itv_contenu_texte']
                              for q in mdb.interventions.find({'depute_shortid': 'mariechristineverdierjouclas'})))
    return json_response(mdb.travaux.distinct('type'))
    # for a in mdb.amendements.find({'suppression': True}, {'id': 1}):
    #     mdbrw.travaux.update_many({'idori': a['id']}, {'$set': {'suppression': True}})
    # mdbrw.travaux.remove({'idori': 'S-AMANR5L15PO419610B155N7'})
    # mdbrw.amendements.remove({'id': {'$in': amdlist}})
    # mdbrw.travaux.remove({'idori': {'$in': amdlist}})
    # import_amendements()
    return json_response(list(q['description']
                              for q in mdb.travaux.find({'$and': [{'auteur': {'$ne': False}},
                                                                  {'type': 'QE'},
                                                                  {'depute': 'francoisruffin'}]})))
    return json_response(list(mdb.travaux.find({'idori': 'S-AMANR5L15PO419610B155N7'})))
    print mdb.travaux.count()
    return json_response(list(t['description'] for t in mdb.travaux.find({'groupe': 'FI'})))
    # updateDeputesTravaux()
    # importdocs()
    # import_qag()
    return json_response(mdb.deputes.find_one({'depute_shortid': 'francoisruffin'}))
    # importdocs()
    # return json_response(mdb.documentsan.find_one({'$and': [{'typeid': 'propositionderesolution'}, {'cosignataires.id': 'francoisruffin'}]}))

    ops = []
    pgroup = {'n': {'$sum': 1}}
    pgroup['_id'] = {'depute': '$auteurs'}
    pipeline = [{'$match': {}}, {'$unwind': '$auteurs'}, {"$group": pgroup}]
    # 'scrutin_typedetail': 'amendement'
    return json_response(sum(d['n'] for d in mdb.documentsan.aggregate(pipeline)))
    print len(list(mdb.documentsan.aggregate(pipeline))), mdb.documentsan.count()
    # return json_response(mdb.amendements.find({'suppression': True}, {'dispositif': 1}).count())
    # mdbrw.scrutins.update_one({'scrutin_num': 324}, {'$set': {'scrutin_liendossier': 'http://www.assemblee-nationale.fr/15/dossiers/deuxieme_collectif_budgetaire_2017.asp'}})
    # return json_util.dumps(list(mdb.amendements.find({'numAmend': '426'})))
    # mdbrw.scrutins.update_one({'scrutin_num': 1}, {'$set': {'scrutin_groupe': 'Gouvernement', 'scrutin_lientexte': [(u'déclaration de politique générale',
    #     'http://www.gouvernement.fr/partage/9296-declaration-de-politique-generale-du-premier-ministre-edouard-philippe',
    # mdbrw.votes.update_many({'scrutin_num': 1}, {'$set': {'scrutin_groupe': 'Gouvernement'}})
    # return json_response([(d['depute_shortid'], d['depute_mandat_fin_cause']) for d in mdb.deputes.find({'depute_actif': False}, {'depute_shortid': 1, 'depute_mandat_fin_cause': 1, '_id': None})])
    # mdbrw.scrutins.update_one({'scrutin_num': 357}, {'$set': {'scrutin_lientexte.0.1': 'http://www.assemblee-nationale.fr/15/dossiers/jeux_olympiques_paralympiques_2024.asp#'}})
    # return json_response(mdb.scrutins.find_one({'scrutin_num': 357}))
    return json_response(mdb.documentsan.distinct('type'))

    # visuels
    pgroup = {}
    pgroup['n'] = {'$sum': 1}
    pgroup['_id'] = {'depute': '$depute'}
    pipeline = [{'$match': {'name': 'visuelstat'}}, {'$group': pgroup}]
    vdeps = []
    for g in mdb.logs.aggregate(pipeline):
        _g = g['_id']['depute']
        if _g != None:
            vdeps.append((_g, g['n']))
    return ", ".join(["%s (%s)" % i for i in sorted(vdeps, key=lambda x: x[1], reverse=True)])

    # updateDeputesContacts()
    return json_util.dumps(mdb.deputes.find_one({'depute_shortid': 'nicolelepeih'},
                                                {'depute_contacts': 1, '_id': None}))
    # importdocs()
    # return json_util.dumps(list(mdb.logs.find({'name': 'visuelstat'})))
    mts = list(mdb.scrutins.find({'$text': {'$search': "rejet"}},
                                 {'scrutin_groupe': 1, 'scrutin_fulldesc': 1,
                                  'scrutin_sort': 1, '_id': None}))
    _mts = "\n".join([";".join([m.get('scrutin_groupe', ''), m['scrutin_sort'],
                                m['scrutin_fulldesc']]) for m in mts])
    print _mts
    return json_util.dumps(mdb.deputes.find_one({'depute_shortid': 'thierrysolere'},
                                                {'stats': 1, '_id': None}))
    return json_util.dumps([(d['depute_nom'],
                             d['stats']['positions']['exprimes'],
                             d['stats']['votesamdements']['pctpour'],
                             d['depute_shortid'])
                            for d in mdb.deputes.find({'groupe_abrev': 'REM',
                                                       'stats.positions.exprimes': {'$gt': 20}},
                                                      {'depute_nom': 1, 'depute_shortid': 1,
                                                       'stats.positions.exprimes': 1,
                                                       'stats.votesamdements.pctpour': 1})
                            .sort([('stats.votesamdements.pctpour', -1)]).limit(20)])

    from fuzzywuzzy import fuzz
    sdesc = [(s['scrutin_dossier'], s['scrutin_dossierLibelle'], s['scrutin_desc'][20:])
             for s in mdb.scrutins.find({'scrutin_dossier': {'$ne': 'N/A'}},
                                        {'scrutin_dossier': 1, 'scrutin_dossierLibelle': 1,
                                         'scrutin_desc': 1, '_id': None})]
    r = []
    for s in mdb.scrutins.find({'scrutin_dossier': 'N/A'},
                               {'scrutin_desc': 1, '_id': None, 'scrutin_id': 1}):
        for dos, doslib, d in sdesc:
            fz = fuzz.partial_ratio(s['scrutin_desc'][20:], d)
            if fz > 97:
                r.append((s['scrutin_id'], dos, doslib))
                break
    return json_util.dumps(r)
    return json_util.dumps([(d['depute_shortid'], d['depute_suppleant'], d['depute_mandat_fin'])
                            for d in mdb.deputes.find({'depute_actif': False})])
    return json_util.dumps(list(mdb.amendements.find({'sort': u"Adopt\u00e9",
                                                      "signataires_groupes": {'$elemMatch': {'$eq': 'FI'}}},
                                                     {'_id': None, 'numInit': 1, 'numAmend': 1})))
def getDossier(url):
    ops = []
    r = requests.get(url)
    texte = "NOPE"
    doc = html2text(r.content, page_width=10000).decode('iso8859-1').split(u'\n\n')
    for i, l in enumerate(doc):
        l = l.replace(u'1ère', u'première').replace(u'2e ', u'deuxième ')
        # print l
        search = False
        if l[0:21] == u'Assemblée nationale -':
            if l[22:38] == u'première lecture' or l[22:38] == u'Nouvelle lecture':
                _l = l[39:]
                j = 0
                while not (u"proposition de loi" in _l.lower()
                           or u'projet de loi' in _l.lower() or j > 5):
                    j += 1
                    _l = doc[i + j].replace(u'1ère', u'première').replace(
                        u'2e ', u'deuxième ')
                texte = _l.split(u',')[0]
            else:
                _l = l
            search = True
            lecture = u' '.join(l[22:].split(' ')[0:2])
        if l[0:21] == u'Travaux préparatoires':
            j = 0
            while not (u"proposition de résolution" in _l.lower() or j > 5):
                j += 1
                _l = doc[i + j].replace(u'1ère', u'première')
            texte = _l.split(u',')[0]
            search = True
            lecture = ""
        if l[0:26] == u"Commission Mixte Paritaire":
            j = 0
            m = None
            while not m and j <= 5:
                j += 1
                m = re.search(u"sous le n° ([0-9]+) +à l'Assemblée nationale",
                              doc[i + j])
            n = m.groups()[0] if m else None
            if n and n in docsan.keys():
                ops.append((texte, "texte de la commission mixte paritaire",
                            docsan[num], num, docsan[n], n))
                # print (texte, "", docsan[num])
        if search:
            m = re.search(u", *(TA|) +n *° *([0-9]+)[^\-]* *", _l)
            if m:
                if m.groups()[0] == 'TA':
                    num = "TA%04d" % int(m.groups()[1])
                else:
                    num = m.groups()[1]
                ops.append((texte, lecture, docsan[num], num))
                # print (texte, lecture, docsan[num])
    return ops
import sys
import json
import re
from collections import defaultdict

from stripogram import html2text


def get_words(text):
    return re.compile('\w+').findall(text)


if __name__ == "__main__":
    f_in = open(sys.argv[1])
    f_out = open(sys.argv[2], 'w')
    pages = json.load(f_in)
    json_doc = []
    df = defaultdict(int)
    for page in pages:
        try:
            text = html2text(page['html'].encode('utf8', 'ignore'))
        except UnicodeDecodeError:
            text = html2text(page['html'])
        words = get_words(text)
        n = len(words)
        tf = defaultdict(int)
        for word in words:
            tf[word] += 1
        for word in tf:
            tf[word] /= float(n)
        json_doc.append({'url': page['url'], 'features': tf})
        words_set = set(words)
        for word in words_set:
            df[word] += 1
    n = len(pages)
    for word in df:
        df[word] /= float(n)
    for row in json_doc:
        features = row['features']
        for word in features:
def summary(self):
    """Summary text: remove HTML and truncate."""
    text = html2text(self.content)
    if len(text) > 255:
        text = text[:252] + '...'
    return text
def obtemTexto(self):
    self.conteudo = self.removerEspacos()
    self.conteudo = self.removerTagsHTML()
    return html2text(self.conteudo)
def get_feature_vector(words, df):
    tf = defaultdict(int)
    for word in words:
        tf[word] += 1
    n = len(words)
    for word in tf:
        tf[word] /= float(n)
    res = defaultdict(int)
    for word in tf:
        if word in df:
            res[word] = tf[word] / float(df[word])
    return res


if __name__ == '__main__':
    url, model = sys.argv[1], sys.argv[2]
    page = urlopen(url).read()
    try:
        text = html2text(page.encode('utf8', 'ignore'))
    except UnicodeDecodeError:
        text = html2text(page)
    dataset = json.load(open(model))
    texts = dataset[:-1]
    df = dataset[-1]
    vec = get_feature_vector(get_words(text), df)
    print vec
    # similarity_list = []
    # for text in texts:
    #     similarity_list.append((cos(vec, text['features']), text['url']))
    #
    # similarity_list.sort(reverse=True)
    # for entry in similarity_list
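# The commented-out similarity block above calls a cos() helper that is not
# shown in the snippet. A minimal sketch of one plausible implementation (an
# assumption, not the original source): cosine similarity over the sparse
# word->weight dicts this script builds.
import math

def cos(a, b):
    # dot product over the words the two sparse vectors share
    dot = sum(a[w] * b[w] for w in a if w in b)
    norm_a = math.sqrt(sum(v * v for v in a.values()))
    norm_b = math.sqrt(sum(v * v for v in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0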
return False


if __name__ == '__main__':
    f = open('uk2002-spamlabels1.txt')
    f1 = open('uk2002-working.txt', "wb")
    line = f.readline()
    i = 0
    while line:
        print i
        i = i + 1
        space = line.split()
        site = 'http://' + space[0]
        ## print site
        ## print checkUrl(site)
        now = time.time()
        if checkUrl(site):
            later = time.time()
            difference = int(later - now)
            try:
                with Timeout(5):
                    file("docs/" + space[0] + ".txt", "w").write(
                        html2text(urllib2.urlopen(site).read()))
                print site + " " + str(difference)
                f1.write(site + " " + space[1] + " " + str(difference) + "\n")
            except:
                print(site + " " + "non-responsive\n")
        line = f.readline()
    f.close()
    f1.close()