def get_moodle_section(self, sectionid, chapter, activity_title=""):
    '''
    Convert one Moodle course section into an edX <sequential> under `chapter`.

    sectionid is a number; it locates sections/section_<id>/section.xml below
    self.moodle_dir. activity_title is a last-resort fallback for the display
    name. Returns the new <sequential> element, or None when the section has
    an empty summary.
    '''
    sdir = 'sections/section_%s' % sectionid
    xml = etree.parse('%s/%s/section.xml' % (self.moodle_dir, sdir)).getroot()
    name = xml.find('name').text
    contents = xml.find('summary').text
    if contents is None:
        contents = ''
    # strip MS-Word cruft Moodle sometimes leaves in summaries
    contents = contents.replace('<o:p></o:p>', '')
    # if moodle author didn't bother to set name, but instead used <h2> then grab name from that
    if not name or name=='$@NULL@$':
        m = re.search('<h2(| align="left")>(.*?)</h2>', contents)
        if m:
            name = html2text.html2text(m.group(2))
            name = name.replace('\n','').replace('\r','')
    # still unnamed: fall back to the first line of the summary as plain text
    if not name or name=='$@NULL@$':
        htext = html2text.html2text(contents)
        # print "Warning: empty name for section %s, contents=%s ..." % (sectionid, htext.split('\n')[0].strip())
        name = htext[:50].split('\n')[0].strip()
    # last resort: derive the name from the surrounding activity's title
    if not name:
        name = activity_title.strip().split('\n')[0].strip()[:50]
    name = name.strip()
    print "--> Section: %s" % name
    chapter.set('display_name', name)
    if contents:
        seq = etree.SubElement(chapter,'sequential')
        self.set_sequential_name(seq, name)
        url_name = self.make_url_name('section_%s__%s' % (sectionid, name), dupok=False)
        self.save_as_html(url_name, name, contents, seq)
        return seq
    return None
def parse_ticket(self, id):
    """
    Fetch one remote ticket by numeric id and return it as a dict.

    Field values come from the tracker's CSV export; the description and the
    comment thread come from the RSS export (converted from HTML to text).
    NOTE(review): a sibling parse_ticket in this file scrapes the HTML page
    instead of RSS — presumably these belong to different extractor classes.
    """
    # Use CSV export to get ticket fields
    url = self.full_url(self.TICKET_URL % id, 'csv')
    f = self.csvopen(url)
    reader = csv.DictReader(f)
    ticket_fields = reader.next()
    ticket_fields['class'] = 'ARTIFACT'
    ticket = self.remap_fields(ticket_fields)
    # Use RSS export to get ticket description and comments
    import html2text
    html2text.BODY_WIDTH = 0  # disable line wrapping in the converted text
    url = self.full_url(self.TICKET_URL % id, 'rss')
    self.log_url(url)
    d = feedparser.parse(urlopen(url))
    ticket['description'] = html2text.html2text(d.feed.description)
    comments = []
    for comment in d['entries']:
        c = {}
        c['submitter'] = getattr(comment, 'author', None)
        c['date'] = comment.updated_parsed
        c['comment'] = html2text.html2text(comment.summary)
        c['class'] = 'COMMENT'
        comments.append(c)
    ticket['comments'] = comments
    return ticket
def prompt_repost(request, story):
    """
    Post (or update) a story's body as a Reddit comment on its prompt thread.

    :param request: Django request.
    :param story: slug of the Post to repost.
    If the story already has a reddit_url, the existing comment is edited;
    otherwise a new comment is added to the prompt's thread and its permalink
    is saved on the story.  Renders a debug page with the resulting links.

    Fixes vs. previous revision: corrected user-facing typo
    "premalink" -> "permalink"; removed unused locals and dead code.
    """
    story = Post.objects.get(slug=story)
    prompt_url = story.parent.reddit_url
    r = praw.Reddit(user_agent='Test Script by /u/raymestalez')
    r.login(os.environ["REDDIT_USERNAME"], os.environ["REDDIT_PASSWORD"])
    if story.reddit_url:
        # Already posted once: just update the existing comment in place.
        comment = r.get_submission(story.reddit_url).comments[0]
        comment.edit(html2text(story.body))
    else:
        # First post: comment on the prompt thread and remember where.
        thread = r.get_submission(url=prompt_url)
        comment = thread.add_comment(html2text(story.body))
        story.reddit_url = comment.permalink
        story.save()
    teststring = ("<br/>url: " + prompt_url +
                  "<br/>Reddit permalink: " + comment.permalink)
    return render(request, 'posts/test.html', {
        'teststring': teststring,
    })
def run_thread(self, start, lim):
    """
    Worker body: scan a slice of movies_questions and accumulate trending
    topic counts into movies_trending_topics.

    :param start: number of questions to skip (slice offset for this worker).
    :param lim: maximum number of questions this worker processes.
    """
    client = MongoClient('localhost', 27017)
    my_db = client['Grupo07']
    questions= my_db.movies_questions.find().sort("question_id", 1).skip(start).limit(lim)
    i=0
    for q in questions:
        i= i+1
        print str(start) +" : " +str(i)
        # collect topic words from the title, the body and every answer body
        q_topics= []
        q_topics = q_topics + self.extract_topics_list(q['title'])
        q_topics = q_topics + self.extract_topics_list(html2text.html2text(q['body']))
        if "answers" in q.keys():
            for a in q["answers"]:
                q_topics = q_topics + self.extract_topics_list(html2text.html2text(a['body']))
        # local word -> occurrence count for this question
        topics_dict={}
        for w in q_topics:
            if w in topics_dict:
                topics_dict[w]=topics_dict[w] + 1
            else:
                topics_dict[w]=1
        # merge local counts into the shared collection (upsert by word)
        # NOTE(review): collection.update/insert are legacy pymongo APIs;
        # this read-then-write is also racy across workers — confirm intended.
        for k in topics_dict.keys():
            topicObj = my_db.movies_trending_topics.find_one({"word": k})
            if topicObj!=None:
                topicObj['count']=topicObj['count']+topics_dict[k];
                my_db.movies_trending_topics.update({'_id': topicObj['_id']}, {"$set": topicObj}, upsert=False)
            else:
                topicObj= {"word": k, "count": topics_dict[k]}
                my_db.movies_trending_topics.insert(topicObj)
def readFileAtPath(self, posix_path):
    """
    Reads a file at a given path. Looks for utf-8/latin-1 encoding.
    Converts HTML Markup to Text. Class counts failed attempts to read
    in self.fails.

    @parameters
        posix_path  pathlib.Path  the filepath to read
    @returns
        string  html-free content of filepath
        bool    False if encoding unknown or file not found

    Fix vs. previous revision: the bare `except:` clauses also swallowed
    SystemExit/KeyboardInterrupt; they now catch only I/O and decode errors.
    """
    print("parsing: " + posix_path.name)
    try:
        with posix_path.open(encoding="utf-8") as f:  # general encoding
            return html2text(f.read())
    except UnicodeDecodeError:
        try:
            with posix_path.open(encoding="latin-1") as f:  # german language encoding
                return html2text(f.read())
        except (OSError, UnicodeDecodeError):
            self.fails.append(posix_path.name)
            return False
    except OSError:  # file missing/unreadable
        self.fails.append(posix_path.name)
        return False
def handle(self, *args, **options):
    """
    Management-command entry point: import every portfolio spreadsheet row
    as a Work, creating the Designer on demand and attaching subjects,
    collections and keyword tags.

    Row keys mix Hebrew and English column headers from the source sheet.
    NOTE(review): the image file handle passed to File(open(...)) is never
    explicitly closed — consider a context manager.
    """
    for row in all_portfolio_rows():
        w = Work.objects.create(
            name_he=row.get(u'שם העבודה', ''),
            name_en=row.get('Document Title', ''),
            description_he=html2text(row.get(u'תאור', '')),
            description_en=html2text(row.get(u'Description', '')),
            discipline=match_discipline(row),
            country=match_country(row.get(u'ארץ', '')),
            designer=Designer.objects.get_or_create(
                name_he=row.get(u'מעצב', ''),
                defaults={'name_en': row.get('Designer', '')})[0],
            category=match_category(row.get(u'קטגוריה')),
            size_as_text=row.get(u'גודל', ''),
            publish_date_as_text=row.get(u'תאריך', ''),
            # only store a numeric year when the date cell is purely digits
            publish_year=int(row.get(u'תאריך', ''))if row.get(
                u'תאריך', '').isdigit() else None,
            client=row.get(u'לקוח', ''),
            technique=match_technique(row.get(u'טכניקה', '')),
            is_self_collected=match_is_self_collected(
                row.get(u'מעצב', ''), row.get(u'מאוסף', '')),
            raw_image=File(open(os.path.join(
                settings.PORFOLIO_IMAGE_DIR, row['Filename'])))
        )
        w.subjects = match_subject(row.get(u'נושא'))
        w.of_collections = match_collector(row.get(u'מעצב', ''),
                                           row.get(u'מאוסף', ''))
        # keywords arrive as one comma-separated HTML cell
        for keyword in [keyword.strip() for keyword in
                        html2text(row.get(u'מילות מפתח', '')).split(',')]:
            if keyword:
                w.tags.add(keyword)
def _set_text_and_samples(tree: lxml.html.HtmlElement, problem: Problem) -> None:
    """
    Populate problem.input/output/sample_inputs/sample_outputs/text from a
    scraped problem page (English or Russian section headings).

    Recognized section divs are removed from the tree as they are consumed,
    so the final problem.text contains only the remaining statement body.
    """
    text = tree.get_element_by_id('problem_text')
    # drop the source/credits footer before extracting anything
    source = text.find_class('problem_source')[0]
    source.getparent().remove(source)
    input_next = False
    output_next = False
    for div in text.iterchildren():
        # a heading div flags that the NEXT div holds the section's body
        if div.text in ['Input', 'Исходные данные']:
            input_next = True
        elif div.text in ['Output', 'Результат']:
            output_next = True
        elif input_next:
            input_next = False
            problem.input = html2text(lxml.html.tostring(div).decode('utf-8')).strip()
        elif output_next:
            output_next = False
            problem.output = html2text(lxml.html.tostring(div).decode('utf-8')).strip()
        else:
            continue  # unrelated div: keep it in the statement text
        # heading or consumed body div: remove from the remaining statement
        div.getparent().remove(div)
    samples = text.find_class('sample')
    if len(samples) == 1:
        sample = samples[0]
        sample_h3 = text.find_class('problem_subtitle')
        sample_texts = [u'Sample', u'Пример', u'Samples', u'Примеры']
        sample_h3 = next(x for x in sample_h3 if x.text in sample_texts)
        sample.getparent().remove(sample)
        sample_h3.getparent().remove(sample_h3)
        # intable cells alternate input/output columns
        intables = sample.find_class('intable')
        problem.sample_inputs = [x.text.rstrip() for x in intables[0::2]]
        problem.sample_outputs = [x.text.rstrip() for x in intables[1::2]]
    problem.text = html2text(lxml.html.tostring(text).decode('utf-8')).strip()
def build_whoosh_database():
    """
    Rebuild the Whoosh full-text index from scratch: one document per MApp
    info record and one per MPost post, with HTML unescaped and stripped
    to plain text for the content field.

    Fix vs. previous revision: removed a stray debug `print(text2)` that
    dumped every post's full text to stdout during indexing.
    """
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    type=TEXT(stored=True),
                    link=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(whoosh_database, schema)
    writer = ix.writer()

    # index all "info" records
    uu = MApp()
    for rec in uu.get_all():
        text2 = html2text.html2text(tornado.escape.xhtml_unescape(rec.cnt_html))
        writer.add_document(
            title=rec.title,
            type='<span style="color:red;">[信息]</span>',
            link='/info/{0}'.format(rec.uid),
            content=text2,
        )

    # index all posts
    mpost = MPost()
    for rec in mpost.query_all():
        text2 = html2text.html2text(tornado.escape.xhtml_unescape(rec.cnt_html))
        writer.add_document(
            title=rec.title,
            type='<span style="color:blue;">[文档]</span>',
            link='/post/{0}.html'.format(rec.uid),
            content=text2,
        )
    writer.commit()
def get_plain_text(self):
    """Return a PlainText with each HTML field converted to plain text
    (markup stripped, trailing whitespace removed)."""
    def _to_text(markup):
        # shared conversion used for all four fields
        return html2text(smart_str(markup)).rstrip()

    return PlainText(
        action=_to_text(self.action),
        setup=_to_text(self.setup),
        effect=_to_text(self.effect),
        breakdown=_to_text(self.breakdown),
    )
def write_page(url, number):
    """write_page
    Write the text of an html article page to a file.
    The file name is composed of the date and a number.

    :param url: The url where the page can be found
    :param number: A number to make files unique

    Fixes vs. previous revision: the output file is now opened with `with`
    (previously the handle leaked if any write raised), and the four
    copy-pasted cleanup pipelines are factored into one helper.
    """
    html = get_html_page(url)
    date = find_date_page(html)
    if not date:
        return  # undated pages are skipped entirely (original behavior)

    def _clean(fragment):
        # html -> text, then flatten escaped and literal newlines to spaces
        cleaned = html2text.html2text(fragment).replace("\\n", " ")
        return cleaned.replace("\\r", "").replace("\n", " ")

    cat, title, snippet, text = find_text_page(html)
    print("Writing file number ", str(number).zfill(3), url)
    with open("20minutes/" + date + "-" + str(number).zfill(3), "w",
              encoding="utf-16") as fstream:
        for tag, value in (("category", _clean(cat)),
                           ("title", _clean(title)),
                           ("snippet", _clean(snippet))):
            fstream.write("<%s>\n" % tag)
            fstream.write(value)
            fstream.write("\n<\\%s>\n" % tag)
        fstream.write("<article>\n")
        fstream.write(_clean(text))
        fstream.write("\n<\\article>")
def scrape_comments(issue):
    # This is a hack since the current BitBucket api does not support pulling comments.
    # Scrapes the issue's public HTML page and returns a list of dicts:
    # {user, created_at, body (utf-8 bytes), number}.
    url = "https://bitbucket.org/%s/%s/issue/%s" % (
        options.bitbucket_username,
        options.bitbucket_repo,
        issue["local_id"],
    )
    content = urllib2.urlopen(url).read()
    bs = BeautifulSoup(content)
    comments = []
    for comment in bs.findAll("li", {"class": " comment-content"}):
        body = comment.find("div", {"class": "issues-comment edit-comment"})
        if body:
            body = html2text(unicode(body))
        else:
            # This is not a comment it is a issue change
            body = html2text(unicode(comment.find("ul", {"class": "issue-changes"})))
        body = clean_body(body)
        # default placeholder when the author anchor is missing
        user = "******"
        try:
            user = comment.findAll("a")[1].getText()
        except IndexError:
            pass
        created_at = comment.find("time").get("datetime")
        number = int(comment.find("a", {"class": "issues-comments-permalink"}).getText().replace("#", ""))
        comments.append({"user": user, "created_at": created_at, "body": body.encode("utf-8"), "number": number})
    return comments
def workitem_details(client, workitemid):
    """
    Print a human-readable report for one RTC work item: header fields,
    parent/children/related links, description and the comment thread.
    Output is colorized via the `cl` helper.
    """
    # select only the OSLC properties we render below
    wi = Workitem.getOne(client, workitemid, '?oslc_cm.properties=dc:identifier,\
dc:type{dc:title},dc:title,rdf:resource,dc:creator{dc:title},\
rtc_cm:ownedBy{dc:title},dc:description,rtc_cm:state{dc:title},\
rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.parent{dc:identifier,dc:title},\
rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.children,\
rtc_cm:com.ibm.team.workitem.linktype.relatedworkitem.related')
    print
    print "=================================================================="
    print "Workitem ID : " +cl.str(str(wi.js['dc:identifier']), cl.fg.green)+' ('+wi.js['dc:type']['dc:title']+')'
    print "Title       : " +cl.str(wi.js['dc:title'], cl.fg.red)
    print "URL         : " +wi.js['rdf:resource']
    print "State       : " +wi.stateColorize(wi.js['rtc_cm:state']['dc:title'])
    print "Creator     : " +wi.js['dc:creator']['dc:title']
    print "Owner       : " +wi.js['rtc_cm:ownedBy']['dc:title']
    if len(wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.parent']) != 0:
        par = wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.parent'][0]
        print "Parent      : " + str(par['dc:identifier'])+" ("+par['dc:title']+")"
    # child/related labels look like "1234: Title"; reformat to "1234 (Title)"
    if len(wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.children']) != 0:
        print "Child(ren)  : " + reduce((lambda a, b: a +", "+ b), map((lambda a: re.sub(r'([^:]+): (.*)', r'\1 (\2)', a['oslc_cm:label'])), wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.children']))
    if len(wi.js['rtc_cm:com.ibm.team.workitem.linktype.relatedworkitem.related']) != 0:
        print "Related     : " + reduce((lambda a, b: a +", "+ b), map((lambda a: re.sub(r'([^:]+): (.*)', r'\1 (\2)', a['oslc_cm:label'])), wi.js['rtc_cm:com.ibm.team.workitem.linktype.relatedworkitem.related']))
    print "Description :"
    print html2text.html2text(wi.js['dc:description'])
    comments = wi.get_comments()
    if len(comments) == 0:
        return
    print "Comments    :"
    i = 0
    for c in comments:
        print str(i) + ": " +cl.str(c['dc:creator']['dc:title'], cl.fg.green)+" ("+c['dc:created'] + ") :"
        print html2text.html2text(c['dc:description'])
        i = i + 1
def getbody(html):
    """
    Extract the article body from raw HTML bytes using line-based heuristics
    on the markdown produced by html2text. Tries utf8 then gbk decoding.
    Returns the kept lines joined with newlines.

    NOTE(review): the per-line rules below are heuristics tuned for Chinese
    news pages (chr(127) comparisons separate ASCII from multibyte text in
    Python 2 byte strings) — behavior depends on exact statement order.
    """
    bodytxt=[]
    # decode: utf8 first, fall back to gbk
    try:
        txt = html2text(html.decode('utf8'))
    except:
        txt = html2text(html.decode('gbk'))
    # re-encode and split into lines, again preferring utf8
    try:
        txt = txt.encode('utf8').split('\n')
    except:
        txt = txt.encode('gbk').split('\n')
    for r in txt:
        r = r.strip()
        if r == '':continue
        if r in bodytxt:pass #skip repeated lines
        elif r[0] == '[':
            # markdown link line: stop if we already have enough body text
            if len(bodytxt) >= 5:break
            else:pass
        elif ']' in r[-4:] and len(bodytxt) < 5:pass #skip writer's name
        elif r[4:] == '****':break #body end marker
        elif r[5:]<chr(127):pass #skip ASCII-only address lines
        elif r[0].isdigit() and r[2] > chr(127):bodytxt.append(r) #digit + Chinese text
        elif r[0] == '#' and r[1] != '#': # single-# title
            try:
                # a second top-level title restarts the body
                if bodytxt[-1][0] == '#':bodytxt=[]
                else:bodytxt.append(r)
            except:
                bodytxt.append(r)
        elif r[:2] == '**':bodytxt.append(r) # bold branch title
        elif r[0] > chr(127):bodytxt.append(r) #line starts with Chinese: keep
        elif r[0] == '#' and len(bodytxt) >= 5:break
    return '\n'.join(bodytxt)
def all_answers(self):
    """
    Print every scraped answer (up to self.num) as plain text.

    NOTE(review): the bare `self.answer_num` below has no visible effect
    here — presumably a property whose getter populates self.num/soup;
    confirm before removing.
    """
    self.answer_num
    find = self.soup.find_all(class_=' zm-editable-content clearfix', limit=self.num)
    for index, answer in enumerate(find):
        # header "第N个答案" = "answer #N"
        print '第%d个答案:\n' % (index+1)
        print html2text.html2text(str(answer))
def parse_ticket(self, id):
    """
    Fetch one Trac ticket by numeric id and return it as a dict.

    Fields come from the CSV export; the description and comment thread are
    scraped from the ticket's HTML page (the Trac comment feed embeds both
    field changes and comment bodies in the page markup).
    """
    # Use CSV export to get ticket fields
    url = self.full_url(self.TICKET_URL % id, 'csv')
    f = self.csvopen(url)
    reader = csv.DictReader(f)
    ticket_fields = reader.next()
    ticket_fields['class'] = 'ARTIFACT'
    ticket = self.remap_fields(ticket_fields)
    # Use HTML export to get ticket description and comments
    import html2text
    html2text.BODY_WIDTH = 0  # disable line wrapping in converted text
    url = self.full_url(self.TICKET_URL % id)
    self.log_url(url)
    d = BeautifulSoup(urlopen(url))
    self.clean_missing_wiki_links(d)
    desc = d.find('div', 'description').find('div', 'searchable')
    ticket['description'] = html2text.html2text(
        desc.renderContents('utf8').decode('utf8')) if desc else ''
    comments = []
    # each comment is rendered as a <form action="#comment"> block
    for comment in d.findAll('form', action='#comment'):
        c = {}
        c['submitter'] = re.sub(
            r'.* by ', '', comment.find('h3', 'change').text).strip()
        c['date'] = self.trac2z_date(
            comment.find('a', 'timeline')['title'].replace(' in Timeline', ''))
        # a comment may contain field changes, a text body, or both
        changes = unicode(comment.find('ul', 'changes') or '')
        body = comment.find('div', 'comment')
        body = body.renderContents('utf8').decode('utf8') if body else ''
        c['comment'] = html2text.html2text(changes + body)
        c['class'] = 'COMMENT'
        comments.append(c)
    ticket['comments'] = comments
    return ticket
def process_item(self,item,spider):
    """
    Scrapy pipeline step: parse the raw Q&A page HTML in item["raw"] into
    title, question content, best answer and the list of all answers.
    Drops the item when no title can be found.
    """
    if self.notThisPipeline(spider):
        return item
    hxs = HtmlXPathSelector(text=item["raw"])
    title=hxs.select("//*[contains(@class, 'ask-title')]/text()")
    if len(title):
        item['title']=title.extract()[0]
    else:
        # a page without a title is not a valid question page
        raise DropItem()
    content=hxs.select("//*[contains(@class, 'q-content')]")
    if len(content):
        item['content']=html2text.html2text(content[0].extract())
    else:
        item['content']=''
    best_answer=hxs.select("//*[contains(@class, 'best-text')]")
    if len(best_answer):
        item['best_answer']=html2text.html2text(best_answer[0].extract())
    else:
        item['best_answer']=""
    anss=hxs.select("//*[contains(@class, 'answer-text')]")
    ext_ans=[]
    for ans in anss:
        ext_ans.append(html2text.html2text(ans.extract()))
    item['answers']=ext_ans
    return item
def fp_import(request):
    """
    Import a FictionPress story (URL taken from Util pk=1) into local Posts:
    one parent "story" Post plus one "chapter" Post per chapter when the
    source is multi-chapter. Re-running updates existing posts found by slug.
    Renders a debug page listing what was imported.
    """
    author = request.user
    url = Util.objects.get(pk=1).ffnet_url
    munger = Munger(url, FPAdapter())
    imported_story = munger.DownloadStory()
    imported_story_title = str(imported_story.title)
    try:
        story = Post.objects.get(slug=slugify(imported_story_title))
    except:
        # NOTE(review): bare except presumably targets Post.DoesNotExist
        story = Post()
        story.title = imported_story_title
        story.author = author
        story.post_type = "story"
        story.imported = True
        story.rational = True
        story.published = True
    # multi-chapter stories get a placeholder body; single-chapter stories
    # get the chapter text directly
    if imported_story.chapters[0].title:
        story.body = " "
    else:
        contents = imported_story.chapters[0].contents
        contents = html2text(str(contents))
        story.body = contents
    story.save()
    teststring = "Imported: " + story.title + "<br/>"
    if imported_story.chapters[0].title:
        for index, imported_chapter in enumerate(imported_story.chapters):
            # chapter titles arrive as "N. Title" — keep the part after the dot
            title = imported_chapter.title.split(".",1)[1].strip()
            # title = story.title + "| Chapter " + str(story.children.count()+1)
            contents = imported_chapter.contents
            contents = html2text(str(contents))
            try:
                chapter = Post.objects.get(slug=slugify(title))
            except:
                chapter = Post()
            chapter.title = title
            chapter.body = contents
            chapter.number = index + 1
            chapter.author = author
            chapter.post_type = "chapter"
            chapter.imported = True
            chapter.rational = True
            chapter.parent = story
            chapter.save()
            teststring += "Imported: " + chapter.title + "<br/>"
    return render(request, 'posts/import.html', {
        'teststring': teststring,
    })
def forwards(self, orm): "Write your forwards methods here." # Note: Remember to use orm['appname.ModelName'] rather than "from appname.models..." for row in all_portfolio_rows(): work = orm.Work.objects.get(sidar_id=remove_file_extension(row['Filename'])) work.description_he = html2text(row[u'תאור']).strip() work.description_en = html2text(row[u'Description']).strip() work.save()
def decorated(*args, **kwargs):
    """
    Wrapper: call `func` (closure from the enclosing decorator) and print a
    one-line status summary, falling back to printing the raw response body.

    NOTE(review): kwargs are consumed only for the 'label' text and are NOT
    forwarded to func — confirm that is intentional.
    """
    response = func(*args)
    try:
        # response is assumed to be (dict-with-status, body); `statuses`
        # maps status codes to display strings (closure) — TODO confirm
        print '%s %s: %s' % (kwargs.get('label', 'Node'), statuses[response[0]['status']], args[0])
    except:
        try:
            # structured summary failed: show the body as condensed text
            print html2text(response[1]).replace('\n\n', '\n')
        except:
            print response[1]
def notify(self, subject, message):
    '''
    Output the subject and message, after converting the message html
    to markdown text.
    '''
    print " == " + subject + " == \n"
    print html2text.html2text(message)
def get_weather():
    """
    Fetch today's maximum and current temperature from the Stellenbosch
    University live weather API and return them as (max_temp, cur_temp)
    plain-text strings (the endpoint returns HTML fragments).
    """
    import html2text
    import urllib2
    max_temp = html2text.html2text(urllib2.urlopen('http://weather.sun.ac.za/api/getlivedata.php?maxtemp').read())
    cur_temp = html2text.html2text(urllib2.urlopen('http://weather.sun.ac.za/api/getlivedata.php?temperature').read())
    return max_temp, cur_temp
def copy_data(target, data):
    """
    Copy SEO fields plus the EN and FR content of a legacy page object onto
    `target`, converting stored HTML to markdown-ish text.

    NOTE(review): assigning target.content under an activated language
    presumably writes the language-specific translation field
    (django-modeltranslation style) — confirm. A duplicate of this helper
    exists nested inside import_page in this file.
    """
    copy_seo_data(target, data)
    translation.activate('en')
    target.content = html2text.html2text(data.text_en)
    translation.deactivate()
    translation.activate('fr')
    target.content = html2text.html2text(data.text_fr)
    translation.deactivate()
def import_page(name, path):
    """
    Import one legacy page pickled at <path>/<name>/data into the new models:
    the HomePage singleton (plus its push banner) for HomePage data,
    otherwise a regular Page with optional background image.

    SECURITY NOTE: pickle.load executes arbitrary code from the file —
    only run this on trusted export data.
    """
    with open(os.path.join(path, name, 'data'), 'rb') as f:
        data = pickle.load(f)

    def copy_data(target, data):
        # copy SEO fields and EN/FR content (HTML -> text) onto target
        copy_seo_data(target, data)
        translation.activate('en')
        target.content = html2text.html2text(data.text_en)
        translation.deactivate()
        translation.activate('fr')
        target.content = html2text.html2text(data.text_fr)
        translation.deactivate()

    if isinstance(data, models.HomePage):
        hp = HomePage.get_solo()
        copy_data(hp, data)
        # the home page carries an extra "push" banner in both languages
        push = HomePagePush()
        push.home_page = hp
        translation.activate('en')
        push.title = data.push_title_en
        push.content = html2text.html2text(data.push_content_en)
        translation.deactivate()
        translation.activate('fr')
        push.title = data.push_title_fr
        push.content = html2text.html2text(data.push_content_fr)
        translation.deactivate()
        push.save()
        hp.save()
    else:
        p = Page()
        p.slug = slugify(data.title_en.lower())
        copy_data(p, data)
        translation.activate('en')
        p.title = data.title_en
        translation.deactivate()
        translation.activate('fr')
        p.title = data.title_fr
        translation.deactivate()
        if hasattr(data, 'background'):
            img = make_master_image(
                path,
                data.background,
                '{}-background'.format(data.title_en)
            )
            p.background = img
        p.save()
def parse_account_statement(self,html):
    """
    Parse an account-statement HTML page into a list of dicts, one per
    'tableRd' table, mapping the first cell of each row (label) to the
    second (value), both stripped of newlines and colons.

    NOTE(review): near-duplicate of parse_accounts (different table class) —
    candidates for a shared helper.
    """
    ret = []
    soup = BeautifulSoup(html)
    for table in soup.findAll('table',{'class':'tableRd'}):
        rec = {}
        for row in table.findAll('tr'):
            cols = row.findAll('td')
            # label cell -> value cell, cleaned of markup and "\n:" noise
            rec.update( { html2text.html2text(cols[0].find(text=True)).strip('\n:') : html2text.html2text(cols[1].find(text=True)).strip('\n:') })
        ret.append(rec)
    return ret
def process(self, item):
    """
    Harvest one OpenDataSoft dataset referenced by `item` into a udata
    Dataset: title, description, tags, license, resources and ODS extras.
    Raises HarvestSkipException for datasets without records.

    Fix vs. previous revision: the description was run through
    html2text.html2text() TWICE, so markdown characters produced by the
    first pass got escaped by the second; it is now converted once.
    """
    ods_dataset = item.kwargs["dataset"]
    dataset_id = ods_dataset["datasetid"]
    ods_metadata = ods_dataset["metas"]
    if not ods_dataset.get('has_records'):
        msg = 'Dataset {datasetid} has no record'.format(**ods_dataset)
        raise HarvestSkipException(msg)
    dataset = self.get_dataset(item.remote_id)
    dataset.title = ods_metadata['title']
    dataset.frequency = "unknown"
    # convert the HTML description to text exactly once
    description = ods_metadata.get("description", '').strip()
    dataset.description = html2text.html2text(
        description.strip('\n').strip()).strip('\n')
    dataset.private = False

    # tags: 'keyword' may be a string or a list; 'theme' entries are
    # comma-separated and lowercased
    tags = set()
    if "keyword" in ods_metadata:
        if isinstance(ods_metadata['keyword'], list):
            tags |= set(ods_metadata['keyword'])
        else:
            tags.add(ods_metadata['keyword'])
    if "theme" in ods_metadata:
        if isinstance(ods_metadata["theme"], list):
            for theme in ods_metadata["theme"]:
                tags.update([t.strip().lower() for t in theme.split(",")])
        else:
            themes = ods_metadata["theme"].split(",")
            tags.update([t.strip().lower() for t in themes])
    dataset.tags = list(tags)

    # map the ODS license id onto a local License when known
    ods_license_id = ods_metadata.get('license')
    if ods_license_id and ods_license_id in self.LICENSES:
        license_id = self.LICENSES[ods_license_id]
        dataset.license = License.objects.get(id=license_id)

    dataset.resources = []
    self.process_resources(dataset, ods_dataset, ('csv', 'json'))
    if 'geo' in ods_dataset['features']:
        self.process_resources(dataset, ods_dataset, ('geojson', 'shp'))
    dataset.extras["ods:url"] = self._get_explore_url(dataset_id)
    if "references" in ods_metadata:
        dataset.extras["ods:references"] = ods_metadata["references"]
    dataset.extras["ods:has_records"] = ods_dataset["has_records"]
    return dataset
def parse_accounts(self,html):
    """
    Parse the accounts-overview HTML page into a list of dicts, one per
    'tabdtl' table, mapping each row's label cell to its value cell,
    both stripped of newlines and colons.

    NOTE(review): near-duplicate of parse_account_statement (different
    table class) — candidates for a shared helper.
    """
    soup = BeautifulSoup(html)
    tables = soup.findAll('table',{'class':'tabdtl'})
    ret=[]
    for table in tables:
        rec = {}
        rows = table.findAll('tr')
        for row in rows:
            cols = row.findAll('td')
            # label cell -> value cell, cleaned of markup and "\n:" noise
            rec.update( { html2text.html2text(cols[0].find(text=True)).strip('\n:') : html2text.html2text(cols[1].find(text=True)).strip('\n:') })
        ret.append(rec)
    return ret
def form_valid(self, form):
    """
    Persist the submitted dynamic form, then send up to two e-mails:
    an optional confirmation rendered from the form's own templates, and
    an optional notification to the form's managers. Both are sent as
    HTML with an html2text-derived plain-text alternative.
    """
    # save the result
    data = DynamicFormData.objects.create(
        dynamicform = self.dynamicform,
        raw_post_data = self.request.raw_post_data,
        headers = '\n'.join(
            '%s: %s' % (h, self.request.META[h])
            for h in HTTP_HEADERS if h in self.request.META
        )
    )
    # create confirmation e-mail
    if self.dynamicform.send_confirmation:
        # recipients/subject/content are themselves Django templates,
        # rendered against the cleaned form data
        recipients_template = Template(self.dynamicform.email_recipients)
        subject_template = Template(self.dynamicform.email_subject)
        content_template = Template(self.dynamicform.email_content)
        context = Context(form.cleaned_data)
        recipients = recipients_template.render(context)
        subject = subject_template.render(context)
        content = content_template.render(context)
        msg = EmailMultiAlternatives(
            force_unicode(subject),
            html2text(content),  # plain-text body
            settings.DEFAULT_FROM_EMAIL,
            [address for name, address in rfc822.AddressList(recipients).addresslist],
        )
        msg.attach_alternative(content, "text/html")
        msg.send()
    # create e-mail for dynamicform manager
    if self.dynamicform.notification_emails:
        recipients = self.dynamicform.notification_emails.split(u',')
        # NOTE(review): rfc822.AddressList below is fed this *list*, not a
        # string as in the branch above — confirm this parses as intended.
        subject = _(u'Someone filled in your online form "%s"') % self.dynamicform.name
        context = RequestContext(
            self.request,
            {
                'form': form,
                'dynamicform': self.dynamicform,
                'dynamicformdata': data,
                'site': Site.objects.get_current(),
            },
        )
        content = render_to_string(self.dynamicform.email_template, context_instance=context)
        msg = EmailMultiAlternatives(
            force_unicode(subject),
            html2text(content),
            settings.SERVER_EMAIL,
            [address for name, address in rfc822.AddressList(recipients).addresslist],
        )
        msg.attach_alternative(content, "text/html")
        msg.send()
    return super(ProcessDynamicFormView, self).form_valid(form)
def _getPostInfo(self, value):
    """
    Build a post-info dict from a scraped (url, raw_title, img[, descr])
    tuple for the XBMC listing/info dialog. Returns the dict on success;
    on any error logs via self._exception and returns (dict(), 500).

    NOTE(review): the success and failure return shapes differ (dict vs
    tuple) — callers must handle both.
    """
    if self.__dbg__:
        print self.__plugin__ + " _getPostInfo: " + value[1]
    try:
        post = {}
        post['url'] = value[0]
        post['raw_title'] = value[1]
        post['img'] = value[2]
        #raw_descr = value[3]
        raw_descr = ""
        """
        # try to extract info from descr
        release_name = re.compile("release name:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
        for tmp in release_name:
            post['release_name'] = tmp
            plot += "Release name: "+ tmp + "\n"
        tmps = re.compile("audio quality:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
        for tmp in tmps:
            post['audio_quality'] = tmp
            plot += tmp + "\n"
        tmps = re.compile("video quality:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
        for tmp in tmps:
            post['video_quality'] = tmp
            plot += tmp + "\n"
        tmps = re.compile("size:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
        for tmp in tmps:
            post['size_str'] = tmp
            plot += tmp + "\n"
        """
        # fill video specific fields, used by info dialog
        post['Title'] = html2text.html2text(post['raw_title'].decode('utf-8'))
        # use multiple lines for multiple titles
        post['Title'] = post['Title'].replace(" & "," &\n")
        #for now, just show raw_descr
        post['Plot'] = html2text.html2text(raw_descr.decode('utf-8'))
        # per default, there is always a next page (if not, will be set in parent function)
        post['next'] = "true"
        if self.__dbg__:
            print self.__plugin__ + " _getPostInfo done"
        return post;
    except:
        self._exception("_getPostInfo")
        return ( dict(), 500 )
def _get_entry_title(self, entry): if hasattr(entry, 'title_detail') and entry.title_detail: title = entry.title_detail.value if 'html' in entry.title_detail.type: title = _html2text.html2text(title) else: content = self._get_entry_content(entry) value = content['value'] if content['type'] in ('text/html', 'application/xhtml+xml'): value = _html2text.html2text(value) title = value[:70] title = title.replace('\n', ' ').strip() return title
def selectValidCVEContent(self, soup):
    """
    Return the text block containing "CVE ID:" from an advisory page.

    Prefers the first <li> whose text mentions "CVE ID:"; otherwise scans
    the whole page text and returns it from the marker onward, or '' when
    no CVE ID is present at all.
    """
    lis = soup.find_all('li')
    for li in lis:
        validCVEContent = html2text.html2text(li.get_text())
        if (validCVEContent.find("CVE ID:") != -1):
            return validCVEContent
    # deal with the no <li> tag for CVE ID case
    vulTxtContent = html2text.html2text(soup.get_text())
    startPos = vulTxtContent.find("CVE ID:")
    if (startPos != -1):
        return vulTxtContent[startPos:]
    else:
        return ''
def show(self):  # pylint: disable=no-self-use
    """Fetch the remote "show" terms (blocking on the async call) and
    return them converted from HTML to plain text."""
    terms = sync_wait(self._call("show"))
    return html2text.html2text(terms)
def get_email_message(self, template_name, context, from_email=None, to=None,
                      cc=None, bcc=None, headers=None, template_prefix=None,
                      template_suffix=None, template_dir=None,
                      file_extension=None, attachments=None,
                      create_link=False):
    """
    Render an e-mail from the named template(s) and return a Django
    EmailMessage / EmailMultiAlternatives ready to send.

    The template may provide 'plain', 'html' and 'subject' parts; a missing
    plain part is auto-generated from the html part when
    TEMPLATED_EMAIL_AUTO_PLAIN allows it. When create_link is True, a static
    copy of the html part (with inline images hosted) is stored as a
    SavedEmail addressable by a fresh UUID exposed to the template as
    context['email_uuid'].
    """
    if create_link:
        email_uuid = uuid.uuid4()
        link_context = dict(context)
        context['email_uuid'] = email_uuid.hex
        # the stored copy must reference hosted images, not inline ones
        for key, value in context.items():
            if isinstance(value, InlineImage):
                link_context[key] = self.host_inline_image(value)
    EmailMessage = get_emailmessage_klass()
    EmailMultiAlternatives = get_emailmultialternatives_klass()
    parts = self._render_email(template_name, context,
                               template_prefix or template_dir,
                               template_suffix or file_extension)
    plain_part = 'plain' in parts
    html_part = 'html' in parts
    if create_link and html_part:
        static_html_part = self._render_email(
            template_name, link_context,
            template_prefix or template_dir,
            template_suffix or file_extension)['html']
        from templated_email.models import SavedEmail
        SavedEmail.objects.create(content=static_html_part,
                                  uuid=email_uuid)
    if 'subject' in parts:
        subject = parts['subject']
    else:
        # fall back to the settings-level subject map, then a generic text
        subject_dict = getattr(settings, 'TEMPLATED_EMAIL_DJANGO_SUBJECTS', {})
        if isinstance(template_name, (list, tuple)):
            for template in template_name:
                if template in subject_dict:
                    subject_template = subject_dict[template]
                    break
            else:
                subject_template = _('%s email subject' % template_name[0])
        else:
            subject_template = subject_dict.get(
                template_name,
                _('%s email subject' % template_name))
        subject = subject_template % context
    subject = subject.strip('\n\r')  # strip newlines from subject
    if html_part and not plain_part and html2text and \
            getattr(settings, 'TEMPLATED_EMAIL_AUTO_PLAIN', True):
        parts['plain'] = html2text.html2text(parts['html'])
        plain_part = True
    # choose the message class by which parts are present
    if plain_part and not html_part:
        e = EmailMessage(
            subject,
            parts['plain'],
            from_email,
            to,
            cc=cc,
            bcc=bcc,
            headers=headers,
            attachments=attachments,
        )
    if html_part and not plain_part:
        e = EmailMessage(
            subject,
            parts['html'],
            from_email,
            to,
            cc=cc,
            bcc=bcc,
            headers=headers,
            attachments=attachments,
        )
        e.content_subtype = 'html'
    if plain_part and html_part:
        e = EmailMultiAlternatives(
            subject,
            parts['plain'],
            from_email,
            to,
            cc=cc,
            bcc=bcc,
            headers=headers,
            attachments=attachments,
        )
        e.attach_alternative(parts['html'], 'text/html')
    self.attach_inline_images(e, context)
    return e
def _get_signature(self):
    """Return the current user's e-mail signature converted from its
    stored HTML form to plain text."""
    user = self.env.user
    signature = html2text.html2text(user.signature)
    return signature
def parse_author(response):
    """
    Extract the article author/source line from a scraped page, trying four
    increasingly loose selectors in order; the last resort converts the
    whole "post-source" paragraph from HTML to text.
    """
    return response.xpath('//div[@class="left_news"]/div[@class="page_town_row"][last()]/text()').get() or \
        response.xpath('//div[@class="left_news"]/div[@class="page_town_row"]/p/strong/em/text()').get() or \
        response.xpath('//div[@class="left_news"]/div[last()]/p[last()]/text()').get() or html2text(response.xpath(
            '//div[@class="left_news"]//p[@class="post-source"]').get())
# Extract a single-line plain-text body plus header fields from an email
# message object `msg` (expected to be an email.message.EmailMessage —
# get_body() requires the modern API; TODO confirm at the call site).
text2 = ""
try:
    try:
        text = msg.get_body(
            preferencelist=('plain')).get_content()  # plain text part
        #print("metoda 1")
    except:
        # no plain-text body part: fall back to raw payloads and strip HTML
        if msg.is_multipart():
            for payload in msg.get_payload():
                #print("metoda 2a")
                # if payload.is_multipart(): ...
                text2 = payload.get_payload()
        else:
            text2 = msg.get_payload()
            #print("metoda 2b")
        text = html2text.html2text(text2)
except:
    # last resort: give up on the body entirely
    text = ""
#print(text2)
# flatten to a single line
text = text.replace('\n', ' ')
#print(text)
odesilatel = msg['from']  # sender
prijemce = msg['to']  # recipient
predmet = msg['subject']  # subject
datum = msg['date']  # date
#print ('To: %s' % prijemce)  # recipient
#print ('From: %s' % odesilatel)  # sender
#print ('Subject: %s' % predmet)  # subject
#print ('Date: %s' % datum)  # date
#print(text)  # plain text
#print(msg)  # whole message
def create_email(sender_name,
                 sender_email,
                 inbox_uid,
                 to_addr,
                 cc_addr,
                 bcc_addr,
                 subject,
                 html,
                 in_reply_to,
                 references,
                 attachments):
    """
    Creates a MIME email message (both body and sets the needed headers).

    Parameters
    ----------
    sender_name: string
        The name aka phrase of the sender.
    sender_email: string
        The sender's email address.
    inbox_uid:
        Opaque id passed through to add_inbox_headers.
    to_addr, cc_addr, bcc_addr: list of pairs (name, email_address), or None
        Message recipients.
    subject : string
        a utf-8 encoded string
    html : string
        a utf-8 encoded string
    in_reply_to: string or None
        If this message is a reply, the Message-Id of the message
        being replied to.
    references: list or None
        If this message is a reply, the Message-Ids of prior messages in
        the thread.
    attachments: list of dicts, optional
        a list of dicts(filename, data, content_type)

    Returns the RFC-transformed message produced by _rfc_transform.
    """
    html = html if html else ''
    plaintext = html2text(html)
    # Create a multipart/alternative message
    msg = mime.create.multipart('alternative')
    msg.append(
        mime.create.text('plain', plaintext),
        mime.create.text('html', html))
    # Create an outer multipart/mixed message
    if attachments:
        text_msg = msg
        msg = mime.create.multipart('mixed')
        # The first part is the multipart/alternative text part
        msg.append(text_msg)
        # The subsequent parts are the attachment parts
        for a in attachments:
            # Disposition should be inline if we add Content-ID
            msg.append(
                mime.create.attachment(a['content_type'],
                                       a['data'],
                                       filename=a['filename'],
                                       disposition='attachment'))
    msg.headers['Subject'] = subject if subject else ''
    # Gmail sets the From: header to the default sending account. We can
    # however set our own custom phrase i.e. the name that appears next to the
    # email address (useful if the user has multiple aliases and wants to
    # specify which to send as), see: http://lee-phillips.org/gmailRewriting/
    # For other providers, we simply use name = ''
    from_addr = address.EmailAddress(sender_name, sender_email)
    msg.headers['From'] = from_addr.full_spec()
    # Need to set these headers so recipients know we sent the email to them
    # TODO(emfree): should these really be unicode?
    if to_addr:
        full_to_specs = [
            address.EmailAddress(name, spec).full_spec()
            for name, spec in to_addr
        ]
        msg.headers['To'] = u', '.join(full_to_specs)
    if cc_addr:
        full_cc_specs = [
            address.EmailAddress(name, spec).full_spec()
            for name, spec in cc_addr
        ]
        msg.headers['Cc'] = u', '.join(full_cc_specs)
    if bcc_addr:
        full_bcc_specs = [
            address.EmailAddress(name, spec).full_spec()
            for name, spec in bcc_addr
        ]
        msg.headers['Bcc'] = u', '.join(full_bcc_specs)
    add_inbox_headers(msg, inbox_uid)
    if in_reply_to:
        msg.headers['In-Reply-To'] = in_reply_to
    if references:
        msg.headers['References'] = '\t'.join(references)
    rfcmsg = _rfc_transform(msg)
    return rfcmsg
def deploy():
    """Drop and recreate the database, then seed it with demo users plus the
    questions, answers and activity feeds stored in ``zhihu_questions.pk``.

    Fixes over the previous version:
    * ``db.session.commit`` (in the ``i == 0`` branch) was an attribute
      access, not a call — the first question was never committed there.
    * on exception the loop did ``continue`` without advancing ``i``, so a
      persistently failing record caused an infinite loop; the record is now
      skipped instead.
    """
    db.drop_all()
    db.create_all()

    # Seed users: (email, nickname, password) triples.
    users = [('*****@*****.**', u'知乎小管家', 'password'),
             ('*****@*****.**', u'Jack', 'password'),
             ('*****@*****.**', u'Jim', 'password'),
             ('*****@*****.**', u'麻花疼', 'password'),
             ('*****@*****.**', u'丁磊', 'password'),
             ('*****@*****.**', u'张家玮', 'password'),
             ('*****@*****.**', u'李开复', 'password'),
             ('*****@*****.**', u'张小北', 'password'),
             ('*****@*****.**', u'采铜', 'password'),
             ('*****@*****.**', u'张亮', 'password'),
             ('*****@*****.**', u'周晓农', 'password'),
             ('*****@*****.**', u'李楠', 'password'),
             ('*****@*****.**', u'马伯庸', 'password'),
             ('*****@*****.**', u'笑道人', 'password'),
             ('*****@*****.**', u'谢熊猫君', 'password')]
    for email, nickname, password in users:
        u = User(email=email, nickname=nickname, password=password)
        u.username = create_username(u.nickname)
        db.session.add(u)
    db.session.commit()

    # Every user follows one randomly chosen other user.
    users = User.query.all()
    for user in users:
        other_users = users[:]
        other_users.remove(user)
        user.follow_user(choice(other_users))
    db.session.commit()

    # Load the scraped question/answer corpus.
    with open('zhihu_questions.pk', 'rb') as f:
        infos = pk.load(f)

    users = User.query.all()
    i = 0
    while i < len(infos):
        q_html = infos[i]['detail']
        title = infos[i]['title']
        try:
            if i == 0:
                question = Question(user=users[0], title=title,
                                    content=html2text(q_html),
                                    content_html=q_html)
                db.session.add(question)
                # BUG FIX: was `db.session.commit` (never actually called).
                db.session.commit()
                a_html = infos[i]['answers'][0]
                answer = Answer(author=users[0], question=question,
                                content=html2text(a_html),
                                content_html=a_html)
                db.session.add(answer)
                db.session.commit()
                feed1 = Feed(user=users[0], action="ask_question",
                             question=question)
                feed2 = Feed(user=users[0], action="answer_question",
                             question=question, answer=answer)
                db.session.add_all([feed1, feed2])
                db.session.commit()
            else:
                q_html = infos[i]['detail']
                title = infos[i]['title']
                # Leave random gaps between question ids.
                prev_question = Question.query.order_by(
                    Question.id.desc()).first()
                id_plus = randint(1, 4)
                question_id = prev_question.id + id_plus
                asker = choice(users)
                question = Question(id=question_id, user=asker, title=title,
                                    content=html2text(q_html),
                                    content_html=q_html)
                db.session.add(question)
                db.session.commit()
                feed1 = Feed(user=asker, action="ask_question",
                             question=question)
                db.session.add(feed1)
                db.session.commit()
                # Each answer comes from a distinct user.
                answerers = users[:]
                j = 0
                while j < len(infos[i]['answers']):
                    answerer = choice(answerers)
                    a_html = infos[i]['answers'][j]
                    answer = Answer(author=answerer, question=question,
                                    content=html2text(a_html),
                                    content_html=a_html)
                    db.session.add(answer)
                    db.session.commit()
                    feed2 = Feed(user=answerer, action="answer_question",
                                 question=question, answer=answer)
                    db.session.add(feed2)
                    db.session.commit()
                    answerers.remove(answerer)
                    j += 1
        except Exception:
            # BUG FIX: previously `continue` skipped the increment below and
            # retried the same failing record forever. Skip it instead.
            i += 1
            continue
        i += 1
        print(u'第%s个问题已收录' % i)
def get_text():
    """Fetch the page at the module-level ``page_link`` URL and return its
    plain-text rendering produced by html2text."""
    response = requests.get(page_link, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    return html2text.html2text(str(soup))
def content(self):
    """Return the plain-text rendering of ``self.html``."""
    # A newline -> <br /> replacement was previously considered here but is
    # intentionally disabled.
    return html2text.html2text(self.html)
def load_newsletter_html():
    """Loads html newsletter and returns a html file and a text file to be
    used by email service"""
    with codecs.open("app/templates/newsletter.html", 'r', 'utf-8') as fh:
        markup = fh.read()
    plain = html2text.html2text(markup)
    return plain, markup
def html2markdown(text):
    """
    Convert html text to markdown and replace special unicode characters
    """
    decoded = text.decode('utf-8')
    markdown = html2text.html2text(decoded)
    return remove_unicode(markdown)
def get_findings(self, filename, test):
    """Parse an AWS Scout2 JSON report (``aws_info = {...}`` file) into a
    list of de-duplicated Finding objects.

    Side effects: sets ``test.description`` to a markdown summary of the run
    and calls ``test.save()``; also reads/resets ``self.item_data`` via
    ``self.recursive_print``.
    """
    content = filename.read()
    if type(content) is bytes:
        content = content.decode('utf-8')
    # The report is a JS assignment; strip the prefix to get raw JSON.
    raw_data = content.replace("aws_info =", "")
    data = json.loads(raw_data)
    find_date = datetime.now()
    dupes = {}
    # Build a markdown description of the scan run for the Test record.
    test_description = ""
    aws_account_id = data["aws_account_id"]
    test_description = "%s **AWS Account:** %s\n" % (test_description,
                                                    aws_account_id)
    last_run = data["last_run"]
    test_description = "%s **Ruleset:** %s\n" % (test_description,
                                                last_run["ruleset_name"])
    test_description = "%s **Ruleset Description:** %s\n" % (
        test_description, last_run["ruleset_about"])
    test_description = "%s **Command:** %s\n" % (test_description,
                                                last_run["cmd"])
    # Summary for AWS Services
    test_description = "%s\n**AWS Services** \n\n" % (test_description)
    for service, items in list(last_run["summary"].items()):
        test_description = "%s\n**%s** \n" % (test_description,
                                              service.upper())
        test_description = "%s\n* **Checked Items:** %s\n" % (
            test_description, items["checked_items"])
        test_description = "%s* **Flagged Items:** %s\n" % (
            test_description, items["flagged_items"])
        test_description = "%s* **Max Level:** %s\n" % (test_description,
                                                        items["max_level"])
        test_description = "%s* **Resource Count:** %s\n" % (
            test_description, items["resources_count"])
        test_description = "%s* **Rules Count:** %s\n\n" % (
            test_description, items["rules_count"])
    test.description = test_description
    test.save()
    scout2_findings = []
    # Configured AWS Services: walk every service entry looking for a
    # "findings" mapping.
    for service in list(data["services"].items()):
        for service_item in service:
            if "findings" in service_item:
                # NOTE(review): the inner `for name in finding["items"]`
                # below shadows this `name`; only the item paths are used.
                for name, finding in list(
                        service_item["findings"].items()):
                    if finding["items"]:
                        description_text = ""
                        for name in finding["items"]:
                            description_text = description_text + "**Location:** " + name + "\n\n---\n"
                            description_text = description_text + "\n"
                            # Follow the dotted item path into the service
                            # tree to find the offending object.
                            key = name.split('.')
                            i = 1
                            lookup = service_item
                            while i < len(key):
                                if key[i] in lookup:
                                    if (type(lookup[key[i]]) is dict):
                                        lookup = lookup[key[i]]
                                        # Stop descending once past these
                                        # container keys.
                                        if (key[i - 1] == "security_groups"
                                                or key[i - 1] == "PolicyDocument"):
                                            break
                                i = i + 1
                            # recursive_print accumulates into
                            # self.item_data; consume and reset it.
                            self.recursive_print(lookup)
                            description_text = description_text + self.item_data
                            self.item_data = ""
                        mobsf_item = {
                            "category": "Mobile Permissions",
                            "title": finding["description"],
                            "severity": finding["level"],
                            "description": description_text
                        }
                        scout2_findings.append(mobsf_item)
    # De-duplicate by (severity, title); duplicates have their descriptions
    # concatenated.
    for scout2_finding in scout2_findings:
        title = html2text(scout2_finding["title"])
        sev = self.getCriticalityRating(scout2_finding["severity"])
        description = scout2_finding["description"]
        dupe_key = sev + title
        if dupe_key in dupes:
            find = dupes[dupe_key]
            if description is not None:
                find.description += description
        else:
            find = Finding(
                title=textwrap.shorten(title, 150),
                cwe=
                1032,  # Security Configuration Weaknesses, would like to fine tune
                test=test,
                description="**AWS Account:** " + aws_account_id + "\n" +
                description,
                severity=sev,
                references=None,
                date=find_date,
                dynamic_finding=True)
            dupes[dupe_key] = find
    return list(dupes.values())
def fetchMd(self):
    """Fetch this object's HTML and return it converted to Markdown."""
    raw_html = self.fetchHtml()
    return html2text.html2text(raw_html)
def readinput2(input_file, DescIdx):
    """Read ':::'-separated phenotype records from *input_file*, clean the
    Topic concept lists (drop LOINC-mapped names, excluded CUIs, and
    findings not present in SNOMED-CT), assign phenotype categories by a
    rule cascade, and print one tab-separated output line per record.

    DescIdx is the field index of the phenotype description; Theme/Topic/SOI
    fields are read at fixed offsets after it. Python 2 code (``has_key``,
    ``print`` statement).
    """
    # Demographic keywords
    Demographics = [
        "marital", "married", "unmarried", "single", "separated", "engaged",
        "divorced", "widowed", "widow", "widower", "domestic partnership",
        "unmarried partner", "cohabiting", "civil union", "education",
        "graduate", "occupation", "birthplace", "salary", "relationship"
    ]
    DemographicsIS = ["single", "separated"]
    # Keep SNOMED-CT CUI finding codes
    SNOMED = {}
    #SNOMED_source = '/home/sondoan/pfindr/VariableStandadization/phen-classification/data/MRCONSO_SNOMEDCT_finding_unique.ID'
    SNOMED_source = '/home/sodoan/pfindr/Normalization/Abbreviation/MRCONSO_SNOMEDCT_finding_unique.ID'
    f1 = open(SNOMED_source, 'r')
    for item1 in f1.readlines():
        if len(item1.strip()) > 0:
            SNOMED[item1.strip()] = 1
    f1.close()
    fin = open(input_file, 'r')
    for items in fin.readlines():
        item = items.split(':::')
        #PhenDesc = item[0].strip()
        # Default DescIdx is 1, is the index of phenotype description
        # DescIdx = 1
        PhenDesc = item[DescIdx].strip()
        if PhenDesc.find('<a href>') >= 0:
            PhenDesc = html2text.html2text(PhenDesc)
        Theme = item[DescIdx + 1].strip()
        ThemePCN = item[DescIdx + 2]
        ThemeCUI = item[DescIdx + 3]
        ThemeSem = item[DescIdx + 4]
        TopicPCN = item[DescIdx + 5]
        TopicCUI = item[DescIdx + 6]
        TopicSem = item[DescIdx + 7]
        SOIPCN = item[DescIdx + 8]
        SOICUI = item[DescIdx + 9]
        SOISem = item[DescIdx + 10]
        # ==========================================
        # Remove LOINC code from TopicPCN (PCN/CUI/Sem lists are parallel,
        # ';'-separated; entries are kept/dropped in lockstep).
        TopicPCNL1 = TopicPCN.split(';')
        TopicCUIL1 = TopicCUI.split(';')
        TopicSemL1 = TopicSem.split(';')
        tempPCN = ''
        tempCUI = ''
        tempSem = ''
        iPCN = 0
        for iTopic in TopicPCNL1:
            #print iPCN
            #print len(iTopic.strip())
            if not mapLOINC(iTopic) and len(iTopic.strip()) > 0:
                tempPCN += iTopic + ';'
                tempCUI += TopicCUIL1[iPCN] + ';'
                tempSem += TopicSemL1[iPCN] + ';'
            #print tempPCN
            #print tempCUI
            iPCN += 1
        TopicPCN = tempPCN.strip(';')
        TopicCUI = tempCUI.strip(';')
        TopicSem = tempSem.strip(';')
        # ==========================================
        # Remove Excluded CUIs
        CUIEx = [
            "C0555047", "C0087136", "C1549113", "C0682073", "C0086170",
            "C0206275", "C0425152", "C0425164", "C0682187", "C0013658",
            "C0337664", "C0337676", "C0337677", "C0337679", "C0560184",
            "C2699517", "C1558950", "C0579133", "C0750479", "C0238884",
            "C1550043", "C0449255", "C2053594", "C0011900", "C1299586",
            "C1704632", "C0518459", "C0013798", "C0849912", "C0476610",
            "C1287845", "C2825142", "C1832071", "C0016928", "C0518461",
            "C1832073", "C2004062", "C1444656", "C0496675", "C0262926",
            "C1657765", "C0240320", "C2970713", "C1820407", "C1444648",
            "C1955473", "C1509143", "C0516977", "C1514241", "C0848632",
            "C1705236", "C1705179", "C2826292", "C2826292", "C0871269",
            "C0518462", "C0449416", "C1301826", "C0429103", "C0427693",
            "C1363945", "C0040210", "C1299582", "C1273517", "C0439540",
            "C1444647", "C0234766", "C2926735", "C0682295"
        ]
        # List of PRODUCT names
        CUIEx1 = [
            'C2926735', 'C0308903', 'C0308902', 'C0310197', 'C0722923',
            'C2348077'
        ]
        TopicPCNL1 = TopicPCN.split(';')
        TopicCUIL1 = TopicCUI.split(';')
        TopicSemL1 = TopicSem.split(';')
        iPCN = 0
        # NOTE(review): removing from a list while iterating it, and still
        # incrementing iPCN, can skip/misalign entries — kept as-is.
        for iCUI in TopicCUIL1:
            if mapList(iCUI, CUIEx1):
                TopicPCNL1.remove(TopicPCNL1[iPCN])
                TopicCUIL1.remove(iCUI)
                TopicSemL1.remove(TopicSemL1[iPCN])
            iPCN += 1
        TopicPCN = ';'.join(TopicPCNL1[0:]).strip()
        TopicCUI = ';'.join(TopicCUIL1[0:]).strip()
        TopicSem = ';'.join(TopicSemL1[0:]).strip()
        # ==========================================
        # Keep SNOMED-CT finding only -- Filter by SNOMED CT, just keep
        # finding in SNOMED-CT from UMLS findings
        # Keep Topic
        TopicPCNL1 = TopicPCN.split(';')
        TopicCUIL1 = TopicCUI.split(';')
        TopicSemL1 = TopicSem.split(';')
        iPCN = 0
        for iCUI in TopicSemL1:
            if iCUI.find('fndg') >= 0:
                # Check if it exists in SNOMED List
                if not SNOMED.has_key(TopicCUIL1[iPCN].strip()):
                    # Remove item from the list
                    TopicSemL1.remove(iCUI)
                    TopicCUIL1.remove(TopicCUIL1[iPCN])
                    TopicPCNL1.remove(TopicPCNL1[iPCN])
            iPCN += 1
        TopicPCN = ';'.join(TopicPCNL1[0:]).strip()
        TopicCUI = ';'.join(TopicCUIL1[0:]).strip()
        TopicSem = ';'.join(TopicSemL1[0:]).strip()
        # ----------------------------
        phenCategory = []
        # RULE STARTING
        ### Type: Demographic
        #if mapList(PhenDesc.lower(),Demographics) or len(Theme)>0:
        #if Theme!='NULL' and len(ThemePCN)>0:
        #    if not 'Demographics' in phenCategory:
        #        phenCategory.append('Demographics')
        # Modified on July 11, 2014
        Patient = ['C0030705', 'C0679646', 'C0681850']
        if Theme != 'NULL' and len(ThemePCN) > 0:
            if mapList(SOICUI, Patient):
                phenCategory.append('Demographics Patient')
            else:
                phenCategory.append('Demographics Family')
        ### Type: Medication
        MedPatient = ['C0030705', 'C0679646', 'C0681850']
        if mapList(SOICUI, MedPatient) and (mapList(TopicSem, ['phsu'])
                                            or PhenDesc.lower().find('medication') >= 0):
            if not 'Medication Patient' in phenCategory:
                phenCategory.append('Medication Patient')
        if not mapList(SOICUI, MedPatient) and (
                mapList(TopicSem, ['phsu'])
                or PhenDesc.lower().find('medication') >= 0):
            if not 'Medication Family' in phenCategory:
                phenCategory.append('Medication Family')
        ### Type: Lab Test
        # NOTE(review): LabTest is the patient-CUI list (same as Patient /
        # MedPatient) and is reused by all the patient-vs-family rules below.
        LabTest = ['C0030705', 'C0679646', 'C0681850']
        if mapList(SOICUI, LabTest) and mapList(TopicSem, ['lbpr']):
            if not 'Lab Tests Patient' in phenCategory:
                phenCategory.append('Lab Tests Patient')
        if not mapList(SOICUI, LabTest) and mapList(TopicSem, ['lbpr']):
            if not 'Lab Tests Family' in phenCategory:
                phenCategory.append('Lab Tests Family')
        ### Type: Mental or Emotional Finding
        MentalFinding = ['menp']
        if mapList(SOICUI, LabTest) and mapList(TopicSem, MentalFinding):
            if not 'Mental or Emotional Finding' in phenCategory:
                phenCategory.append('Mental or Emotional Finding')
        if not mapList(SOICUI, LabTest) and mapList(TopicSem, MentalFinding):
            if not 'Mental or Emotional Finding Family' in phenCategory:
                phenCategory.append('Mental or Emotional Finding Family')
        ### Type: Smoking History
        SmokingHistory = [
            "smoke", "smoking", "smoker", "tobacco", "cigarette", "pipe",
            "cigar", "nicotine"
        ]
        if mapList(SOICUI, LabTest) and mapList(PhenDesc.lower(),
                                                SmokingHistory):
            if not 'Smoking History' in phenCategory:
                phenCategory.append('Smoking History')
        if not mapList(SOICUI, LabTest) and mapList(PhenDesc.lower(),
                                                    SmokingHistory):
            if not 'Smoking History Family' in phenCategory:
                phenCategory.append('Smoking History Family')
        ### Type: Drinking History
        DrinkingEx = ["C0337676", "C0337677", "C0337679"]
        DrinkingHistory = [
            "drink", "drinker", "alcohol", "liquor", "drunk", "beer",
            "wine", "drinking"
        ]
        if mapList(
                SOICUI,
                LabTest) and PhenDesc.lower().find('drinking function') == -1:
            if mapList(PhenDesc.lower(),
                       DrinkingHistory) and not mapList(TopicCUI, DrinkingEx):
                if not 'Drinking History' in phenCategory:
                    phenCategory.append('Drinking History')
        if not mapList(SOICUI, LabTest) and PhenDesc.lower().find(
                'drinking function') == -1:
            if mapList(PhenDesc.lower(),
                       DrinkingHistory) and not mapList(TopicCUI, DrinkingEx):
                if not 'Drinking History Family' in phenCategory:
                    phenCategory.append('Drinking History Family')
        ### Type: Substance Use History
        SubstanceUseHistory = [
            "cocaine", "opiate", "stimulant", "marijuana", "pot", "cannabis"
        ]
        ExSubstance = [
            'smoke', 'smoking', 'smoker', 'tobacco', 'cigarette', 'pipe',
            'cigar', 'nicotine'
        ]
        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), SubstanceUseHistory) or mapList(
                    TopicSem, ['hops']):
                if not mapList(PhenDesc, ExSubstance):
                    if not 'Substance Use History' in phenCategory:
                        phenCategory.append('Substance Use History')
        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), SubstanceUseHistory) or mapList(
                    TopicSem, ['hops']):
                if not mapList(PhenDesc, ExSubstance):
                    if not 'Substance Use History Family' in phenCategory:
                        phenCategory.append('Substance Use History Family')
        ### Type: Eating or Nutritional Finding
        Eating = ["food", "vitamin", "nutrition", "water"]
        if mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['food']) or mapList(PhenDesc.lower(),
                                                      Eating):
                if not 'Eating or Nutritional Finding' in phenCategory:
                    phenCategory.append('Eating or Nutritional Finding')
        if not mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['food']) or mapList(PhenDesc.lower(),
                                                      Eating):
                if not 'Eating or Nutritional Finding Family' in phenCategory:
                    phenCategory.append('Eating or Nutritional Finding Family')
        ### Type: Self-care Status
        Selfcare = [
            "selfcare", "self care", "self-care", "dressing", "grooming",
            "bathing", "eating", "toileting", "hygiene"
        ]
        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Selfcare):
                if not 'Self-care Status' in phenCategory:
                    phenCategory.append('Self-care Status')
        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Selfcare):
                # NOTE(review): guard checks 'Self-care Status' but appends
                # the 'Family' variant — kept as-is.
                if not 'Self-care Status' in phenCategory:
                    phenCategory.append('Self-care Status Family')
        ### Type: Healthcare Activity Finding
        Healthcare = [
            "medical care", "hospital", "appointment", "follow up", "f/u",
            "follow-up", "visit", "encounter", "service"
        ]
        #if mapList(SOICUI,LabTest):
        #    if mapList(TopicSem,['hlca']) or mapList(PhenDesc.lower(),Healthcare):
        #        if not 'Healthcare Activity Finding' in phenCategory:
        #            phenCategory.append('Healthcare Activity Finding')
        #if not mapList(SOICUI,LabTest):
        #    if mapList(TopicSem,['hlca']) or mapList(PhenDesc.lower(),Healthcare):
        #        if not 'Healthcare Activity Finding Family' in phenCategory:
        #            phenCategory.append('Healthcare Activity Finding Family')
        if mapList(PhenDesc.lower(), Healthcare):
            if not 'Healthcare Encounter' in phenCategory:
                phenCategory.append('Healthcare Encounter')
        ## Type: Therapeutic or Preventive Procedure
        if mapList(SOICUI, LabTest) and mapList(TopicSem, ['topp']):
            if not 'Therapeutic or Preventive Procedure' in phenCategory:
                phenCategory.append('Therapeutic or Preventive Procedure')
        if not mapList(SOICUI, LabTest) and mapList(TopicSem, ['topp']):
            if not 'Therapeutic or Preventive Procedure Family' in phenCategory:
                phenCategory.append(
                    'Therapeutic or Preventive Procedure Family')
        ### Type: Clinical Attributes
        ClinicalAttL = [
            "gestational age", "basal metabolic rate", "body surface area",
            "blood pressure", "body mass index", "body weight",
            "diastolic blood pressure", "heart rate", "height",
            "respiration rate", "systolic blood pressure", "temperature",
            "temperature, pulse, respiration", "weight", "vital sign",
            "body temperature", "pulse rate", "systolic pressure",
            "diastolic pressure", "resting pressure", "pulse pressure",
            "heartbeat", "birth weight", "body fat distribution",
            "adiposity", "waist circumference", "waist-hip ratio",
            "head circumference", "chest circumference", "pulse",
            "respiratory depth", "pulse deficit", "pain",
            "oxygen saturation", "pupil size", "pupil equality",
            "pupil reactivity to light", "pulse oximetry", "diameter",
            "perimeter", "systolic", "diastolic", "visual acuity"
        ]
        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ClinicalAttL) and not mapList(
                    PhenDesc.lower(), ['weighting', 'weighted']):
                if not 'Clinical Attributes' in phenCategory:
                    phenCategory.append('Clinical Attributes')
        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ClinicalAttL):
                if not 'Clinical Attributes Family' in phenCategory and not mapList(
                        PhenDesc.lower(), ['weighting', 'weighted']):
                    phenCategory.append('Clinical Attributes Family')
        ### Type: Research Attributes
        ResearchTerms = [
            "control group", "control status", "case", "case control",
            "case-control", "protocol"
        ]
        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ResearchTerms) or mapList(
                    TopicSem, ['resa']):
                if not 'Research Attributes' in phenCategory:
                    phenCategory.append('Research Attributes')
        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ResearchTerms) or mapList(
                    TopicSem, ['resa']):
                if not 'Research Attributes Family' in phenCategory:
                    phenCategory.append('Research Attributes Family')
        ## REMOVE CO-OCCURENCE Types, e.g., Daily or Recreation Activity
        ## doesnot occurs with Clinical Attributes, Lab Test, Diagnostic
        ## Procedure.
        # If Diagnostic Procedure co-occurs with Clinical Attribute, then
        # ignore.
        ### Type: Diagnostic Procedure
        Diagnosis = [
            'ecg', 'electrocardiogram', 't wave', 't-wave', 'wave feature',
            'qrs', 'rr interval', 'r wave', 'p wave', 'q duration', 's wave'
        ]
        if mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['diap']) or mapList(PhenDesc.lower(),
                                                      Diagnosis):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory:
                    if not 'Diagnostic Procedure' in phenCategory:
                        phenCategory.append('Diagnostic Procedure')
        if not mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['diap']) or mapList(PhenDesc.lower(),
                                                      Diagnosis):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory:
                    if not 'Diagnostic Procedure Family' in phenCategory:
                        phenCategory.append('Diagnostic Procedure Family')
        ### Type: Daily or Recreational Activity
        Activity = [
            "gait", "walking", "exercise", "sport", "workout", "gambling",
            "sleep", "toilet", "chore", "stand", "eat out"
        ]
        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Activity) or mapList(
                    TopicSem, ['dora']):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory and not 'Lab Test' in phenCategory and not 'Lab Test Family' in phenCategory and not 'Diagnostic Procedure' in phenCategory and not 'Diagnostic Procedure Family' in phenCategory:
                    if not 'Daily or Recreational Activity' in phenCategory:
                        phenCategory.append('Daily or Recreational Activity')
        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Activity) or mapList(
                    TopicSem, ['dora']):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory and not 'Lab Test' in phenCategory and not 'Lab Test Family' in phenCategory and not 'Diagnostic Procedure' in phenCategory and not 'Diagnostic Procedure Family' in phenCategory:
                    if not 'Daily or Recreational Activity Family' in phenCategory:
                        phenCategory.append(
                            'Daily or Recreational Activity Family')
        # IF Medical History co-occurs with any of (Daily or Recreational
        # Activity, Eating or Nutritional Finding, Drinking History) then
        # ignore (i.e., drop Medical History from the assigned types).
        ### Type: Medical History
        MedHist = [
            'dsyn', 'neop', 'sosy', 'acab', 'anab', 'biof', 'cgab', 'fndg',
            'inpo', 'orgf', 'patf', 'phsf', 'mobd'
        ]
        TopicL1 = TopicCUI.split(';')
        TopicS1 = TopicSem.split(';')
        idx1 = 0
        #print TopicS1
        for iTopic in TopicL1:
            if mapList(SOICUI, LabTest) and mapList(TopicS1[idx1], MedHist):
                if not 'Daily or Recreational Activity' in phenCategory and not 'Daily or Recreational Activity Family' in phenCategory and not 'Eating or Nutritional Finding' in phenCategory and not 'Eating or Nutritional Finding Family' in phenCategory and not 'Drinking History' in phenCategory and not 'Drinking History Family' in phenCategory:
                    if not 'Medical History' in phenCategory:
                        phenCategory.append('Medical History')
            if not mapList(SOICUI, LabTest) and mapList(
                    TopicS1[idx1], MedHist):
                if not 'Daily or Recreational Activity' in phenCategory and not 'Daily or Recreational Activity Family' in phenCategory and not 'Eating or Nutritional Finding' in phenCategory and not 'Eating or Nutritional Finding Family' in phenCategory and not 'Drinking History' in phenCategory and not 'Drinking History Family' in phenCategory:
                    if not 'Medical History Family' in phenCategory:
                        phenCategory.append('Medical History Family')
            idx1 += 1
        # End the rules
        # =========================================================================
        # PRINT OUT THE MAPPING
        phenCatStr = ';'.join(phenCategory[0:])
        #print item
        #print "======="
        #print phenCatStr
        #print "======="
        # Print to Excel file
        ExcelOut = '\t'.join(item[0:]).strip() + '\t' + phenCatStr
        # Print to text file
        #ExcelOut = ':::'.join(item[0:]).strip() + ':::' + TopicPCN + ':::' + TopicCUI + ':::' + TopicSem + ':::' + phenCatStr
        print ExcelOut
    fin.close()
def hash(self, url, title, html):
    """Build an md5 fingerprint from the URL's domain, the title, and the
    plain-text rendering of the HTML body (None HTML counts as empty)."""
    body = "" if html is None else html
    fingerprint = "%s_%s_%s" % (self.urlParser.getDomain(url), title,
                                html2text.html2text(body))
    return md5(fingerprint)
if r.get('QW_HTML'): try: template = Template(r.get('QW_HTML')) except: print( "Error: Cannot parse quarantine template, falling back to default template." ) with open('/templates/quota.tpl') as file_: template = Template(file_.read()) else: with open('/templates/quota.tpl') as file_: template = Template(file_.read()) html = template.render(username=username, percent=percent) text = html2text.html2text(html) try: msg = MIMEMultipart('alternative') msg['From'] = r.get('QW_SENDER') or "quota-warning@localhost" msg['Subject'] = r.get('QW_SUBJ') or "Quota warning" msg['Date'] = formatdate(localtime=True) text_part = MIMEText(text, 'plain', 'utf-8') html_part = MIMEText(html, 'html', 'utf-8') msg.attach(text_part) msg.attach(html_part) msg['To'] = username p = Popen([ '/usr/lib/dovecot/dovecot-lda', '-d', username, '-o', '"plugin/quota=maildir:User quota:noenforcing"' ],
rows = cur.fetchall() for row in rows: entry_date = row['Modified'] entry = dayonelib.DayOneEntry() entry.time = entry_date entry.tags = ['pagico', 'interaction'] # unseralize the body of the note row_content = loads(row['Content']) # entry body text entry_text = "%s" % (row_content['Body']) entry_text = html2text.html2text(entry_text) # entry title entry_title = row_content['Title'] # All notes on a contact will have a parent. Skip anything without a parent if row["ParentID"] is not None: # Get contact info parent_query = 'SELECT * FROM mach WHERE UID="%s"' % ( row['ParentID']) cur.execute(parent_query) parent = cur.fetchone() parent_content = loads(parent['content']) # Make sure the parent is a contact(type Profile) if parent['Type'] == 'Profile':
def get_mark(html):
    """Convert *html* to markdown and render it for the terminal via mdv."""
    markdown = ht.html2text(html)
    return mdv.main(markdown)
import urllib.request as urllib
import bs4
import html2text

# Yahoo Finance analyst-opinion page for Gilead (GILD).
url = 'https://de.finance.yahoo.com/q/ao?s=GILD'
beautiful = urllib.urlopen(url).read()
soup = bs4.BeautifulSoup(beautiful, 'lxml')
txt = html2text.html2text(soup.get_text())

# Locate the label "Empfehlung (diese Woche):" ("recommendation, this
# week") and slice out the 3 characters that follow it.
str1 = "Empfehlung (diese Woche):"
len_val = 3
pos = txt.find(str1)
# BUG FIX: the original assignment had an unbalanced '(' which made the
# whole script a SyntaxError; the find() result is also hoisted so it is
# computed once instead of five times.
str1_and_value = txt[pos:pos + len(str1) + len_val]
str1_value = txt[pos + len(str1):pos + len(str1) + len_val]
def summary_txt(self):
    """Plain-text summary: 'individual' communications are already stored
    as text; anything else is converted from HTML."""
    if self.communication_type != "individual":
        return html2text.html2text(self.summary)
    return self.summary
url = Goldcoasturl vm = "Mtippett-vm2" elif City == "Adelaide": url = Adelaideurl vm = "Mtippett-vm3" elif City == "Hobart": url = Hobarturl vm = "Apac-intern-vm3" elif City == "Melbourne": url = Melbourneurl vm = "Apac-intern-vm3" elif City == "Perth": url = Perthurl vm = "Mtippett-vm1" filename = folder + "/" + City + "/" + datafile + ".txt" if not os.path.exists(filename): print "Copying from " + url + " to " + filename html2text.BODY_WIDTH = 0 # displays word wrapping so no line breaks are made in the middle of a line. html_content = urllib2.urlopen(url).read() # read the html page rendered_content = html2text.html2text( html_content ) # convert it to text and save to variable rendered_content f = open(filename, 'w') f.write(rendered_content) #save the txt file f.close() copyfile( filename, "//" + vm + "//c$//Users//thanh//Desktop//Weather_Data//" + City + "//" + datafile + ".txt")
def test_function(fn, func_args):
    """Regression check: html2text output for file *fn* (with the given
    keyword options) must match its stored baseline."""
    with open(fn) as handle:
        actual = html2text.html2text(handle.read(), **func_args)
    expected = get_baseline(fn)
    assert expected == actual
import pathlib
import re
from urllib import request

from bs4 import BeautifulSoup
from html2text import html2text


def retrieve_article(link):
    """Download *link* and return the <article> node inside #content."""
    html_doc = request.urlopen(link).read().decode('utf-8')
    soup = BeautifulSoup(html_doc, 'html.parser')
    return soup.body.find(id="content").article


# BUG FIX: `re` and `pathlib` were used below but never imported; both are
# now imported at the top (and the regex is a raw string).
# if a link matches Kata{0-9}{0-9}
grammar = re.compile(r"Kata\d\d")

# root of kata links
html_doc = request.urlopen("http://codekata.pragprog.com/").read().decode(
    'utf-8')
soup = BeautifulSoup(html_doc, 'html.parser')
links = {
    link.get("href")
    for link in soup.find_all('a') if grammar.findall(str(link))
}

# retrieve the text on a kata site and write it as a file
for link in links:
    uri = link.split("/")[-2]
    pathlib.Path(uri).mkdir(exist_ok=True)
    with open("{0}/kata.md".format(uri), "w") as kata:
        kata.write(html2text(str(retrieve_article(link))))
#!/usr/bin/env python import sys from definitions import Definitions from html2text import html2text title = sys.argv[1] definition = Definitions.article(title) txt = html2text(definition) print txt.encode('utf-8')
def to_text(self):
    """Return this object's HTML converted to plain text."""
    markup = self.html
    return html2text.html2text(markup)
# Download and parse the article, exiting the script with status 2 on
# either failure.
try:
    article.download()
except:
    print("Failed when downloading")
    sys.exit(2)
try:
    article.parse()
except:
    print("Error during parsing article")
    sys.exit(2)
#soup = BeautifulSoup(article.article_html, 'html.parser')
#soup.find('div', id="header").decompose()
# Convert the article HTML to markdown-ish plain text.
data = html2text.html2text(article.article_html)
# print(article.text)
# print(article.title)
# Write the result to post.md, starting with the title when available.
print('Writing to post.md')
fd = open('post.md', 'w')
if (article.title):
    fd.write(article.title)
    fd.write("\n\n")
# Build a "via [site](url)" attribution line from the source URL.
baseURL = getBaseURL(webURL)
viaURL = 'via [' + baseURL + '](' + webURL + ')'
async def server(self, ctx, ip="opmines.net"):
    """Query api.mcsrvstat.us for a Minecraft server's status and send a
    Discord embed with online state, player counts, connection info, MOTD
    and the player list (staff members starred).

    Previous aiohttp implementation, kept for reference:
        async with aiohttp.ClientSession() as session:
            async with session.get("https://api.mcsrvstat.us/1/opmines.net",
                                   headers={"User-Agent": "Mozilla/5.0 ..."}) as resp:
                st = await resp.read()
    """
    # Blocking HTTP call; the prints are debug output.
    st = requests.get("https://api.mcsrvstat.us/1/" + ip)
    print(st)
    st = st.text
    print(st)
    serverdata = json.loads(st)
    # Build the comma-separated player list; staff (in `opstaff`) are
    # bolded with a star.
    players = ""
    p = 0
    hasp = False
    if "players" in serverdata:
        if "list" in serverdata['players']:
            hasp = True
            for member in serverdata['players']['list']:
                p += 1
                if len(players) != 0:
                    players += " , "
                if member.lower() in opstaff:
                    players += "**" + member + "** :star:"
                else:
                    players += member
    embed = discord.Embed(title="Server Status")
    if ip == "opmines.net":
        embed.set_thumbnail(
            url="http://files.enjin.com/780854/opmines%20logo%20222.png")
    embed.add_field(
        name="Status",
        value="Online" if not "offline" in serverdata else "Offline")
    if hasp == True:
        # Occupancy percentage with one decimal place.
        per = math.floor((serverdata['players']['online'] /
                          serverdata['players']['max']) * 1000) / 10
        embed.add_field(name="Members Online",
                        value=str(serverdata['players']['online']) + " / " +
                        str(serverdata['players']['max']) + " (" + str(per) +
                        "%)")
    # The API may omit 'version'; fall back to "?" in that case.
    try:
        embed.add_field(name="Connection Stats",
                        value=serverdata['ip'] + ":" +
                        str(serverdata['port']) + " / " +
                        serverdata['version'],
                        inline=True)
    except:
        embed.add_field(name="Connection Stats",
                        value=serverdata['ip'] + ":" +
                        str(serverdata['port']) + " / ?",
                        inline=True)
    # Convert the HTML MOTD to text and escape Discord markdown characters.
    try:
        embed.add_field(name="MOTD",
                        value=html2text.html2text(
                            serverdata['motd']['html'][0]).replace(
                                "\\", "").replace("</span>", "").replace(
                                    "\n", "").replace("*", "\\*").replace(
                                        "_", "\\_").replace("~", "\\~"))
    except:
        embed.add_field(name="MOTD", value="?")
    # An empty player list makes add_field raise; report offline instead.
    try:
        embed.add_field(name="Player List (" + str(p) + ")", value=players)
    except:
        embed.add_field(name="Player List (unable to retrieve)",
                        value="This server may be offline")
    await ctx.send(embed=embed)
def get_items(self, tree, vulns, test):
    """
    Walk a Nexpose XML report: first collect per-host dicts (name,
    hostnames, services and their vulns), then fold every vulnerability
    into de-duplicated Finding objects keyed by severity + name.

    @return hosts A list of Host instances
    """
    x = list()
    if tree is None:
        return x
    for nodes in tree.iter('nodes'):
        "in nodes"
        for node in nodes.iter('node'):
            host = dict()
            host['name'] = node.get('address')
            host['hostnames'] = set()
            host['os'] = ""
            host['services'] = list()
            # host['vulns'] = self.parse_tests_type(node, vulns)
            for names in node.iter('names'):
                for name in list(names):
                    host['hostnames'].add(name.text)
            for endpoints in node.iter('endpoints'):
                for endpoint in list(endpoints):
                    svc = {
                        'protocol': endpoint.get('protocol'),
                        'port': endpoint.get('port'),
                        'status': endpoint.get('status'),
                    }
                    for services in endpoint.iter('services'):
                        for service in list(services):
                            svc['name'] = service.get('name')
                            svc['vulns'] = self.parse_tests_type(
                                service, vulns)
                            # A "banner" configuration entry carries the
                            # service version string.
                            for configs in service.iter('configurations'):
                                for config in list(configs):
                                    if "banner" in config.get('name'):
                                        svc['version'] = config.get('name')
                            host['services'].append(svc)
            x.append(host)
    dupes = {}
    for item in x:
        for service in item['services']:
            for vuln in service['vulns']:
                # Map the numeric severity back to its label; `sev` keeps
                # the matching key after the break.
                for sev, num_sev in Finding.SEVERITIES.items():
                    if num_sev == vuln['severity']:
                        break
                dupe_key = sev + vuln['name']
                if dupe_key in dupes:
                    find = dupes[dupe_key]
                    dupe_text = html2text.html2text(vuln['pluginOutput'])
                    if dupe_text not in find.description:
                        find.description += "\n\n" + dupe_text
                else:
                    # Expand bare CERT/CVE identifiers into URLs; the first
                    # two refs are used as impact / skipped.
                    refs = ''
                    for ref in vuln['refs'][2:]:
                        if ref.startswith('CA'):
                            ref = "https://www.cert.org/advisories/" + ref + ".html"
                        elif ref.startswith('CVE'):
                            ref = "https://cve.mitre.org/cgi-bin/cvename.cgi?name=" + ref
                        refs += ref
                        refs += "\n"
                    find = Finding(
                        title=vuln['name'],
                        description=html2text.html2text(
                            vuln['desc'].strip()) + "\n\n" +
                        html2text.html2text(vuln['pluginOutput'].strip()),
                        severity=sev,
                        numerical_severity=Finding.get_numerical_severity(
                            sev),
                        mitigation=html2text.html2text(vuln['resolution']),
                        impact=vuln['refs'][0],
                        references=refs,
                        test=test,
                        active=False,
                        verified=False,
                        false_p=False,
                        duplicate=False,
                        out_of_scope=False,
                        mitigated=None,
                        dynamic_finding=True)
                    find.unsaved_endpoints = list()
                    dupes[dupe_key] = find
                    find.unsaved_endpoints.append(
                        Endpoint(host=item['name'],
                                 product=test.engagement.product))
                    for hostname in item['hostnames']:
                        find.unsaved_endpoints.append(
                            Endpoint(host=hostname,
                                     product=test.engagement.product))
                    for service in item['services']:
                        if len(service['vulns']) > 0:
                            find.unsaved_endpoints.append(
                                Endpoint(
                                    host=item['name'] +
                                    (":" + service['port'])
                                    if service['port'] is not None else "",
                                    product=test.engagement.product))
    return list(dupes.values())
.replace("{date}",datetime.datetime.today().strftime("%d/%m/%Y"))\ .replace("{b64email}",base64.b64encode(email))\ .replace("{b64remail}",base64.b64encode(email)[::-1]) if re.search("{randomint}", body): ri = random.randint(1, 9999999) print "Random integer: " + email + " : " + str(ri) body = body.replace("{randomint}", str(ri)) randomints = True fp = open(intsfile, "a") re.write(email + ":" + str(ri)) fp.close() msg.attach(MIMEText(body, "html")) if args.text: msg.attach(MIMEText(html2text.html2text(body), 'plain')) # Find any embedded images and attach attachments = re.findall('src="cid:([^"]+)"', body) for attachment in attachments: fp = open(attachment, "rb") img = MIMEImage(fp.read()) fp.close() img.add_header('Content-ID', attachment) msg.attach(img) # Optional attachment if args.attachment: filename = os.path.basename(args.attachment) part = MIMEBase('application', "octet-stream") part.set_payload(open(args.attachment, "rb").read())
def convert_html_to_text(result_txt):
    """Format *result_txt* into a bytes payload, decode it as UTF-8, and
    return the plain-text conversion of the HTML."""
    payload = b'%s' % (result_txt)
    decoded = payload.decode('utf-8')
    return html2text.html2text(decoded)