Example #1
0
 def get_moodle_section(self, sectionid, chapter, activity_title=""):
     '''
     Convert one Moodle section (sections/section_<sectionid>/section.xml)
     into an edX <sequential> appended to `chapter`.

     sectionid is a number.  activity_title is a fallback used to derive a
     display name when the section itself has no usable name.  Returns the
     new <sequential> element, or None when the section summary is empty.
     '''
     sdir = 'sections/section_%s' % sectionid
     xml = etree.parse('%s/%s/section.xml' % (self.moodle_dir, sdir)).getroot()
     name = xml.find('name').text
     contents = xml.find('summary').text
     if contents is None:
         contents = ''
     # Strip MS-Word empty-paragraph markers from the summary HTML.
     contents = contents.replace('<o:p></o:p>', '')
     # if moodle author didn't bother to set name, but instead used <h2> then grab name from that
     # ($@NULL@$ is Moodle's serialized NULL value)
     if not name or name=='$@NULL@$':
         m = re.search('<h2(| align="left")>(.*?)</h2>', contents)
         if m:
             name = html2text.html2text(m.group(2))
             name = name.replace('\n','').replace('\r','')
     # Fallback 2: first non-empty line of the summary, truncated to 50 chars.
     if not name or name=='$@NULL@$':
         htext = html2text.html2text(contents)
         # print "Warning: empty name for section %s, contents=%s ..." %  (sectionid, htext.split('\n')[0].strip())
         name = htext[:50].split('\n')[0].strip()
     # Fallback 3: derive the name from the activity title.
     if not name:
         name = activity_title.strip().split('\n')[0].strip()[:50]
     name = name.strip()
     print "--> Section: %s" % name
     chapter.set('display_name', name)
     if contents:
         seq = etree.SubElement(chapter,'sequential')
         self.set_sequential_name(seq, name)
         # dupok=False: every section must get a unique url_name.
         url_name = self.make_url_name('section_%s__%s' % (sectionid, name), dupok=False)
         self.save_as_html(url_name, name, contents, seq)
         return seq
     return None
    def parse_ticket(self, id):
        """Fetch one tracker ticket by id and return it as a dict.

        Ticket fields come from the CSV export; the description and the
        comment thread come from the RSS export.  The returned dict has a
        'comments' list of per-comment dicts attached.
        """
        # Use CSV export to get ticket fields
        url = self.full_url(self.TICKET_URL % id, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        ticket_fields = reader.next()
        ticket_fields['class'] = 'ARTIFACT'
        ticket = self.remap_fields(ticket_fields)

        # Use RSS export to get ticket description and comments
        import html2text
        html2text.BODY_WIDTH = 0  # disable line wrapping in the converted text
        url = self.full_url(self.TICKET_URL % id, 'rss')
        self.log_url(url)
        d = feedparser.parse(urlopen(url))
        ticket['description'] = html2text.html2text(d.feed.description)
        comments = []
        for comment in d['entries']:
            c = {}
            c['submitter'] = getattr(comment, 'author', None)  # author may be absent
            c['date'] = comment.updated_parsed
            c['comment'] = html2text.html2text(comment.summary)
            c['class'] = 'COMMENT'
            comments.append(c)
        ticket['comments'] = comments
        return ticket
Example #3
0
def prompt_repost(request, story):
    """Post (or update) the story body as a reddit comment on its
    writing-prompt thread, then render a debug page with the links."""
    story = Post.objects.get(slug=story)
    parent = story.parent
    prompt = parent.body
    prompt_url = parent.reddit_url

    reddit = praw.Reddit(user_agent='Test Script by /u/raymestalez')
    reddit.login(os.environ["REDDIT_USERNAME"], os.environ["REDDIT_PASSWORD"])
    subreddit = reddit.get_subreddit('OrangeMind')

    body_md = html2text(story.body)
    if story.reddit_url:
        # Already posted once: edit the existing comment in place.
        comment = reddit.get_submission(story.reddit_url).comments[0]
        comment.edit(body_md)
    else:
        # First post: comment on the prompt thread and remember the permalink.
        prompt_thread = reddit.get_submission(url=prompt_url)
        comment = prompt_thread.add_comment(body_md)
        story.reddit_url = comment.permalink
        story.save()

    teststring = ("<br/>url: " + prompt_url +
                  "<br/>Reddit premalink: " + comment.permalink)

    return render(request, 'posts/test.html', {
        'teststring': teststring,
    })
	def run_thread(self, start, lim):
		"""Worker: process `lim` movie questions starting at offset `start`.

		For each question, topic words are extracted from its title, its
		HTML body and any answer bodies; the per-question counts are then
		merged into the shared movies_trending_topics collection.
		"""
		client = MongoClient('localhost', 27017)
		my_db = client['Grupo07']

		questions= my_db.movies_questions.find().sort("question_id", 1).skip(start).limit(lim)
		i=0
		for q in questions:
			i= i+1
			print str(start) +" : " +str(i)  # progress: "<offset> : <count>"
			q_topics= []
			# Topics from the plain-text title and the HTML body.
			q_topics = q_topics + self.extract_topics_list(q['title'])	
			q_topics = q_topics + self.extract_topics_list(html2text.html2text(q['body']))
			if "answers" in q.keys():
				for a in q["answers"]:
					q_topics = q_topics + self.extract_topics_list(html2text.html2text(a['body']))	
			# Count occurrences of each topic word within this question.
			topics_dict={}
			for w in q_topics:
				if w in topics_dict:
					topics_dict[w]=topics_dict[w] + 1
				else:
					topics_dict[w]=1

			# Merge the counts into the trending-topics collection (upsert by word).
			for k in topics_dict.keys():
				topicObj = my_db.movies_trending_topics.find_one({"word": k})
				if topicObj!=None:		
					topicObj['count']=topicObj['count']+topics_dict[k];			
					my_db.movies_trending_topics.update({'_id': topicObj['_id']}, {"$set": topicObj}, upsert=False)
				else:
					topicObj= {"word": k, "count": topics_dict[k]}
					my_db.movies_trending_topics.insert(topicObj)
Example #5
0
	def readFileAtPath(self, posix_path):
		"""
		Read the file at *posix_path*, trying utf-8 then latin-1 encoding,
		and return its content converted from HTML to plain text.
		Failed reads are recorded in self.fails.

		@parameters
		posix_path		pathlib.Path	the concerned filepath at which the method should read

		@returns		string	html-free content of filepath
						bool	False if encoding unknown or file not readable
		"""

		print("parsing: "+posix_path.name)
		try:
			with posix_path.open(encoding="utf-8") as f:  # general encoding
				return html2text(f.read())
		except UnicodeDecodeError:
			try:
				with posix_path.open(encoding="latin-1") as f:  # german language encoding
					return html2text(f.read())
			# Narrowed from a bare `except:` (which also swallowed
			# KeyboardInterrupt/SystemExit): only OS-level read failures
			# count as a failed file here; latin-1 decodes any byte.
			except OSError:
				self.fails.append(posix_path.name)
				return False
		# Missing file, permission error, etc. -- record and signal failure.
		except OSError:
			self.fails.append(posix_path.name)
			return False
    def handle(self, *args, **options):
        """Management command: create a Work for every portfolio
        spreadsheet row (creating the Designer on demand), then attach
        subjects, collections and comma-separated keyword tags.
        """
        for row in all_portfolio_rows():
            # Spreadsheet columns are bilingual: Hebrew headers alongside
            # their English counterparts.
            w = Work.objects.create(
                name_he=row.get(u'שם העבודה', ''),
                name_en=row.get('Document Title', ''),
                description_he=html2text(row.get(u'תאור', '')),
                description_en=html2text(row.get(u'Description', '')),
                discipline=match_discipline(row),
                country=match_country(row.get(u'ארץ', '')),
                designer=Designer.objects.get_or_create(
                    name_he=row.get(u'מעצב', ''),
                    defaults={'name_en': row.get('Designer', '')})[0],
                category=match_category(row.get(u'קטגוריה')),
                size_as_text=row.get(u'גודל', ''),
                publish_date_as_text=row.get(u'תאריך', ''),
                publish_year=int(row.get(u'תאריך', ''))if row.get(
                    u'תאריך', '').isdigit() else None,
                client=row.get(u'לקוח', ''),
                technique=match_technique(row.get(u'טכניקה', '')),
                is_self_collected=match_is_self_collected(
                    row.get(u'מעצב', ''),
                    row.get(u'מאוסף', '')),
                raw_image=File(open(os.path.join(
                    settings.PORFOLIO_IMAGE_DIR,
                    row['Filename'])))
            )
            w.subjects = match_subject(row.get(u'נושא'))
            w.of_collections = match_collector(row.get(u'מעצב', ''),
                                               row.get(u'מאוסף', ''))

            # Keyword cell is HTML: convert, split on commas, tag each piece.
            for keyword in [keyword.strip() for keyword in html2text(row.get(u'מילות מפתח', '')).split(',')]:
                if keyword:
                    w.tags.add(keyword)
Example #7
0
def _set_text_and_samples(tree: lxml.html.HtmlElement, problem: Problem) -> None:
    """Extract the statement, input/output sections and sample I/O from a
    problem page and store them on *problem*.

    Matched sections (and the sample block) are removed from the tree so
    the remaining markup becomes the problem statement text.
    """
    text = tree.get_element_by_id('problem_text')
    # Drop the "problem source" footer from the statement.
    source = text.find_class('problem_source')[0]
    source.getparent().remove(source)
    input_next = False
    output_next = False
    # An "Input"/"Output" heading (English or Russian) flags that the
    # NEXT child div holds the corresponding section body.
    for div in text.iterchildren():
        if div.text in ['Input', 'Исходные данные']:
            input_next = True
        elif div.text in ['Output', 'Результат']:
            output_next = True
        elif input_next:
            input_next = False
            problem.input = html2text(lxml.html.tostring(div).decode('utf-8')).strip()
        elif output_next:
            output_next = False
            problem.output = html2text(lxml.html.tostring(div).decode('utf-8')).strip()
        else:
            continue  # unrelated div: keep it in the statement text
        div.getparent().remove(div)
    samples = text.find_class('sample')
    if len(samples) == 1:
        sample = samples[0]
        sample_h3 = text.find_class('problem_subtitle')
        sample_texts = [u'Sample', u'Пример', u'Samples', u'Примеры']
        sample_h3 = next(x for x in sample_h3 if x.text in sample_texts)
        sample.getparent().remove(sample)
        sample_h3.getparent().remove(sample_h3)
        # Sample table cells alternate: even indices are inputs, odd outputs.
        intables = sample.find_class('intable')
        problem.sample_inputs = [x.text.rstrip() for x in intables[0::2]]
        problem.sample_outputs = [x.text.rstrip() for x in intables[1::2]]
    problem.text = html2text(lxml.html.tostring(text).decode('utf-8')).strip()
Example #8
0
def build_whoosh_database():
    """Rebuild the whoosh full-text index from every info record and
    every post document, using a Chinese-aware analyzer."""
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer), type=TEXT(stored=True), link=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    index = create_in(whoosh_database, schema)
    writer = index.writer()

    # Index every "info" record.
    for rec in MApp().get_all():
        plain = html2text.html2text(tornado.escape.xhtml_unescape(rec.cnt_html))
        writer.add_document(
            title=rec.title,
            type='<span style="color:red;">[信息]</span>',
            link='/info/{0}'.format(rec.uid),
            content=plain,
        )

    # Index every post document.
    for rec in MPost().query_all():
        plain = html2text.html2text(tornado.escape.xhtml_unescape(rec.cnt_html))
        print(plain)
        writer.add_document(
            title=rec.title,
            type='<span style="color:blue;">[文档]</span>',
            link='/post/{0}.html'.format(rec.uid),
            content=plain,
        )

    writer.commit()
Example #9
0
 def get_plain_text(self):
     """Return a PlainText whose action/setup/effect/breakdown fields are
     this object's HTML fields converted to text (trailing space stripped)."""
     def _clean(html):
         # Normalize to str, convert markup to text, drop trailing whitespace.
         return html2text(smart_str(html)).rstrip()

     return PlainText(action=_clean(self.action),
                      setup=_clean(self.setup),
                      effect=_clean(self.effect),
                      breakdown=_clean(self.breakdown))
Example #10
0
def write_page(url, number):
    """write_page
    Write the text in a html page in a file.
    The file name is composed of the date and a number.
    :param url: The url where the page can be found
    :param number: A number to make files unique
    """
    def _flatten(html):
        # Convert HTML to text and collapse escaped and real line breaks
        # so each section is written as a single line.
        text = html2text.html2text(html).replace("\\n", " ")
        return text.replace("\\r", "").replace("\n", " ")

    html = get_html_page(url)
    date = find_date_page(html)
    if not date:
        return  # undated pages are skipped entirely
    cat, title, snippet, text = find_text_page(html)
    print("Writing file number ", str(number).zfill(3), url)
    # `with` guarantees the file is closed even if a write raises
    # (the original leaked the handle on error).
    with open("20minutes/" + date + "-" + str(number).zfill(3), "w",
              encoding="utf-16") as fstream:
        for tag, payload in (("category", _flatten(cat)),
                             ("title", _flatten(title)),
                             ("snippet", _flatten(snippet))):
            fstream.write("<" + tag + ">\n")
            fstream.write(payload)
            fstream.write("\n<\\" + tag + ">\n")
        # The article section has no trailing newline (kept as before).
        fstream.write("<article>\n")
        fstream.write(_flatten(text))
        fstream.write("\n<\\article>")
def scrape_comments(issue):
    """Scrape the comment thread of a BitBucket issue from its HTML page.

    Returns a list of dicts with user, created_at, body (utf-8 bytes) and
    comment number.
    """
    # This is a hack since the current BitBucket api does not support pulling comments.
    url = "https://bitbucket.org/%s/%s/issue/%s" % (
        options.bitbucket_username,
        options.bitbucket_repo,
        issue["local_id"],
    )
    content = urllib2.urlopen(url).read()
    bs = BeautifulSoup(content)
    comments = []
    for comment in bs.findAll("li", {"class": " comment-content"}):
        body = comment.find("div", {"class": "issues-comment edit-comment"})
        if body:
            body = html2text(unicode(body))
        else:
            # This is not a comment it is a issue change
            body = html2text(unicode(comment.find("ul", {"class": "issue-changes"})))
        body = clean_body(body)
        user = "******"  # fallback when no author link is present
        try:
            user = comment.findAll("a")[1].getText()
        except IndexError:
            pass

        created_at = comment.find("time").get("datetime")
        number = int(comment.find("a", {"class": "issues-comments-permalink"}).getText().replace("#", ""))

        comments.append({"user": user, "created_at": created_at, "body": body.encode("utf-8"), "number": number})

    return comments
Example #12
0
File: rtc.py Project: frediz/rtccli
def workitem_details(client, workitemid):
    """Print a formatted summary of an RTC work item: core fields,
    parent/children/related links, description and comment thread."""
    # The OSLC query string selects exactly the properties printed below.
    wi = Workitem.getOne(client, workitemid, '?oslc_cm.properties=dc:identifier,\
        dc:type{dc:title},dc:title,rdf:resource,dc:creator{dc:title},\
        rtc_cm:ownedBy{dc:title},dc:description,rtc_cm:state{dc:title},\
        rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.parent{dc:identifier,dc:title},\
        rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.children,\
        rtc_cm:com.ibm.team.workitem.linktype.relatedworkitem.related')
    print
    print "=================================================================="
    print "Workitem ID : " +cl.str(str(wi.js['dc:identifier']), cl.fg.green)+' ('+wi.js['dc:type']['dc:title']+')'
    print "Title       : " +cl.str(wi.js['dc:title'], cl.fg.red)
    print "URL         : " +wi.js['rdf:resource']
    print "State       : " +wi.stateColorize(wi.js['rtc_cm:state']['dc:title'])
    print "Creator     : " +wi.js['dc:creator']['dc:title']
    print "Owner       : " +wi.js['rtc_cm:ownedBy']['dc:title']
    if len(wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.parent']) != 0:
        par = wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.parent'][0]
        print "Parent      : " + str(par['dc:identifier'])+" ("+par['dc:title']+")"
    # "id: title" labels are reformatted as "id (title)" and comma-joined.
    if len(wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.children']) != 0:
        print "Child(ren)  : " + reduce((lambda a, b: a +", "+ b), map((lambda a: re.sub(r'([^:]+): (.*)', r'\1 (\2)', a['oslc_cm:label'])), wi.js['rtc_cm:com.ibm.team.workitem.linktype.parentworkitem.children']))
    if len(wi.js['rtc_cm:com.ibm.team.workitem.linktype.relatedworkitem.related']) != 0:
        print "Related     : " + reduce((lambda a, b: a +", "+ b), map((lambda a: re.sub(r'([^:]+): (.*)', r'\1 (\2)', a['oslc_cm:label'])), wi.js['rtc_cm:com.ibm.team.workitem.linktype.relatedworkitem.related']))
    print "Description :"
    print html2text.html2text(wi.js['dc:description'])
    comments = wi.get_comments()
    if len(comments) == 0:
        return
    print "Comments :"
    i = 0
    for c in comments:
        print str(i) + ": " +cl.str(c['dc:creator']['dc:title'], cl.fg.green)+" ("+c['dc:created'] + ") :"
        print html2text.html2text(c['dc:description'])
        i = i + 1
Example #13
0
def getbody(html):
    """Heuristically extract Chinese article body lines from raw HTML.

    The bytes are converted to text (utf8 first, gbk fallback) and run
    through html2text; each line is then kept or skipped by a set of
    ad-hoc rules (skip duplicates, link lists, addresses; keep headings
    and lines starting with Chinese characters).  Returns the kept lines
    joined with newlines.
    """
    bodytxt=[]
    # Decode with utf8 first, fall back to gbk (common Chinese encoding).
    try:
        txt = html2text(html.decode('utf8'))
    except:
        txt = html2text(html.decode('gbk'))    
    try:
        txt = txt.encode('utf8').split('\n')
    except:
        txt = txt.encode('gbk').split('\n')
    for r in txt:
        r = r.strip()
        if r == '':continue
        if r in bodytxt:pass      #pass repeat
        elif r[0] == '[':
            # markdown link line: once enough body collected, treat as footer
            if len(bodytxt) >= 5:break
            else:pass
        elif ']' in r[-4:] and len(bodytxt) < 5:pass  #pass writer's name
        elif r[4:] == '****':break                #body end
        elif r[5:]<chr(127):pass            #pass address
        elif r[0].isdigit() and r[2] > chr(127):bodytxt.append(r)  #int+chinese
        elif r[0] == '#' and r[1] != '#':        # title
            # two consecutive titles: the previous one was stray, restart
            try:
                if bodytxt[-1][0] == '#':bodytxt=[]
                else:bodytxt.append(r)
            except:
                bodytxt.append(r)
        elif r[:2] == '**':bodytxt.append(r)      # branch title
        elif r[0] > chr(127):bodytxt.append(r)    #add chinese to bodytxt
        elif r[0] == '#' and len(bodytxt) >= 5:break
    return '\n'.join(bodytxt)
Example #14
0
 def all_answers(self):
     """Print every fetched answer (HTML converted to text), numbered."""
     # NOTE(review): bare attribute access -- presumably a property whose
     # evaluation initializes self.num as a side effect; confirm before
     # removing this line.
     self.answer_num
     find = self.soup.find_all(class_=' zm-editable-content clearfix',
                               limit=self.num)
     for index, answer in enumerate(find):
         print '第%d个答案:\n' % (index+1)
         print html2text.html2text(str(answer))
Example #15
0
    def parse_ticket(self, id):
        """Fetch one Trac ticket by id and return it as a dict.

        Fields come from the CSV export; the description and comments are
        scraped from the HTML ticket page.  The returned dict has a
        'comments' list of per-comment dicts attached.
        """
        # Use CSV export to get ticket fields
        url = self.full_url(self.TICKET_URL % id, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        ticket_fields = reader.next()
        ticket_fields['class'] = 'ARTIFACT'
        ticket = self.remap_fields(ticket_fields)

        # Use HTML export to get ticket description and comments
        import html2text
        html2text.BODY_WIDTH = 0  # disable line wrapping in the converted text
        url = self.full_url(self.TICKET_URL % id)
        self.log_url(url)
        d = BeautifulSoup(urlopen(url))
        self.clean_missing_wiki_links(d)
        desc = d.find('div', 'description').find('div', 'searchable')
        ticket['description'] = html2text.html2text(
            desc.renderContents('utf8').decode('utf8')) if desc else ''
        comments = []
        for comment in d.findAll('form', action='#comment'):
            c = {}
            # Submitter is the trailing name in the "... by <user>" heading.
            c['submitter'] = re.sub(
                r'.* by ', '', comment.find('h3', 'change').text).strip()
            c['date'] = self.trac2z_date(
                comment.find('a', 'timeline')['title'].replace(' in Timeline', ''))
            # Field-change bullet list (if any) is prepended to the body.
            changes = unicode(comment.find('ul', 'changes') or '')
            body = comment.find('div', 'comment')
            body = body.renderContents('utf8').decode('utf8') if body else ''
            c['comment'] = html2text.html2text(changes + body)
            c['class'] = 'COMMENT'
            comments.append(c)
        ticket['comments'] = comments
        return ticket
Example #16
0
    def process_item(self,item,spider):
        """Scrapy pipeline: parse title, question content, best answer and
        all answers out of item["raw"]; drop the item when no title is
        found.  (NOTE: indentation below is mixed tabs/spaces as scraped.)
        """
        if self.notThisPipeline(spider):
            return item
        hxs = HtmlXPathSelector(text=item["raw"])
        title=hxs.select("//*[contains(@class, 'ask-title')]/text()")
	if len(title):
        	item['title']=title.extract()[0]
	else:
		raise DropItem()

        content=hxs.select("//*[contains(@class, 'q-content')]")
	if len(content):
        	item['content']=html2text.html2text(content[0].extract())
	else:
		item['content']=''
        best_answer=hxs.select("//*[contains(@class, 'best-text')]")
        if len(best_answer):
            item['best_answer']=html2text.html2text(best_answer[0].extract())
        else:
            item['best_answer']=""
        anss=hxs.select("//*[contains(@class, 'answer-text')]")
        ext_ans=[]
        for ans in anss:
            ext_ans.append(html2text.html2text(ans.extract()))
        item['answers']=ext_ans

        return item
Example #17
0
def fp_import(request):
    """Import a story (and its chapters, if any) from the configured
    fanfiction.net URL via Munger/FPAdapter and upsert it into the local
    Post table.  Renders a debug page listing what was imported."""
    author = request.user

    url = Util.objects.get(pk=1).ffnet_url

    munger = Munger(url, FPAdapter())
    imported_story = munger.DownloadStory()

    imported_story_title = str(imported_story.title)

    # Upsert: reuse an existing post with this slug, else create a new one.
    # Narrowed from a bare `except:` so that real DB errors still surface.
    try:
        story = Post.objects.get(slug=slugify(imported_story_title))
    except Post.DoesNotExist:
        story = Post()
    story.title = imported_story_title
    story.author = author
    story.post_type = "story"
    story.imported = True
    story.rational = True
    story.published = True

    # Multi-chapter stories (first chapter has a title) keep a blank body;
    # single-chapter stories get the converted chapter text as the body.
    if imported_story.chapters[0].title:
        story.body = " "
    else:
        contents = imported_story.chapters[0].contents
        contents = html2text(str(contents))
        story.body = contents
    story.save()

    teststring = "Imported: " + story.title + "<br/>"

    if imported_story.chapters[0].title:
        for index, imported_chapter in enumerate(imported_story.chapters):
            # Chapter titles arrive as "N. Title" -- keep only the title part.
            title = imported_chapter.title.split(".", 1)[1].strip()
            contents = html2text(str(imported_chapter.contents))

            try:
                chapter = Post.objects.get(slug=slugify(title))
            except Post.DoesNotExist:
                chapter = Post()
            chapter.title = title
            chapter.body = contents
            chapter.number = index + 1
            chapter.author = author
            chapter.post_type = "chapter"
            chapter.imported = True
            chapter.rational = True
            chapter.parent = story
            chapter.save()
            teststring += "Imported: " + chapter.title + "<br/>"

    return render(request, 'posts/import.html', {
        'teststring': teststring,
    })
 def forwards(self, orm):
     "Write your forwards methods here."
     # Note: Remember to use orm['appname.ModelName'] rather than "from appname.models..."
     # Refresh Hebrew/English descriptions on each Work from the portfolio
     # spreadsheet, matching rows to works by file name (sans extension).
     for row in all_portfolio_rows():
         work = orm.Work.objects.get(sidar_id=remove_file_extension(row['Filename']))
         work.description_he = html2text(row[u'תאור']).strip()
         work.description_en = html2text(row[u'Description']).strip()
         work.save()
Example #19
0
 def decorated(*args, **kwargs):
     """Call the wrapped function and print a one-line status summary.

     Falls back to printing the response body (html2text'd, then raw)
     when the structured (status, body) shape is unavailable.
     """
     response = func(*args)
     try:
         print '%s %s: %s' % (kwargs.get('label', 'Node'), statuses[response[0]['status']], args[0])
     except:
         # Response not in the expected shape: best-effort display.
         try:
             print html2text(response[1]).replace('\n\n', '\n')
         except:
             print response[1]
 def notify(self, subject, message):
     '''
     Output the subject and message, after converting the message html to
     markdown text.
     '''
     
     # Subject is printed as a " == title == " banner line.
     print " == " + subject + " == \n"
     
     print html2text.html2text(message)
Example #21
0
def get_weather():
	"""Fetch the daily-maximum and current temperature from the
	weather.sun.ac.za live-data API.

	Returns (max_temp, cur_temp) -- the raw API responses converted to
	plain text (still strings, not parsed numbers).
	"""
	
	import html2text
	import urllib2
	
	max_temp = html2text.html2text(urllib2.urlopen('http://weather.sun.ac.za/api/getlivedata.php?maxtemp').read())
	cur_temp = html2text.html2text(urllib2.urlopen('http://weather.sun.ac.za/api/getlivedata.php?temperature').read())
	
	return max_temp, cur_temp
Example #22
0
        def copy_data(target, data):
            """Copy SEO fields plus per-language text content onto *target*.

            NOTE(review): both assignments write ``target.content``; this
            only makes sense if a translation layer (e.g.
            django-modeltranslation) maps the attribute to the currently
            activated language's field -- confirm that is in use.
            """
            copy_seo_data(target, data)
            translation.activate('en')
            target.content = html2text.html2text(data.text_en)
            translation.deactivate()

            translation.activate('fr')
            target.content = html2text.html2text(data.text_fr)
            translation.deactivate()
Example #23
0
def import_page(name, path):
    """Import a pickled page dump from <path>/<name>/data into the CMS.

    A models.HomePage dump updates the singleton HomePage plus its push
    block; anything else becomes a regular Page (slug from the English
    title), with en/fr translations and an optional background image.
    """
    with open(os.path.join(path, name, 'data'), 'rb') as f:
        data = pickle.load(f)

        def copy_data(target, data):
            # Copy SEO fields plus per-language HTML content (as text).
            copy_seo_data(target, data)
            translation.activate('en')
            target.content = html2text.html2text(data.text_en)
            translation.deactivate()

            translation.activate('fr')
            target.content = html2text.html2text(data.text_fr)
            translation.deactivate()

        if isinstance(data, models.HomePage):
            hp = HomePage.get_solo()

            copy_data(hp, data)

            # The home page carries an extra "push" teaser block.
            push = HomePagePush()
            push.home_page = hp
            translation.activate('en')
            push.title = data.push_title_en
            push.content = html2text.html2text(data.push_content_en)
            translation.deactivate()

            translation.activate('fr')
            push.title = data.push_title_fr
            push.content = html2text.html2text(data.push_content_fr)
            translation.deactivate()
            push.save()

            hp.save()

        else:
            p = Page()
            p.slug = slugify(data.title_en.lower())

            copy_data(p, data)
            translation.activate('en')
            p.title = data.title_en
            translation.deactivate()

            translation.activate('fr')
            p.title = data.title_fr
            translation.deactivate()

            if hasattr(data, 'background'):
                img = make_master_image(
                    path,
                    data.background,
                    '{}-background'.format(data.title_en)
                )
                p.background = img

            p.save()
Example #24
0
	def parse_account_statement(self,html):
		"""Parse account-statement HTML into a list of dicts.

		Each 'tableRd' table becomes one record; within it every row's
		first cell is the key and second cell the value (newlines and
		colons stripped from both).
		"""
		ret = []
		soup = BeautifulSoup(html)
		for table in soup.findAll('table',{'class':'tableRd'}):
			rec = {}
			for row in table.findAll('tr'):
				cols = row.findAll('td')
				rec.update( { html2text.html2text(cols[0].find(text=True)).strip('\n:') :  html2text.html2text(cols[1].find(text=True)).strip('\n:') })
			ret.append(rec)
		return ret
Example #25
0
    def process(self, item):
        """Harvest one OpenDataSoft dataset into a local Dataset object.

        Maps ODS metadata (title, description, keywords/themes, license)
        onto the dataset, attaches export resources and ODS-specific
        extras.  Raises HarvestSkipException for record-less datasets.
        """
        ods_dataset = item.kwargs["dataset"]
        dataset_id = ods_dataset["datasetid"]
        ods_metadata = ods_dataset["metas"]

        if not ods_dataset.get('has_records'):
            msg = 'Dataset {datasetid} has no record'.format(**ods_dataset)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        dataset.title = ods_metadata['title']
        dataset.frequency = "unknown"
        description = ods_metadata.get("description", '').strip()
        # NOTE(review): html2text is applied twice (here and on the next
        # line); confirm the double conversion is intentional -- it may
        # escape markdown characters twice.
        description = html2text.html2text(description.strip('\n').strip())
        dataset.description = html2text.html2text(description).strip('\n')
        dataset.private = False

        # Tags come from "keyword" (string or list) ...
        tags = set()
        if "keyword" in ods_metadata:
            if isinstance(ods_metadata['keyword'], list):
                tags |= set(ods_metadata['keyword'])
            else:
                tags.add(ods_metadata['keyword'])

        # ... and from "theme": comma-separated, lower-cased, trimmed.
        if "theme" in ods_metadata:
            if isinstance(ods_metadata["theme"], list):
                for theme in ods_metadata["theme"]:
                    tags.update([t.strip().lower() for t in theme.split(",")])
            else:
                themes = ods_metadata["theme"].split(",")
                tags.update([t.strip().lower() for t in themes])

        dataset.tags = list(tags)

        # Map the ODS license id to a known local License when possible.
        ods_license_id = ods_metadata.get('license')
        if ods_license_id and ods_license_id in self.LICENSES:
            license_id = self.LICENSES[ods_license_id]
            dataset.license = License.objects.get(id=license_id)

        dataset.resources = []

        self.process_resources(dataset, ods_dataset, ('csv', 'json'))

        # Geo-enabled datasets get the geographic exports as well.
        if 'geo' in ods_dataset['features']:
            self.process_resources(dataset, ods_dataset, ('geojson', 'shp'))

        dataset.extras["ods:url"] = self._get_explore_url(dataset_id)
        if "references" in ods_metadata:
            dataset.extras["ods:references"] = ods_metadata["references"]
        dataset.extras["ods:has_records"] = ods_dataset["has_records"]

        return dataset
Example #26
0
	def parse_accounts(self,html):
		"""Parse the accounts-overview HTML into a list of dicts.

		Each 'tabdtl' table becomes one record; within it every row's
		first cell is the key and second cell the value (newlines and
		colons stripped from both).
		"""
		soup = BeautifulSoup(html)
		tables = soup.findAll('table',{'class':'tabdtl'})
		ret=[]
		for table in tables:
			rec = {}
			rows = table.findAll('tr')
			for row in rows:
				cols = row.findAll('td')
				rec.update( { html2text.html2text(cols[0].find(text=True)).strip('\n:') :  html2text.html2text(cols[1].find(text=True)).strip('\n:') })
			ret.append(rec)
		return ret
Example #27
0
    def form_valid(self, form):
        """Persist the submitted dynamic-form data, then send the
        configured confirmation e-mail (recipients/subject/content are
        templates rendered against the form data) and a notification
        e-mail to the form's managers."""
        # save the result
        data = DynamicFormData.objects.create(
                dynamicform   = self.dynamicform,
                raw_post_data = self.request.raw_post_data,
                headers       = '\n'.join(
                    '%s: %s' % (h, self.request.META[h])
                    for h in HTTP_HEADERS if h in self.request.META
                    )
                )

        # create confirmation e-mail
        if self.dynamicform.send_confirmation:
            recipients_template = Template(self.dynamicform.email_recipients)
            subject_template    = Template(self.dynamicform.email_subject)
            content_template    = Template(self.dynamicform.email_content)
            context = Context(form.cleaned_data)
            recipients = recipients_template.render(context)
            subject    = subject_template.render(context)
            content    = content_template.render(context)
            # Plain-text body via html2text; original HTML attached as the
            # multipart alternative.
            msg = EmailMultiAlternatives(
                    force_unicode(subject),
                    html2text(content),
                    settings.DEFAULT_FROM_EMAIL,
                    [address for name, address in rfc822.AddressList(recipients).addresslist],
                    )
            msg.attach_alternative(content, "text/html")
            msg.send()

        # create e-mail for dynamicform manager
        if self.dynamicform.notification_emails:
            recipients = self.dynamicform.notification_emails.split(u',')
            subject = _(u'Someone filled in your online form "%s"') % self.dynamicform.name
            context = RequestContext(
                self.request,
                {
                    'form':            form,
                    'dynamicform':     self.dynamicform,
                    'dynamicformdata': data,
                    'site':            Site.objects.get_current(),
                },
            )
            content = render_to_string(self.dynamicform.email_template, context_instance=context)
            msg = EmailMultiAlternatives(
                    force_unicode(subject),
                    html2text(content),
                    settings.SERVER_EMAIL,
                    [address for name, address in rfc822.AddressList(recipients).addresslist],
                    )
            msg.attach_alternative(content, "text/html")
            msg.send()

        return super(ProcessDynamicFormView, self).form_valid(form)
Example #28
0
	def _getPostInfo(self, value):
		"""Build a post dict (url, title, image, plot, paging flag) from a
		scraped (url, raw_title, img, ...) tuple.

		Returns the post dict on success, or (empty dict, 500) when any
		step raises (the exception is logged via self._exception).
		"""
		if self.__dbg__:
			print self.__plugin__ + " _getPostInfo: " + value[1]

		try:
			post = {}
			
			post['url'] = value[0]
			post['raw_title'] = value[1]
			post['img'] = value[2]
			#raw_descr = value[3]
			raw_descr = ""
			
			"""
			# try to extract info from descr
			release_name = re.compile("release name:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
			for tmp in release_name:
				post['release_name'] = tmp
				plot += "Release name: "+ tmp + "\n"
			
			tmps = re.compile("audio quality:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
			for tmp in tmps:
				post['audio_quality'] = tmp
				plot += tmp + "\n"
				
			tmps = re.compile("video quality:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
			for tmp in tmps:
				post['video_quality'] = tmp
				plot += tmp + "\n"
				
			tmps = re.compile("size:</strong>(.+?)<", re.IGNORECASE).findall(raw_descr)
			for tmp in tmps:
				post['size_str'] = tmp
				plot += tmp + "\n"
			"""
			
			# fill video specific fields, used by info dialog
			post['Title'] = html2text.html2text(post['raw_title'].decode('utf-8'))
			# use multiple lines for multiple titles
			post['Title'] = post['Title'].replace(" & "," &\n")
			#for now, just show raw_descr
			post['Plot'] = html2text.html2text(raw_descr.decode('utf-8'))
			# per default, there is always a next page (if not, will be set in parent function)
			post['next'] = "true"

			if self.__dbg__:
				print self.__plugin__ + " _getPostInfo done"
			return post;
		except:
			self._exception("_getPostInfo")
				
			return ( dict(), 500 )
Example #29
0
 def _get_entry_title(self, entry):
     """Derive a single-line title for a feed entry.

     Prefers the entry's title_detail (converted from HTML when needed);
     otherwise falls back to the first 70 characters of the entry
     content.  Newlines are collapsed to spaces.
     """
     detail = getattr(entry, 'title_detail', None)
     if detail:
         title = detail.value
         if 'html' in detail.type:
             title = _html2text.html2text(title)
     else:
         content = self._get_entry_content(entry)
         title = content['value']
         if content['type'] in ('text/html', 'application/xhtml+xml'):
             title = _html2text.html2text(title)
         title = title[:70]
     return title.replace('\n', ' ').strip()
    def selectValidCVEContent(self, soup):
        """Return the text fragment of *soup* containing "CVE ID:",
        or '' when none is present.

        <li> elements are scanned first; if none match, the whole page
        text is searched and everything from "CVE ID:" onward returned.
        """
        # Prefer a list item that carries the CVE ID.
        for li in soup.find_all('li'):
            item_text = html2text.html2text(li.get_text())
            if item_text.find("CVE ID:") != -1:
                return item_text

        # deal with the no <li> tag for CVE ID case
        full_text = html2text.html2text(soup.get_text())
        start = full_text.find("CVE ID:")
        return full_text[start:] if start != -1 else ''
Example #31
0
 def show(self):  # pylint: disable=no-self-use
     """Fetch the terms via the "show" RPC and return them as plain text."""
     raw_terms = sync_wait(self._call("show"))
     return html2text.html2text(raw_terms)
    def get_email_message(self,
                          template_name,
                          context,
                          from_email=None,
                          to=None,
                          cc=None,
                          bcc=None,
                          headers=None,
                          template_prefix=None,
                          template_suffix=None,
                          template_dir=None,
                          file_extension=None,
                          attachments=None,
                          create_link=False):
        """Render an email template into a Django email message object.

        ``template_name`` (a name or list of names) is rendered with
        ``context`` into up to three parts -- 'subject', 'plain', 'html' --
        and an ``EmailMessage`` or ``EmailMultiAlternatives`` is built from
        whichever body parts exist.  ``template_dir``/``file_extension``
        act as fallbacks for ``template_prefix``/``template_suffix``.

        When ``create_link`` is true, a static copy of the HTML part is
        persisted as a ``SavedEmail`` under a fresh UUID and
        ``context['email_uuid']`` is set so the template can render a
        "view in browser" style link.
        """

        if create_link:
            # Reserve a UUID for the hosted copy of this email.
            email_uuid = uuid.uuid4()
            # NOTE(review): link_context is copied *before* email_uuid is
            # added to context, so the persisted static copy renders
            # without the uuid -- presumably intentional (avoids a
            # self-referential link); confirm.
            link_context = dict(context)
            context['email_uuid'] = email_uuid.hex
            # Inline images cannot be embedded in the hosted copy; swap
            # them for hosted URLs in the static render's context.
            for key, value in context.items():
                if isinstance(value, InlineImage):
                    link_context[key] = self.host_inline_image(value)

        EmailMessage = get_emailmessage_klass()
        EmailMultiAlternatives = get_emailmultialternatives_klass()
        parts = self._render_email(template_name, context, template_prefix
                                   or template_dir, template_suffix
                                   or file_extension)
        plain_part = 'plain' in parts
        html_part = 'html' in parts

        if create_link and html_part:
            # Render again with the link-specific context and persist the
            # static HTML under the reserved UUID.
            static_html_part = self._render_email(
                template_name, link_context, template_prefix or template_dir,
                template_suffix or file_extension)['html']
            from templated_email.models import SavedEmail
            SavedEmail.objects.create(content=static_html_part,
                                      uuid=email_uuid)

        if 'subject' in parts:
            subject = parts['subject']
        else:
            # No subject part rendered: fall back to a subject template
            # from settings; for a list of template names the first match
            # wins, else a generic "<name> email subject" is used.
            subject_dict = getattr(settings, 'TEMPLATED_EMAIL_DJANGO_SUBJECTS',
                                   {})
            if isinstance(template_name, (list, tuple)):
                for template in template_name:
                    if template in subject_dict:
                        subject_template = subject_dict[template]
                        break
                else:
                    subject_template = _('%s email subject' % template_name[0])
            else:
                subject_template = subject_dict.get(
                    template_name, _('%s email subject' % template_name))
            # Subject templates may %-interpolate context values.
            subject = subject_template % context
        subject = subject.strip('\n\r')  # strip newlines from subject

        # Auto-derive a plain-text alternative from the HTML unless the
        # TEMPLATED_EMAIL_AUTO_PLAIN setting disables it (html2text may be
        # None when the optional dependency is absent).
        if html_part and not plain_part and html2text and \
                getattr(settings, 'TEMPLATED_EMAIL_AUTO_PLAIN', True):
            parts['plain'] = html2text.html2text(parts['html'])
            plain_part = True

        # NOTE(review): if neither a 'plain' nor an 'html' part was
        # rendered, none of the branches below runs and ``e`` is unbound,
        # raising NameError at the end -- confirm templates always provide
        # at least one body part.
        if plain_part and not html_part:
            e = EmailMessage(
                subject,
                parts['plain'],
                from_email,
                to,
                cc=cc,
                bcc=bcc,
                headers=headers,
                attachments=attachments,
            )

        if html_part and not plain_part:
            e = EmailMessage(
                subject,
                parts['html'],
                from_email,
                to,
                cc=cc,
                bcc=bcc,
                headers=headers,
                attachments=attachments,
            )
            e.content_subtype = 'html'

        if plain_part and html_part:
            e = EmailMultiAlternatives(
                subject,
                parts['plain'],
                from_email,
                to,
                cc=cc,
                bcc=bcc,
                headers=headers,
                attachments=attachments,
            )
            e.attach_alternative(parts['html'], 'text/html')

        self.attach_inline_images(e, context)
        return e
 def _get_signature(self):
     """Return the current user's mail signature converted to plain text."""
     current_user = self.env.user
     return html2text.html2text(current_user.signature)
 def parse_author(response):
     """Extract the article author from *response*.

     Tries several page-layout variants in order and returns the first
     non-empty match; falls back to the post-source block converted to
     plain text.
     """
     layout_queries = (
         '//div[@class="left_news"]/div[@class="page_town_row"][last()]/text()',
         '//div[@class="left_news"]/div[@class="page_town_row"]/p/strong/em/text()',
         '//div[@class="left_news"]/div[last()]/p[last()]/text()',
     )
     for query in layout_queries:
         author = response.xpath(query).get()
         if author:
             return author
     return html2text(response.xpath(
         '//div[@class="left_news"]//p[@class="post-source"]').get())
Example #35
0
 # Extract the plain-text body and basic headers from an email message.
 # `msg` is presumably an email.message.EmailMessage parsed upstream --
 # TODO confirm against the caller.
 text2 = ""
 try:
     try:
         text = msg.get_body(
             preferencelist=('plain')).get_content()  # plain text
         #print("method 1")
     except:
         # No plain-text body part available: fall back to the raw
         # payload(s) and convert whatever HTML they contain to text.
         if msg.is_multipart():
             for payload in msg.get_payload():
                 #print("method 2a")
                 # if payload.is_multipart(): ...
                 text2 = payload.get_payload()
         else:
             text2 = msg.get_payload()
             #print("method 2b")
         text = html2text.html2text(text2)
 except:
     # Conversion failed as well; give up and use an empty body.
     text = ""
 #print(text2)
 text = text.replace('\n', ' ')  # collapse the body onto one line
 #print(text)
 odesilatel = msg['from']  # sender
 prijemce = msg['to']  # recipient
 predmet = msg['subject']  # subject
 datum = msg['date']  # date
 #print ('To: %s' % prijemce) # recipient
 #print ('From: %s' % odesilatel) # sender
 #print ('Subject: %s' % predmet) # subject
 #print ('Date: %s' % datum) # date
 #print(text) # plain text
 #print(msg) # full message
Example #36
0
def create_email(sender_name, sender_email, inbox_uid, to_addr, cc_addr,
                 bcc_addr, subject, html, in_reply_to, references,
                 attachments):
    """
    Creates a MIME email message (both body and sets the needed headers).

    Parameters
    ----------
    sender_name: string
        The name aka phrase of the sender.
    sender_email: string
        The sender's email address.
    to_addr, cc_addr, bcc_addr: list of pairs (name, email_address), or None
        Message recipients.
    subject : string
        a utf-8 encoded string
    html : string
        a utf-8 encoded string
    in_reply_to: string or None
        If this message is a reply, the Message-Id of the message being replied
        to.
    references: list or None
        If this message is a reply, the Message-Ids of prior messages in the
        thread.
    attachments: list of dicts, optional
        a list of dicts(filename, data, content_type)
    """
    body_html = html or ''
    body_text = html2text(body_html)

    # Body: a multipart/alternative carrying text and HTML variants.
    message = mime.create.multipart('alternative')
    message.append(mime.create.text('plain', body_text),
                   mime.create.text('html', body_html))

    # With attachments, wrap the body in an outer multipart/mixed whose
    # first part is the alternative body and the rest the attachments.
    if attachments:
        alternative_part = message
        message = mime.create.multipart('mixed')
        message.append(alternative_part)
        for attachment in attachments:
            # Disposition should be inline if we add Content-ID
            message.append(
                mime.create.attachment(attachment['content_type'],
                                       attachment['data'],
                                       filename=attachment['filename'],
                                       disposition='attachment'))

    message.headers['Subject'] = subject or ''

    # Gmail sets the From: header to the default sending account. We can
    # however set our own custom phrase i.e. the name that appears next to the
    # email address (useful if the user has multiple aliases and wants to
    # specify which to send as), see: http://lee-phillips.org/gmailRewriting/
    # For other providers, we simply use name = ''
    message.headers['From'] = address.EmailAddress(sender_name,
                                                   sender_email).full_spec()

    def _joined_specs(pairs):
        # Render (name, email) pairs as a comma-separated header value.
        return u', '.join(
            address.EmailAddress(name, spec).full_spec()
            for name, spec in pairs)

    # Need to set these headers so recipients know we sent the email to them
    # TODO(emfree): should these really be unicode?
    if to_addr:
        message.headers['To'] = _joined_specs(to_addr)
    if cc_addr:
        message.headers['Cc'] = _joined_specs(cc_addr)
    if bcc_addr:
        message.headers['Bcc'] = _joined_specs(bcc_addr)

    add_inbox_headers(message, inbox_uid)

    if in_reply_to:
        message.headers['In-Reply-To'] = in_reply_to
    if references:
        message.headers['References'] = '\t'.join(references)

    return _rfc_transform(message)
Example #37
0
def deploy():
    """Rebuild the database and seed it with demo data.

    Creates a set of demo users, makes each one follow a random other
    user, then loads questions/answers from ``zhihu_questions.pk`` and
    inserts them with randomly chosen askers/answerers.

    Fixes over the previous revision:
    * ``db.session.commit`` was referenced without being called (a no-op);
    * a failed insert now calls ``db.session.rollback()`` before retrying,
      since a failed flush leaves the SQLAlchemy session unusable and the
      retry loop would otherwise spin forever on the poisoned session.
    """
    db.drop_all()
    db.create_all()

    # Seed demo users.
    users = [('*****@*****.**', u'知乎小管家', 'password'),
             ('*****@*****.**', u'Jack', 'password'),
             ('*****@*****.**', u'Jim', 'password'),
             ('*****@*****.**', u'麻花疼', 'password'),
             ('*****@*****.**', u'丁磊', 'password'),
             ('*****@*****.**', u'张家玮', 'password'),
             ('*****@*****.**', u'李开复', 'password'),
             ('*****@*****.**', u'张小北', 'password'),
             ('*****@*****.**', u'采铜', 'password'),
             ('*****@*****.**', u'张亮', 'password'),
             ('*****@*****.**', u'周晓农', 'password'),
             ('*****@*****.**', u'李楠', 'password'),
             ('*****@*****.**', u'马伯庸', 'password'),
             ('*****@*****.**', u'笑道人', 'password'),
             ('*****@*****.**', u'谢熊猫君', 'password')]
    for user in users:
        u = User(email=user[0], nickname=user[1], password=user[2])
        u.username = create_username(u.nickname)
        db.session.add(u)
    db.session.commit()

    # Every user follows one random other user.
    users = User.query.all()
    for user in users:
        other_users = users[:]
        other_users.remove(user)
        user2 = choice(other_users)
        user.follow_user(user2)
    db.session.commit()

    # Import questions and answers from the pickled Zhihu dump.
    with open('zhihu_questions.pk', 'rb') as f:
        infos = pk.load(f)
    users = User.query.all()
    i = 0
    while i < len(infos):
        q_html = infos[i]['detail']
        title = infos[i]['title']
        try:
            if i == 0:
                # The first question/answer is pinned to the first user.
                question = Question(user=users[0],
                                    title=title,
                                    content=html2text(q_html),
                                    content_html=q_html)
                db.session.add(question)
                db.session.commit()  # was `db.session.commit` (never called)
                a_html = infos[i]['answers'][0]
                answer = Answer(author=users[0],
                                question=question,
                                content=html2text(a_html),
                                content_html=a_html)
                db.session.add(answer)
                db.session.commit()
                feed1 = Feed(user=users[0],
                             action="ask_question",
                             question=question)
                feed2 = Feed(user=users[0],
                             action="answer_question",
                             question=question,
                             answer=answer)
                db.session.add_all([feed1, feed2])
                db.session.commit()
            else:
                q_html = infos[i]['detail']
                title = infos[i]['title']
                # Spread question ids out by a random gap so the listing
                # looks organic; a collision triggers the retry below.
                prev_question = Question.query.order_by(
                    Question.id.desc()).first()
                id_plus = randint(1, 4)
                question_id = prev_question.id + id_plus
                asker = choice(users)
                question = Question(id=question_id,
                                    user=asker,
                                    title=title,
                                    content=html2text(q_html),
                                    content_html=q_html)
                db.session.add(question)
                db.session.commit()
                feed1 = Feed(user=asker,
                             action="ask_question",
                             question=question)
                db.session.add(feed1)
                db.session.commit()
                # Each answer comes from a distinct random answerer.
                answerers = users[:]
                j = 0
                while j < len(infos[i]['answers']):
                    answerer = choice(answerers)
                    a_html = infos[i]['answers'][j]
                    answer = Answer(author=answerer,
                                    question=question,
                                    content=html2text(a_html),
                                    content_html=a_html)
                    db.session.add(answer)
                    db.session.commit()
                    feed2 = Feed(user=answerer,
                                 action="answer_question",
                                 question=question,
                                 answer=answer)
                    db.session.add(feed2)
                    db.session.commit()
                    answerers.remove(answerer)
                    j += 1
        except Exception:
            # Commit can fail (e.g. the random question id collides).
            # Discard the broken session state, then retry this item
            # with a fresh random id.
            db.session.rollback()
            continue
        i += 1
        print(u'第%s个问题已收录' % i)
Example #38
0
def get_text(url=None):
    """Download a web page and return its content converted to plain text.

    Parameters
    ----------
    url : str, optional
        Page to fetch.  Defaults to the module-level ``page_link`` for
        backward compatibility with existing callers.

    Returns
    -------
    str
        The page content converted to text/Markdown via ``html2text``.
    """
    target = url if url is not None else page_link
    page_response = requests.get(target, timeout=5)
    # Re-serialize through BeautifulSoup so malformed markup is normalized
    # before the text conversion.
    page_content = BeautifulSoup(page_response.content, "html.parser")
    return html2text.html2text(str(page_content))
Example #39
0
 def content(self):
     """Return the stored HTML rendered as plain text.

     A newline-to-``<br />`` substitution was previously considered here
     and is intentionally left disabled.
     """
     return html2text.html2text(self.html)
Example #40
0
 def load_newsletter_html():
     """Load the HTML newsletter template.

     Returns a ``(text, html)`` pair for the email service: the raw HTML
     plus a plain-text variant derived from it.
     """
     with codecs.open("app/templates/newsletter.html", 'r', 'utf-8') as handle:
         html = handle.read()
     return html2text.html2text(html), html
Example #41
0
def html2markdown(text):
    """
    Convert html text to markdown and replace special unicode characters
    """
    markdown = html2text.html2text(text.decode('utf-8'))
    return remove_unicode(markdown)
Example #42
0
    def get_findings(self, filename, test):
        """Parse a Scout2 ``aws_info`` JS report into Finding objects.

        ``filename`` is a file-like object holding the report; the leading
        ``aws_info =`` assignment is stripped and the remainder parsed as
        JSON.  A markdown run summary is written onto ``test.description``
        (and saved), every flagged service finding becomes a ``Finding``,
        and findings sharing severity+title are merged by concatenating
        their descriptions.  Returns the deduplicated findings.
        """
        content = filename.read()
        if type(content) is bytes:
            content = content.decode('utf-8')
        # The report is a JS file of the form "aws_info = {...}"; drop the
        # assignment so the rest parses as plain JSON.
        raw_data = content.replace("aws_info =", "")
        data = json.loads(raw_data)
        find_date = datetime.now()
        dupes = {}

        # Build a markdown summary of the scan run and its parameters.
        test_description = ""
        aws_account_id = data["aws_account_id"]
        test_description = "%s  **AWS Account:** %s\n" % (test_description,
                                                          aws_account_id)
        last_run = data["last_run"]
        test_description = "%s  **Ruleset:** %s\n" % (test_description,
                                                      last_run["ruleset_name"])
        test_description = "%s  **Ruleset Description:** %s\n" % (
            test_description, last_run["ruleset_about"])
        test_description = "%s  **Command:** %s\n" % (test_description,
                                                      last_run["cmd"])

        # Summary for AWS Services
        test_description = "%s\n**AWS Services** \n\n" % (test_description)
        for service, items in list(last_run["summary"].items()):
            test_description = "%s\n**%s** \n" % (test_description,
                                                  service.upper())
            test_description = "%s\n* **Checked Items:** %s\n" % (
                test_description, items["checked_items"])
            test_description = "%s* **Flagged Items:** %s\n" % (
                test_description, items["flagged_items"])
            test_description = "%s* **Max Level:** %s\n" % (test_description,
                                                            items["max_level"])
            test_description = "%s* **Resource Count:** %s\n" % (
                test_description, items["resources_count"])
            test_description = "%s* **Rules Count:** %s\n\n" % (
                test_description, items["rules_count"])
        test.description = test_description
        test.save()

        scout2_findings = []

        # Configured AWS Services
        for service in list(data["services"].items()):
            for service_item in service:
                if "findings" in service_item:
                    # NOTE(review): the inner `for name in finding["items"]`
                    # below shadows this loop's `name` (the finding key);
                    # the outer name is unused afterwards, so behavior is
                    # unaffected, but the shadowing is fragile.
                    for name, finding in list(
                            service_item["findings"].items()):
                        if finding["items"]:
                            description_text = ""
                            for name in finding["items"]:
                                description_text = description_text + "**Location:** " + name + "\n\n---\n"
                                description_text = description_text + "\n"
                                # Walk the dotted item path down into the
                                # service blob, stopping early when a
                                # security_groups / PolicyDocument subtree
                                # is reached (its contents are dumped whole).
                                key = name.split('.')
                                i = 1
                                lookup = service_item
                                while i < len(key):
                                    if key[i] in lookup:
                                        if (type(lookup[key[i]]) is dict):
                                            lookup = lookup[key[i]]
                                            if (key[i - 1] == "security_groups"
                                                    or key[i - 1]
                                                    == "PolicyDocument"):
                                                break
                                    i = i + 1

                                # recursive_print accumulates text into
                                # self.item_data; consume and reset it.
                                self.recursive_print(lookup)
                                description_text = description_text + self.item_data
                                self.item_data = ""

                            # NOTE(review): "Mobile Permissions" looks
                            # copy-pasted from a mobile-scanner parser and
                            # odd for an AWS report -- confirm intent.
                            mobsf_item = {
                                "category": "Mobile Permissions",
                                "title": finding["description"],
                                "severity": finding["level"],
                                "description": description_text
                            }
                            scout2_findings.append(mobsf_item)

        # Deduplicate by severity+title; duplicates append their
        # descriptions onto the first occurrence.
        for scout2_finding in scout2_findings:
            title = html2text(scout2_finding["title"])
            sev = self.getCriticalityRating(scout2_finding["severity"])
            description = scout2_finding["description"]
            dupe_key = sev + title
            if dupe_key in dupes:
                find = dupes[dupe_key]
                if description is not None:
                    find.description += description
            else:
                find = Finding(
                    title=textwrap.shorten(title, 150),
                    cwe=
                    1032,  # Security Configuration Weaknesses, would like to fine tune
                    test=test,
                    description="**AWS Account:** " + aws_account_id + "\n" +
                    description,
                    severity=sev,
                    references=None,
                    date=find_date,
                    dynamic_finding=True)
                dupes[dupe_key] = find
        return list(dupes.values())
 def fetchMd(self):
     """Fetch the page HTML and return it converted to Markdown."""
     page_html = self.fetchHtml()
     return html2text.html2text(page_html)
def readinput2(input_file, DescIdx):

    # Demographic keywords
    Demographics = [
        "marital", "married", "unmarried", "single", "separated", "engaged",
        "divorced", "widowed", "widow", "widower", "domestic partnership",
        "unmarried partner", "cohabiting", "civil union", "education",
        "graduate", "occupation", "birthplace", "salary", "relationship"
    ]

    DemographicsIS = ["single", "separated"]

    # Keep SNOMED-CT CUI finding codes
    SNOMED = {}
    #SNOMED_source = '/home/sondoan/pfindr/VariableStandadization/phen-classification/data/MRCONSO_SNOMEDCT_finding_unique.ID'
    SNOMED_source = '/home/sodoan/pfindr/Normalization/Abbreviation/MRCONSO_SNOMEDCT_finding_unique.ID'
    f1 = open(SNOMED_source, 'r')
    for item1 in f1.readlines():
        if len(item1.strip()) > 0:
            SNOMED[item1.strip()] = 1
    f1.close()

    fin = open(input_file, 'r')
    for items in fin.readlines():
        item = items.split(':::')

        #PhenDesc = item[0].strip()

        # Default DescIdx is 1, is the index of phenotype description
        # DescIdx = 1
        PhenDesc = item[DescIdx].strip()
        if PhenDesc.find('<a href>') >= 0:
            PhenDesc = html2text.html2text(PhenDesc)

        Theme = item[DescIdx + 1].strip()
        ThemePCN = item[DescIdx + 2]
        ThemeCUI = item[DescIdx + 3]
        ThemeSem = item[DescIdx + 4]

        TopicPCN = item[DescIdx + 5]
        TopicCUI = item[DescIdx + 6]
        TopicSem = item[DescIdx + 7]

        SOIPCN = item[DescIdx + 8]
        SOICUI = item[DescIdx + 9]
        SOISem = item[DescIdx + 10]

        # ==========================================
        # Remove LOINC code from TopicPCN
        TopicPCNL1 = TopicPCN.split(';')
        TopicCUIL1 = TopicCUI.split(';')
        TopicSemL1 = TopicSem.split(';')

        tempPCN = ''
        tempCUI = ''
        tempSem = ''
        iPCN = 0

        for iTopic in TopicPCNL1:
            #print iPCN
            #print len(iTopic.strip())
            if not mapLOINC(iTopic) and len(iTopic.strip()) > 0:
                tempPCN += iTopic + ';'
                tempCUI += TopicCUIL1[iPCN] + ';'
                tempSem += TopicSemL1[iPCN] + ';'
                #print tempPCN
                #print tempCUI
            iPCN += 1

        TopicPCN = tempPCN.strip(';')
        TopicCUI = tempCUI.strip(';')
        TopicSem = tempSem.strip(';')

        # ==========================================
        # Remove Excluded CUIs
        CUIEx = [
            "C0555047", "C0087136", "C1549113", "C0682073", "C0086170",
            "C0206275", "C0425152", "C0425164", "C0682187", "C0013658",
            "C0337664", "C0337676", "C0337677", "C0337679", "C0560184",
            "C2699517", "C1558950", "C0579133", "C0750479", "C0238884",
            "C1550043", "C0449255", "C2053594", "C0011900", "C1299586",
            "C1704632", "C0518459", "C0013798", "C0849912", "C0476610",
            "C1287845", "C2825142", "C1832071", "C0016928", "C0518461",
            "C1832073", "C2004062", "C1444656", "C0496675", "C0262926",
            "C1657765", "C0240320", "C2970713", "C1820407", "C1444648",
            "C1955473", "C1509143", "C0516977", "C1514241", "C0848632",
            "C1705236", "C1705179", "C2826292", "C2826292", "C0871269",
            "C0518462", "C0449416", "C1301826", "C0429103", "C0427693",
            "C1363945", "C0040210", "C1299582", "C1273517", "C0439540",
            "C1444647", "C0234766", "C2926735", "C0682295"
        ]

        # List of PRODUCT names
        CUIEx1 = [
            'C2926735', 'C0308903', 'C0308902', 'C0310197', 'C0722923',
            'C2348077'
        ]

        TopicPCNL1 = TopicPCN.split(';')
        TopicCUIL1 = TopicCUI.split(';')
        TopicSemL1 = TopicSem.split(';')

        iPCN = 0
        for iCUI in TopicCUIL1:

            if mapList(iCUI, CUIEx1):
                TopicPCNL1.remove(TopicPCNL1[iPCN])
                TopicCUIL1.remove(iCUI)
                TopicSemL1.remove(TopicSemL1[iPCN])

            iPCN += 1

        TopicPCN = ';'.join(TopicPCNL1[0:]).strip()
        TopicCUI = ';'.join(TopicCUIL1[0:]).strip()
        TopicSem = ';'.join(TopicSemL1[0:]).strip()

        # ==========================================
        # Keep SNOMED-CT finding only  -- Filter by SNOMED CT, just keep finding in SNOMED-CT from UMLS findings

        # Keep Topic

        TopicPCNL1 = TopicPCN.split(';')
        TopicCUIL1 = TopicCUI.split(';')
        TopicSemL1 = TopicSem.split(';')

        iPCN = 0
        for iCUI in TopicSemL1:
            if iCUI.find('fndg') >= 0:
                # Check if it exists in SNOMED List
                if not SNOMED.has_key(TopicCUIL1[iPCN].strip()):
                    # Remove item from the list
                    TopicSemL1.remove(iCUI)
                    TopicCUIL1.remove(TopicCUIL1[iPCN])
                    TopicPCNL1.remove(TopicPCNL1[iPCN])
            iPCN += 1

        TopicPCN = ';'.join(TopicPCNL1[0:]).strip()
        TopicCUI = ';'.join(TopicCUIL1[0:]).strip()
        TopicSem = ';'.join(TopicSemL1[0:]).strip()

        # ----------------------------
        phenCategory = []

        # RULE STARTING

        ### Type: Demographic
        #if mapList(PhenDesc.lower(),Demographics) or len(Theme)>0:
        #if Theme!='NULL' and len(ThemePCN)>0:
        #	if not 'Demographics' in phenCategory:
        #		phenCategory.append('Demographics')

        # Modified on July 11, 2014
        Patient = ['C0030705', 'C0679646', 'C0681850']
        if Theme != 'NULL' and len(ThemePCN) > 0:
            if mapList(SOICUI, Patient):
                phenCategory.append('Demographics Patient')
            else:
                phenCategory.append('Demographics Family')

        ### Type: Medication
        MedPatient = ['C0030705', 'C0679646', 'C0681850']
        if mapList(SOICUI,
                   MedPatient) and (mapList(TopicSem, ['phsu']) or
                                    PhenDesc.lower().find('medication') >= 0):
            if not 'Medication Patient' in phenCategory:
                phenCategory.append('Medication Patient')

        if not mapList(SOICUI, MedPatient) and (
                mapList(TopicSem, ['phsu'])
                or PhenDesc.lower().find('medication') >= 0):
            if not 'Medication Family' in phenCategory:
                phenCategory.append('Medication Family')

        ### Type: Lab Test
        LabTest = ['C0030705', 'C0679646', 'C0681850']
        if mapList(SOICUI, LabTest) and mapList(TopicSem, ['lbpr']):
            if not 'Lab Tests Patient' in phenCategory:
                phenCategory.append('Lab Tests Patient')

        if not mapList(SOICUI, LabTest) and mapList(TopicSem, ['lbpr']):
            if not 'Lab Tests Family' in phenCategory:
                phenCategory.append('Lab Tests Family')

        ### Type: Mental or Emotional Finding
        MentalFinding = ['menp']
        if mapList(SOICUI, LabTest) and mapList(TopicSem, MentalFinding):
            if not 'Mental or Emotional Finding' in phenCategory:
                phenCategory.append('Mental or Emotional Finding')

        if not mapList(SOICUI, LabTest) and mapList(TopicSem, MentalFinding):
            if not 'Mental or Emotional Finding Family' in phenCategory:
                phenCategory.append('Mental or Emotional Finding Family')

        ### Type: Smoking History
        SmokingHistory = [
            "smoke", "smoking", "smoker", "tobacco", "cigarette", "pipe",
            "cigar", "nicotine"
        ]

        if mapList(SOICUI, LabTest) and mapList(PhenDesc.lower(),
                                                SmokingHistory):
            if not 'Smoking History' in phenCategory:
                phenCategory.append('Smoking History')

        if not mapList(SOICUI, LabTest) and mapList(PhenDesc.lower(),
                                                    SmokingHistory):
            if not 'Smoking History Family' in phenCategory:
                phenCategory.append('Smoking History Family')

        ### Type: Drinking History
        DrinkingEx = ["C0337676", "C0337677", "C0337679"]
        DrinkingHistory = [
            "drink", "drinker", "alcohol", "liquor", "drunk", "beer", "wine",
            "drinking"
        ]

        if mapList(
                SOICUI,
                LabTest) and PhenDesc.lower().find('drinking function') == -1:
            if mapList(PhenDesc.lower(),
                       DrinkingHistory) and not mapList(TopicCUI, DrinkingEx):
                if not 'Drinking History' in phenCategory:
                    phenCategory.append('Drinking History')

        if not mapList(SOICUI, LabTest) and PhenDesc.lower().find(
                'drinking function') == -1:
            if mapList(PhenDesc.lower(),
                       DrinkingHistory) and not mapList(TopicCUI, DrinkingEx):
                if not 'Drinking History Family' in phenCategory:
                    phenCategory.append('Drinking History Family')

        ### Type: Substance Use History
        SubstanceUseHistory = [
            "cocaine", "opiate", "stimulant", "marijuana", "pot", "cannabis"
        ]

        ExSubstance = [
            'smoke', 'smoking', 'smoker', 'tobacco', 'cigarette', 'pipe',
            'cigar', 'nicotine'
        ]

        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), SubstanceUseHistory) or mapList(
                    TopicSem, ['hops']):
                if not mapList(PhenDesc, ExSubstance):
                    if not 'Substance Use History' in phenCategory:
                        phenCategory.append('Substance Use History')

        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), SubstanceUseHistory) or mapList(
                    TopicSem, ['hops']):
                if not mapList(PhenDesc, ExSubstance):
                    if not 'Substance Use History Family' in phenCategory:
                        phenCategory.append('Substance Use History Family')

        ### Type: Eating or Nutritional Finding
        Eating = ["food", "vitamin", "nutrition", "water"]

        if mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['food']) or mapList(PhenDesc.lower(),
                                                      Eating):
                if not 'Eating or Nutritional Finding' in phenCategory:
                    phenCategory.append('Eating or Nutritional Finding')

        if not mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['food']) or mapList(PhenDesc.lower(),
                                                      Eating):
                if not 'Eating or Nutritional Finding Family' in phenCategory:
                    phenCategory.append('Eating or Nutritional Finding Family')

        ### Type: Self-care Status
        Selfcare = [
            "selfcare", "self care", "self-care", "dressing", "grooming",
            "bathing", "eating", "toileting", "hygiene"
        ]

        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Selfcare):
                if not 'Self-care Status' in phenCategory:
                    phenCategory.append('Self-care Status')

        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Selfcare):
                if not 'Self-care Status' in phenCategory:
                    phenCategory.append('Self-care Status Family')

        ### Type: Healthcare Activity Finding
        Healthcare = [
            "medical care", "hospital", "appointment", "follow up", "f/u",
            "follow-up", "visit", "encounter", "service"
        ]

        #if mapList(SOICUI,LabTest):
        #	if mapList(TopicSem,['hlca']) or mapList(PhenDesc.lower(),Healthcare):
        #		if not 'Healthcare Activity Finding' in phenCategory:
        #			phenCategory.append('Healthcare Activity Finding')

        #if not mapList(SOICUI,LabTest):
        #	if mapList(TopicSem,['hlca']) or  mapList(PhenDesc.lower(),Healthcare):
        #		if not 'Healthcare Activity Finding Family' in phenCategory:
        #			phenCategory.append('Healthcare Activity Finding Family')

        if mapList(PhenDesc.lower(), Healthcare):
            if not 'Healthcare Encounter' in phenCategory:
                phenCategory.append('Healthcare Encounter')

        ## Type: Therapeutic or Preventive Procedure
        if mapList(SOICUI, LabTest) and mapList(TopicSem, ['topp']):
            if not 'Therapeutic or Preventive Procedure' in phenCategory:
                phenCategory.append('Therapeutic or Preventive Procedure')

        if not mapList(SOICUI, LabTest) and mapList(TopicSem, ['topp']):
            if not 'Therapeutic or Preventive Procedure Family' in phenCategory:
                phenCategory.append(
                    'Therapeutic or Preventive Procedure Family')

        ### Type: Clinical Attributes

        ClinicalAttL = [
            "gestational age", "basal metabolic rate", "body surface area",
            "blood pressure", "body mass index", "body weight",
            "diastolic blood pressure", "heart rate", "height",
            "respiration rate", "systolic blood pressure", "temperature",
            "temperature, pulse, respiration", "weight", "vital sign",
            "body temperature", "pulse rate", "systolic pressure",
            "diastolic pressure", "resting pressure", "pulse pressure",
            "heartbeat", "birth weight", "body fat distribution", "adiposity",
            "waist circumference", "waist-hip ratio", "head circumference",
            "chest circumference", "pulse", "respiratory depth",
            "pulse deficit", "pain", "oxygen saturation", "pupil size",
            "pupil equality", "pupil reactivity to light", "pulse oximetry",
            "diameter", "perimeter", "systolic", "diastolic", "visual acuity"
        ]

        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ClinicalAttL) and not mapList(
                    PhenDesc.lower(), ['weighting', 'weighted']):
                if not 'Clinical Attributes' in phenCategory:
                    phenCategory.append('Clinical Attributes')

        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ClinicalAttL):
                if not 'Clinical Attributes Family' in phenCategory and not mapList(
                        PhenDesc.lower(), ['weighting', 'weighted']):
                    phenCategory.append('Clinical Attributes Family')

        ### Type: Research Attributes
        ResearchTerms = [
            "control group", "control status", "case", "case control",
            "case-control", "protocol"
        ]

        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ResearchTerms) or mapList(
                    TopicSem, ['resa']):
                if not 'Research Attributes' in phenCategory:
                    phenCategory.append('Research Attributes')

        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), ResearchTerms) or mapList(
                    TopicSem, ['resa']):
                if not 'Research Attributes Family' in phenCategory:
                    phenCategory.append('Research Attributes Family')

        ## REMOVE CO-OCCURENCE Types, e.g., Daily or Recreation Activity doesnot occurs with Clinical Attributes, Lab Test, Diagnostic Procedure

        # If Diagnostic Procedure co-occurs with Clinical Attribute, then ignore.
        ### Type: Diagnostic Procedure
        Diagnosis = [
            'ecg', 'electrocardiogram', 't wave', 't-wave', 'wave feature',
            'qrs', 'rr interval', 'r wave', 'p wave', 'q duration', 's wave'
        ]
        if mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['diap']) or mapList(PhenDesc.lower(),
                                                      Diagnosis):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory:
                    if not 'Diagnostic Procedure' in phenCategory:
                        phenCategory.append('Diagnostic Procedure')

        if not mapList(SOICUI, LabTest):
            if mapList(TopicSem, ['diap']) or mapList(PhenDesc.lower(),
                                                      Diagnosis):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory:
                    if not 'Diagnostic Procedure Family' in phenCategory:
                        phenCategory.append('Diagnostic Procedure Family')

        ### Type: Daily or Recreational Activity
        Activity = [
            "gait", "walking", "exercise", "sport", "workout", "gambling",
            "sleep", "toilet", "chore", "stand", "eat out"
        ]

        if mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Activity) or mapList(
                    TopicSem, ['dora']):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory and not 'Lab Test' in phenCategory and not 'Lab Test Family' in phenCategory and not 'Diagnostic Procedure' in phenCategory and not 'Diagnostic Procedure Family' in phenCategory:
                    if not 'Daily or Recreational Activity' in phenCategory:
                        phenCategory.append('Daily or Recreational Activity')

        if not mapList(SOICUI, LabTest):
            if mapList(PhenDesc.lower(), Activity) or mapList(
                    TopicSem, ['dora']):
                if not 'Clinical Attributes' in phenCategory and not 'Clinical Attributes Family' in phenCategory and not 'Lab Test' in phenCategory and not 'Lab Test Family' in phenCategory and not 'Diagnostic Procedure' in phenCategory and not 'Diagnostic Procedure Family' in phenCategory:
                    if not 'Daily or Recreational Activity Family' in phenCategory:
                        phenCategory.append(
                            'Daily or Recreational Activity Family')

        # IF Medical History co-occurs with any of (Daily or Recreational Activity, Eating or Nutritional Finding, Drinking History) then ignore (i.e., drop Medical History from the assigned types).

        ### Type: Medical History
        MedHist = [
            'dsyn', 'neop', 'sosy', 'acab', 'anab', 'biof', 'cgab', 'fndg',
            'inpo', 'orgf', 'patf', 'phsf', 'mobd'
        ]

        TopicL1 = TopicCUI.split(';')
        TopicS1 = TopicSem.split(';')
        idx1 = 0

        #print TopicS1

        for iTopic in TopicL1:
            if mapList(SOICUI, LabTest) and mapList(TopicS1[idx1], MedHist):
                if not 'Daily or Recreational Activity' in phenCategory and not 'Daily or Recreational Activity Family' in phenCategory and not 'Eating or Nutritional Finding' in phenCategory and not 'Eating or Nutritional Finding Family' in phenCategory and not 'Drinking History' in phenCategory and not 'Drinking History Family' in phenCategory:
                    if not 'Medical History' in phenCategory:
                        phenCategory.append('Medical History')

            if not mapList(SOICUI, LabTest) and mapList(
                    TopicS1[idx1], MedHist):
                if not 'Daily or Recreational Activity' in phenCategory and not 'Daily or Recreational Activity Family' in phenCategory and not 'Eating or Nutritional Finding' in phenCategory and not 'Eating or Nutritional Finding Family' in phenCategory and not 'Drinking History' in phenCategory and not 'Drinking History Family' in phenCategory:
                    if not 'Medical History Family' in phenCategory:
                        phenCategory.append('Medical History Family')
            idx1 += 1

        # End the rules
        # =========================================================================

        # PRINT OUT THE MAPPING

        phenCatStr = ';'.join(phenCategory[0:])
        #print item
        #print "======="
        #print phenCatStr
        #print "======="

        # Print to Excel file
        ExcelOut = '\t'.join(item[0:]).strip() + '\t' + phenCatStr

        # Print to text file
        #ExcelOut = ':::'.join(item[0:]).strip() + ':::' + TopicPCN + ':::' + TopicCUI + ':::' + TopicSem + ':::' +  phenCatStr

        print ExcelOut

    fin.close()
Example #45
0
 def hash(self, url, title, html):
     """Return an md5 fingerprint of a page built from domain, title and body text."""
     body = "" if html is None else html
     parts = [
         str(self.urlParser.getDomain(url)),
         str(title),
         html2text.html2text(body),
     ]
     return md5("_".join(parts))
# Render the quota-warning e-mail: prefer the operator-supplied template from
# the QW_HTML config key, falling back to the bundled default template when it
# is absent or fails to parse.
if r.get('QW_HTML'):
    try:
        template = Template(r.get('QW_HTML'))
    # Fixed: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; template parse errors are ordinary exceptions.
    except Exception:
        print(
            "Error: Cannot parse quarantine template, falling back to default template."
        )
        with open('/templates/quota.tpl') as file_:
            template = Template(file_.read())
else:
    with open('/templates/quota.tpl') as file_:
        template = Template(file_.read())

# HTML body plus a plain-text alternative derived from it.
html = template.render(username=username, percent=percent)
text = html2text.html2text(html)

try:
    msg = MIMEMultipart('alternative')
    msg['From'] = r.get('QW_SENDER') or "quota-warning@localhost"
    msg['Subject'] = r.get('QW_SUBJ') or "Quota warning"
    msg['Date'] = formatdate(localtime=True)
    text_part = MIMEText(text, 'plain', 'utf-8')
    html_part = MIMEText(html, 'html', 'utf-8')
    msg.attach(text_part)
    msg.attach(html_part)
    msg['To'] = username
    p = Popen([
        '/usr/lib/dovecot/dovecot-lda', '-d', username, '-o',
        '"plugin/quota=maildir:User quota:noenforcing"'
    ],
Example #47
0
    rows = cur.fetchall()

    for row in rows:
        entry_date = row['Modified']

        entry = dayonelib.DayOneEntry()
        entry.time = entry_date
        entry.tags = ['pagico', 'interaction']

        # unseralize the body of the note
        row_content = loads(row['Content'])

        # entry body text
        entry_text = "%s" % (row_content['Body'])
        entry_text = html2text.html2text(entry_text)

        # entry title
        entry_title = row_content['Title']

        # All notes on a contact will have a parent. Skip anything without a parent
        if row["ParentID"] is not None:
            # Get contact info
            parent_query = 'SELECT * FROM mach WHERE UID="%s"' % (
                row['ParentID'])
            cur.execute(parent_query)
            parent = cur.fetchone()
            parent_content = loads(parent['content'])

            # Make sure the parent is a contact(type Profile)
            if parent['Type'] == 'Profile':
Example #48
0
def get_mark(html):
    """Convert *html* to markdown text and render it via mdv."""
    markdown = ht.html2text(html)
    return mdv.main(markdown)
Example #49
0
import urllib.request as urllib
import bs4
import html2text

url = 'https://de.finance.yahoo.com/q/ao?s=GILD'

# Fetch the quote page and parse it so we can extract the analyst
# recommendation ("Empfehlung") from the rendered text.
beautiful = urllib.urlopen(url).read()
soup = bs4.BeautifulSoup(beautiful, 'lxml')

'''
with open('out.txt', 'w') as f:
    f.write(soup.prettify())
'''

txt = html2text.html2text(soup.get_text())
str1 = "Empfehlung (diese Woche):"  # label preceding the rating value
len_val = 3  # number of characters the rating value occupies

# Fixed: the original first assignment had an unbalanced '(' which made the
# whole script a SyntaxError. Hoist the find() so it runs once.
start = txt.find(str1)
str1_and_value = txt[start:start + len(str1) + len_val]
str1_value = txt[start + len(str1):start + len(str1) + len_val]

Example #50
0
 def summary_txt(self):
     """Plain-text summary: individual communications are already plain
     text; everything else is converted from HTML."""
     if self.communication_type != "individual":
         return html2text.html2text(self.summary)
     return self.summary
            url = Goldcoasturl
            vm = "Mtippett-vm2"
        elif City == "Adelaide":
            url = Adelaideurl
            vm = "Mtippett-vm3"
        elif City == "Hobart":
            url = Hobarturl
            vm = "Apac-intern-vm3"
        elif City == "Melbourne":
            url = Melbourneurl
            vm = "Apac-intern-vm3"
        elif City == "Perth":
            url = Perthurl
            vm = "Mtippett-vm1"

        filename = folder + "/" + City + "/" + datafile + ".txt"
        if not os.path.exists(filename):
            print "Copying from " + url + " to " + filename
            html2text.BODY_WIDTH = 0  # displays word wrapping so no line breaks are made in the middle of a line.
            html_content = urllib2.urlopen(url).read()  # read the html page
            rendered_content = html2text.html2text(
                html_content
            )  # convert it to text and save to variable rendered_content
            f = open(filename, 'w')
            f.write(rendered_content)  #save the txt file
            f.close()
            copyfile(
                filename,
                "//" + vm + "//c$//Users//thanh//Desktop//Weather_Data//" +
                City + "//" + datafile + ".txt")
Example #52
0
def test_function(fn, func_args):
    """Convert the HTML file *fn* with html2text (passing *func_args*
    through as keyword options) and compare against its stored baseline."""
    with open(fn) as handle:
        converted = html2text.html2text(handle.read(), **func_args)
    expected = get_baseline(fn)
    assert expected == converted
Example #53
0
from urllib import request

from bs4 import BeautifulSoup
from html2text import html2text


def retrieve_article(link):
    """Download *link* and return the <article> node inside its #content element."""
    page = request.urlopen(link).read().decode('utf-8')
    parsed = BeautifulSoup(page, 'html.parser')
    return parsed.body.find(id="content").article


# Fixed: `re` and `pathlib` were used below but never imported, so the
# script died with NameError on first use.
import pathlib
import re

# if a link matches Kata{0-9}{0-9} (raw string avoids invalid-escape warnings)
grammar = re.compile(r"Kata\d\d")

# root of kata links
html_doc = request.urlopen("http://codekata.pragprog.com/").read().decode(
    'utf-8')
soup = BeautifulSoup(html_doc, 'html.parser')
links = {
    link.get("href")
    for link in soup.find_all('a') if grammar.findall(str(link))
}

# retrieve the text on a kata site and write it as a file
for link in links:
    uri = link.split("/")[-2]
    pathlib.Path(uri).mkdir(exist_ok=True)
    with open("{0}/kata.md".format(uri), "w") as kata:
        kata.write(html2text(str(retrieve_article(link))))
Example #54
0
#!/usr/bin/env python

import sys

from definitions import Definitions
from html2text import html2text

title = sys.argv[1]

definition = Definitions.article(title)
txt = html2text(definition)
print txt.encode('utf-8')

 def to_text(self):
     """Plain-text rendering of this object's HTML content."""
     markup = self.html
     return html2text.html2text(markup)
Example #56
0
# Download and parse the article, exiting with a distinct message for each
# stage so failures are distinguishable in logs.
try:
    article.download()
# Fixed: was a bare `except:`, which also swallows SystemExit and
# KeyboardInterrupt; download/parse errors are ordinary exceptions.
except Exception:
    print("Failed when downloading")
    sys.exit(2)

try:
    article.parse()
except Exception:
    print("Error during parsing article")
    sys.exit(2)

#soup = BeautifulSoup(article.article_html, 'html.parser')
#soup.find('div', id="header").decompose()

# Convert the extracted article HTML to markdown-ish plain text.
data = html2text.html2text(article.article_html)

# print(article.text)
# print(article.title)

print('Writing to post.md')

# NOTE: fd is intentionally left open; later code keeps writing to it.
fd = open('post.md', 'w')

if (article.title):
    fd.write(article.title)
    fd.write("\n\n")

# Attribution line linking back to the source site.
baseURL = getBaseURL(webURL)
viaURL = 'via [' + baseURL + '](' + webURL + ')'
Example #57
0
 async def server(self, ctx, ip="opmines.net"):
     """Discord command: report a Minecraft server's status as an embed.

     Queries api.mcsrvstat.us for *ip* (default opmines.net) and posts an
     embed with online status, player counts, a player list with staff
     members starred, connection details and the MOTD.

     NOTE(review): an earlier aiohttp-based fetch was left in this
     docstring as dead code; the requests.get() below is synchronous and
     blocks the event loop while the HTTP call runs -- consider aiohttp.
     """
     # Fetch the raw status JSON; the print() calls are debug output.
     st = requests.get("https://api.mcsrvstat.us/1/" + ip)
     print(st)
     st = st.text
     print(st)
     serverdata = json.loads(st)
     players = ""   # comma-separated, markdown-formatted player names
     p = 0          # number of players collected into `players`
     hasp = False   # True once the response contains a player list
     if "players" in serverdata:
         if "list" in serverdata['players']:
             hasp = True
             for member in serverdata['players']['list']:
                 p += 1
                 if len(players) != 0:
                     players += " , "
                 # Staff accounts (module-level `opstaff`; presumably
                 # lowercase names -- confirm) are rendered bold + star.
                 if member.lower() in opstaff:
                     players += "**" + member + "** :star:"
                 else:
                     players += member
     embed = discord.Embed(title="Server Status")
     if ip == "opmines.net":
         embed.set_thumbnail(
             url="http://files.enjin.com/780854/opmines%20logo%20222.png")
     # NOTE(review): assumes the API adds an "offline" key only when the
     # server is down -- confirm against api.mcsrvstat.us v1 schema.
     embed.add_field(
         name="Status",
         value="Online" if not "offline" in serverdata else "Offline")
     if hasp == True:
         # Occupancy percentage, truncated to one decimal place.
         per = math.floor((serverdata['players']['online'] /
                           serverdata['players']['max']) * 1000) / 10
         embed.add_field(name="Members Online",
                         value=str(serverdata['players']['online']) +
                         " / " + str(serverdata['players']['max']) + " (" +
                         str(per) + "%)")
     # `version` may be absent from the response; fall back to "?".
     try:
         embed.add_field(name="Connection Stats",
                         value=serverdata['ip'] + ":" +
                         str(serverdata['port']) + " / " +
                         serverdata['version'],
                         inline=True)
     except:
         embed.add_field(name="Connection Stats",
                         value=serverdata['ip'] + ":" +
                         str(serverdata['port']) + " / ?",
                         inline=True)
     # The MOTD arrives as HTML: convert it to text, strip leftovers, and
     # escape Discord markdown characters. Order of replaces matters (the
     # "\\" strip must run before the escaping inserts backslashes).
     try:
         embed.add_field(name="MOTD",
                         value=html2text.html2text(
                             serverdata['motd']['html'][0]).replace(
                                 "\\", "").replace("</span>", "").replace(
                                     "\n", "").replace("*", "\\*").replace(
                                         "_", "\\_").replace("~", "\\~"))
     except:
         embed.add_field(name="MOTD", value="?")
     # add_field can fail here (presumably on an empty/oversized player
     # list -- confirm); fall back to a placeholder field.
     try:
         embed.add_field(name="Player List (" + str(p) + ")", value=players)
     except:
         embed.add_field(name="Player List (unable to retrieve)",
                         value="This server may be offline")
     await ctx.send(embed=embed)
    def get_items(self, tree, vulns, test):
        """Build deduplicated Finding objects from a parsed scanner XML tree.

        @param tree  parsed XML root element, or None
        @param vulns vulnerability lookup passed through to parse_tests_type
        @param test  the Test instance each Finding is attached to
        @return a list of Finding instances, one per (severity, name) pair,
                with endpoints attached and duplicate descriptions merged
        """

        # First pass: flatten the XML into a list of host dicts, each with
        # name, hostnames, os and a list of service dicts.
        x = list()
        if tree is None:
            return x
        for nodes in tree.iter('nodes'):
            "in nodes"  # NOTE(review): bare string literal, a no-op statement
            for node in nodes.iter('node'):
                host = dict()
                host['name'] = node.get('address')
                host['hostnames'] = set()
                host['os'] = ""
                host['services'] = list()
                # host['vulns'] = self.parse_tests_type(node, vulns)

                for names in node.iter('names'):
                    for name in list(names):
                        host['hostnames'].add(name.text)

                for endpoints in node.iter('endpoints'):
                    for endpoint in list(endpoints):
                        svc = {
                            'protocol': endpoint.get('protocol'),
                            'port': endpoint.get('port'),
                            'status': endpoint.get('status'),
                        }
                        for services in endpoint.iter('services'):
                            for service in list(services):
                                svc['name'] = service.get('name')
                                svc['vulns'] = self.parse_tests_type(
                                    service, vulns)

                                # NOTE(review): stores the configuration
                                # *name*, not its value -- looks unintended;
                                # confirm against the scanner's XML schema.
                                for configs in service.iter('configurations'):
                                    for config in list(configs):
                                        if "banner" in config.get('name'):
                                            svc['version'] = config.get('name')

                        # NOTE(review): if an endpoint has no <services>
                        # children, svc never gets a 'vulns' key and the
                        # second pass below would raise KeyError -- confirm
                        # the input always provides services.
                        host['services'].append(svc)

                x.append(host)

        # Second pass: turn per-service vulns into Findings, keyed by
        # severity label + name so duplicates merge their descriptions.
        dupes = {}

        for item in x:
            for service in item['services']:
                for vuln in service['vulns']:
                    # Map the numeric severity back to its label; `sev`
                    # keeps the last iterated label if no match is found.
                    for sev, num_sev in Finding.SEVERITIES.items():
                        if num_sev == vuln['severity']:
                            break

                    dupe_key = sev + vuln['name']

                    if dupe_key in dupes:
                        # Already seen: append this plugin output to the
                        # existing Finding's description, once.
                        find = dupes[dupe_key]
                        dupe_text = html2text.html2text(vuln['pluginOutput'])
                        if dupe_text not in find.description:
                            find.description += "\n\n" + dupe_text
                    else:
                        # Expand shorthand CA/CVE references into full URLs;
                        # the first two refs are skipped (refs[0] is used as
                        # the impact below).
                        refs = ''
                        for ref in vuln['refs'][2:]:
                            if ref.startswith('CA'):
                                ref = "https://www.cert.org/advisories/" + ref + ".html"
                            elif ref.startswith('CVE'):
                                ref = "https://cve.mitre.org/cgi-bin/cvename.cgi?name=" + ref
                            refs += ref
                            refs += "\n"
                        find = Finding(
                            title=vuln['name'],
                            description=html2text.html2text(
                                vuln['desc'].strip()) + "\n\n" +
                            html2text.html2text(vuln['pluginOutput'].strip()),
                            severity=sev,
                            numerical_severity=Finding.get_numerical_severity(
                                sev),
                            mitigation=html2text.html2text(vuln['resolution']),
                            impact=vuln['refs'][0],
                            references=refs,
                            test=test,
                            active=False,
                            verified=False,
                            false_p=False,
                            duplicate=False,
                            out_of_scope=False,
                            mitigated=None,
                            dynamic_finding=True)
                        find.unsaved_endpoints = list()
                        dupes[dupe_key] = find

                    # Attach the host by address and by every hostname...
                    find.unsaved_endpoints.append(
                        Endpoint(host=item['name'],
                                 product=test.engagement.product))
                    for hostname in item['hostnames']:
                        find.unsaved_endpoints.append(
                            Endpoint(host=hostname,
                                     product=test.engagement.product))
                    # ...and host:port for every service with vulns.
                    # NOTE(review): this loop shadows the outer `service`
                    # variable -- confirm that is intentional.
                    for service in item['services']:
                        if len(service['vulns']) > 0:
                            find.unsaved_endpoints.append(
                                Endpoint(
                                    host=item['name'] + (":" + service['port'])
                                    if service['port'] is not None else "",
                                    product=test.engagement.product))

        return list(dupes.values())
      .replace("{date}",datetime.datetime.today().strftime("%d/%m/%Y"))\
      .replace("{b64email}",base64.b64encode(email))\
      .replace("{b64remail}",base64.b64encode(email)[::-1])

    if re.search("{randomint}", body):
        ri = random.randint(1, 9999999)
        print "Random integer: " + email + " : " + str(ri)
        body = body.replace("{randomint}", str(ri))
        randomints = True
        fp = open(intsfile, "a")
        re.write(email + ":" + str(ri))
        fp.close()

    msg.attach(MIMEText(body, "html"))
    if args.text:
        msg.attach(MIMEText(html2text.html2text(body), 'plain'))

    # Find any embedded images and attach
    attachments = re.findall('src="cid:([^"]+)"', body)
    for attachment in attachments:
        fp = open(attachment, "rb")
        img = MIMEImage(fp.read())
        fp.close()
        img.add_header('Content-ID', attachment)
        msg.attach(img)

    # Optional attachment
    if args.attachment:
        filename = os.path.basename(args.attachment)
        part = MIMEBase('application', "octet-stream")
        part.set_payload(open(args.attachment, "rb").read())
 def convert_html_to_text(result_txt):
     """Decode a bytes payload as UTF-8 and strip its HTML down to plain text."""
     raw_bytes = b'%s' % (result_txt,)
     decoded = raw_bytes.decode('utf-8')
     return html2text.html2text(decoded)