def add_to_log(self, action, info=None, level="info"):
    log = {}
    log['action'] = strip_tags(action)
    log['info'] = strip_tags(info)
    log['level'] = strip_tags(level)
    log['created'] = int(time.time())
    self.db.logs.insert(log, safe=True)
def add_to_log(self, action, info=None, level="info"):
    log = {}
    log["action"] = strip_tags(action)
    log["info"] = strip_tags(info)
    log["level"] = strip_tags(level)
    log["created"] = int(time.time())
    self.db.logs.insert(log)
def make_sheet_list_by_tag():
    """ Returns an alphabetized list of tags and sheets included in each tag. """
    tags = {}
    results = []
    sheet_list = db.sheets.find({"status": {"$in": LISTED_SHEETS}})
    for sheet in sheet_list:
        sheet_tags = sheet.get("tags", [])
        for tag in sheet_tags:
            if tag not in tags:
                tags[tag] = {"tag": tag, "count": 0, "sheets": []}
            tags[tag]["sheets"].append({"title": strip_tags(sheet["title"]), "id": sheet["id"], "views": sheet["views"]})
            tags[tag]["count"] += 1
    for tag in tags.values():
        tag["sheets"] = sorted(tag["sheets"], key=lambda x: -x["views"])
        results.append(tag)
    results = sorted(results, key=lambda x: x["tag"])
    return results
def main():
    users = mongo.get_users(collection)
    for user in users:
        chat_id = user['user_id']
        oauth = user['lepra_oauth']
        feed_type = user.get('feed_type', 'main')
        threshold_rating = user.get('threshold_rating', 'easy')
        markpost_read = user.get('markpost_read', 'false')
        feed = get_feed(oauth, feed_type, threshold_rating)
        if not feed:
            continue
        if feed == 'deny':
            telegram_bot.get_user_oauth(chat_id, client_id, bot)
            config.logger.error("Some auth error. User {}, move to prepare".format(chat_id))
            mongo.user_to_prepare(chat_id, collection)
            continue
        for key in feed:
            for post in feed[key]:
                send_to_user = ''
                post_id = post['id']
                config.logger.error("User id: {}".format(chat_id))
                config.logger.error("Post id: {}".format(post_id))
                read = mongo.check_lepra_post(post_id, chat_id, posts_collection)
                if read:
                    config.logger.error("User {} already read post: {}".format(chat_id, post_id))
                    continue
                for key in post:
                    if key == 'body':
                        data = post[key]
                        data = util.strip_tags(data)
                        send_to_user = send_to_user + data + '\n'
                    elif key == '_links':
                        data = post[key][0]['href']
                        send_to_user = send_to_user + data
                if send_to_user:
                    config.logger.error("Send post {} to user {}".format(post_id, chat_id))
                    time.sleep(1)
                    result = telegram_bot.send_message(send_to_user, 'text', bot, chat_id)
                    if result:
                        config.logger.error("result is: {}".format(result))
                        if result == 'ban':
                            config.logger.error("User {} blocked bot, move to prepare".format(chat_id))
                            mongo.user_to_prepare(chat_id, collection)
                            continue
                    else:
                        mongo.add_to_lepra_posts(post['id'], chat_id, posts_collection)
                        if markpost_read == 'true':
                            markpost_as_read(post_id, oauth)
def from_string(search_str):
    """
    Takes a candidate string and extracts out the name(s) in list form
    >>> string = 'By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV'
    >>> authors_from_string(string)
    ['Brian Abelson', 'Michael H Keller', 'DR Stijn Debrouwere IV']
    """
    # set initial counter
    initial_count = 0

    # clean string
    search_str = strip_tags(search_str)
    search_str = re_by.sub('', search_str)
    search_str = search_str.strip()

    # tokenize
    name_tokens = [s.strip() for s in re_name_token.split(search_str)]

    _authors, authors = [], []
    curname = []  # List of first, last name tokens

    for token in name_tokens:
        # check if the length of the name
        # and the token suggest an initial
        if is_initial(curname, token):
            # upper case initial & increment
            token = token.upper()
            initial_count += 1

        # if we're at a delimiter, check if the name is complete
        if token.lower() in DELIM:
            # check valid name based on initial count
            if end_name(curname, initial_count):
                _authors.append(' '.join(curname))
                # reset
                initial_count = 0
                curname = []

        # otherwise, append token
        elif not re_digits.search(token):
            curname.append(token)

    # One last check at end
    valid_name = (len(curname) >= MIN_NAME_TOKENS)
    if valid_name:
        _authors.append(' '.join(curname))

    return format_authors(_authors)
def run_trancxu(self, args):
    if not args:
        return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("tranĉu")
    word = urllib.quote(util.x_to_unicode(args))
    url = config.sivo_search % ("ser%c4%89o", word)
    html = util.get_html(url)
    html = re.search(r"<h2>Vortfarada Serĉo</h2>(.+?)<h2>", html, re.S).group(1)
    if "Neniu trovita" in html:
        return 'Nenio trovita por "%s".' % args
    else:
        ret = [util.strip_tags(line) for line in html.splitlines() if "<li>" in line]
        return "\n".join(ret)
def _displayEntry(self, index):
    entry = self.container.items[index-1]
    urls = util.find_urls(entry.content)
    title = util.unescape(entry.title).replace("\n", ' ').encode('utf-8')
    content = util.strip_tags(util.unescape(entry.content)).encode('utf-8')
    print title
    print content
    # uniqify the urls
    for i in list(set(urls)):
        print ''.join(i)
def test_strip_tags(self):
    self.assertEqual('', util.strip_tags(''))
    self.assertEqual('ac', util.strip_tags('a<b>c'))
    self.assertEqual('a<b', util.strip_tags('a<b'))
    self.assertEqual('a>b', util.strip_tags('a>b'))
    self.assertEqual('ace', util.strip_tags('a<b>c<d>e'))
    self.assertEqual('>ace<', util.strip_tags('>a<b>c<d>e<'))
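A minimal sketch of a tag stripper with the behaviour asserted above; this is an illustration only, and strip_tags_sketch is a hypothetical name, not the util.strip_tags implementation these projects actually use.

import re

def strip_tags_sketch(text):
    # Remove complete <...> tags only; a stray '<' or '>' that never forms a
    # full tag is left untouched, matching the assertions in the test above.
    return re.sub(r'<[^>]*>', '', text)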
def synopsis(self):
    """Scrapes the synopsis from the show's tvrage page using a regular
    expression. This method might break when the page changes. Unfortunately
    the synopsis isn't available via one of the xml feeds."""
    try:
        page = urlopen(self.link).read()
        try:
            summary = re.search(r'<div class="show_synopsis">(.*?)</div>', page,
                                re.MULTILINE | re.DOTALL).group(1)
            return unicode(strip_tags(summary), 'utf-8').strip()
        except Exception, e:
            print('Show.synopsis: %s, %s' % (self, e))
    except URLError, e:
        print('Show.synopsis:urlopen: %s, %s' % (self, e))
def summary(self):
    """Scrapes the plot summary from the episode's tvrage page using a regular
    expression. This method might break when the page changes. Unfortunately
    the episode summary isn't available via one of the xml feeds."""
    try:
        page = urlopen(self.link).read()
        if 'Click here to add a summary' not in page:
            try:
                summary = re.search(r"</script></div><div>(.*?)<br>", page,
                                    re.MULTILINE | re.DOTALL).group(1)
                return unicode(strip_tags(summary), 'utf-8').strip()
            except Exception, e:
                print('Episode.summary: %s, %s' % (self, e))
    except URLError, e:
        print('Episode.summary:urlopen: %s, %s' % (self, e))
def trans_majstro(self, fr, to, word):
    qword = urllib.quote(word)
    url = config.majstro_search % (fr, to, qword)
    html = util.get_html(url)
    if "could not be translated" in html:
        return 'Nenio trovita por "%s".' % word
    results = re.findall(r"<li>.+?</li>", html)
    ret = "\n".join(results)
    ret = util.strip_tags(ret)
    parser = HTMLParser.HTMLParser()
    ret = ret.decode('utf-8')
    ret = parser.unescape(ret)
    if type(ret) == unicode:
        ret = ret.encode('utf-8')
    ret = re.sub(": ", " → ", ret)
    ret = re.sub("; ", ", ", ret)
    return ret
def source_text(source):
    """ Recursive function to translate a source dictionary into text. """
    content = [
        source.get("customTitle", ""),
        source.get("ref", ""),
        source.get("text", {"he": ""}).get("he", ""),
        source.get("text", {"en": ""}).get("en", ""),
        source.get("comment", ""),
        source.get("outside", ""),
    ]
    content = [strip_tags(c) for c in content]
    text = " ".join(content)
    if "subsources" in source:
        for s in source["subsources"]:
            text += source_text(s)
    return text
def source_text(source):
    """ Recursive function to translate a source dictionary into text. """
    content = [
        source.get("customTitle", ""),
        source.get("ref", ""),
        source.get("text", {"he": ""})["he"],
        source.get("text", {"en": ""})["en"],
        source.get("comment", ""),
        source.get("outside", ""),
    ]
    content = [strip_tags(c) for c in content]
    text = " ".join(content)
    if "subsources" in source:
        for s in source["subsources"]:
            text += source_text(s)
    return text
def main():
    # print(collection)
    users = mongo.get_users(collection)
    # print(type(bot))
    for user in users:
        # print(user)
        chat_id = user['user_id']
        oauth = user['lepra_oauth']
        feed_type = user.get('feed_type', 'main')
        threshold_rating = user.get('threshold_rating', 'easy')
        feed = get_feed(oauth, feed_type, threshold_rating)
        # print(feed)
        if feed == 'deny':
            telegram_bot.get_user_oauth(chat_id, client_id, bot)
            continue
        for key in feed:
            # print key
            for post in feed[key]:
                send_to_user = ''
                post_id = post['id']
                config.logger.debug("User id: {}".format(chat_id))
                config.logger.debug("Post id: {}".format(post_id))
                read = mongo.check_lepra_post(post_id, chat_id, posts_collection)
                if read:
                    config.logger.debug("User {} already read post: {}".format(chat_id, post_id))
                    continue
                for key in post:
                    if key == 'body':
                        data = post[key]
                        data = util.strip_tags(data)
                        send_to_user = send_to_user + data + '\n'
                    elif key == '_links':
                        data = post[key][0]['href']
                        send_to_user = send_to_user + data
                if send_to_user:
                    result = telegram_bot.send_message(send_to_user, 'text', bot, chat_id)
                    if result:
                        mongo.add_to_lepra_posts(post['id'], chat_id, posts_collection)
def html_context(self):
    """
    Sets self.anchortext and self.context, where the latter is the
    surrounding text of a link, often containing author, title,
    publication info; returns self.context

    There are three main cases:

    (1) The context we're looking for coincides with the content of a
        single DOM element (e.g. the <a> itself, or a <li>), possibly
        minus an abstract, which we can remove afterwards.

    (2) The context we're looking for coincides with the content of
        several DOM elements taken together (e.g. "<h4>Title</h4>
        <div>Forthcoming <a>Penultimate</a></div>"), possibly minus an
        abstract.

    (3) The context we're looking for is part of a DOM element that
        also contains contexts for other entries. E.g.,
        "<a>Paper1</a> Forthcoming<br> <a>Paper2</a>", or
        "<h4>Paper1</h4> Forthcoming <a>PDF</a> <h4>Paper2</h4> <a>PDF</a>",
        or "<h4><a>Paper1</a></h4> Forthcoming <h4><a>Paper2</a></h4>".

    To tell these apart, we first climb up the DOM tree until we reach
    an element that's too large to be a single paper entry (careful of
    abstracts here). If the element right below (call it el) has no
    further text than the link with which we started but there's
    neighbouring text not in a link, we assume we're in case (3); here
    we crudely divide el's parent by <br> or <h*> and return the
    content of the part surrounding el. To tell apart (1) and (2), we
    use some heuristics to determine whether el's context extends to
    its siblings: e.g., is there a gap between el and sibling? does
    the sibling also contain a link to a paper? etc.
    """
    if not self.element:
        raise Exception("need link element to extract html_context")
    self.anchortext = self.element.get_attribute('textContent').strip()
    debug(5, 'trying to find link context')

    # First climb up DOM until we reach an element (par) that's too large:
    el = self.element
    par = el.find_element_by_xpath('..')
    debug(5, 'starting with %s', el.get_attribute('outerHTML'))
    el._text = el.get_attribute('textContent')
    while True:
        debug(5, 'climbing up par: %s', par.get_attribute('outerHTML'))
        # check if parent has many links or other children
        par._links = par.find_elements_by_xpath('.//a')
        par._children = par.find_elements_by_xpath('./*')
        if len(par._links) > 3 or len(par._children) > 5:
            debug(5, 'stopping: too many links or children')
            break
        # List of drafts may only contain two papers, so we also check
        # if the previous element was already fairly large. (We'll
        # still treat such lists as a single context if the entries
        # are very short, but then that's not a serious problem
        # because we won't be misled by publication info that belongs
        # to another entry.)
        par._text = par.get_attribute('textContent')
        if len(el._text) > 70 and len(par._text) > len(el._text)*1.5:
            debug(5, 'stopping: enough text already (%s)', el._text)
            break
        try:
            gpar = par.find_element_by_xpath('..')
            el, par = par, gpar
        except Exception:
            break

    # If el has no further text than the link with which we started
    # but there's neighbouring text not in a link, we're in the messy
    # case (3):
    if len(el._text) - len(self.element._text) < 5:
        par._outerHTML = par.get_attribute('outerHTML')
        el._outerHTML = el.get_attribute('outerHTML')
        l, r = par._outerHTML.split(el._outerHTML, 2)
        if re.search(r'\w\s*$', l) or re.search(r'^\s*\w', r):
            debug(5, 'argh: case (3)')
            for pat in (r'<h\d.*?>', r'<br>\s*<br>', r'<br>'):
                parts = re.split(pat, par._outerHTML, flags=re.I)
                if len(parts) > 1:
                    break
            for part in parts:
                if el._outerHTML in part:
                    debug(5, 'surrounding part: %s', part)
                    return util.strip_tags(part)
            # we should never be here
            return el._text

    # Now try to figure out if siblings belong to context:

    def context_left(i):
        if par._children.index(el)-i < 0:
            # can't catch IndexError: careful of negative indices!
            return ''
        lsib = par._children[par._children.index(el)-i]
        lsib_outerHTML = lsib.get_attribute('outerHTML')
        debug(5, "add left sibling?: %s", lsib_outerHTML)
        if re.search(r'\.(?:pdf|docx?)\b', lsib_outerHTML, flags=re.I):
            debug(5, "no: contains link to pdf or doc")
            return ''
        lsib_height = int(lsib.get_attribute('offsetHeight'))
        lsib_text = lsib.get_attribute('textContent')
        if lsib_text.strip() == '' and lsib_height > 2:
            debug(5, "no: sibling has no text but takes up space")
            return ''
        lsib_bottom = lsib.location['y'] + lsib_height
        gap = par._children[par._children.index(el)-(i-1)].location['y'] - lsib_bottom
        if gap > 20 or (gap > 10 and len(context) > 20):
            debug(5, "no: too far away (%s)", gap)
            return ''
        debug(5, "yes, expanding context")
        return lsib_text

    def context_right(i):
        try:
            rsib = par._children[par._children.index(el)+i]
        except IndexError:
            return ''
        rsib_outerHTML = rsib.get_attribute('outerHTML')
        debug(5, "add right sibling?: %s", rsib_outerHTML)
        if re.search(r'\.(?:pdf|docx?)\b', rsib_outerHTML, flags=re.I):
            debug(5, "no: contains link to pdf or doc")
            return ''
        if (len(context) > 20 and
                not re.search(r'\d{4}|draft|forthcoming', rsib_outerHTML, flags=re.I)):
            # We're mainly interested in author, title, publication
            # info. The first two never occur after the link element
            # (unless that is very short: e.g. an icon), so we only
            # need to check for the third.
            debug(5, "no: doesn't look like publication info")
            return ''
        rsib_height = int(rsib.get_attribute('offsetHeight'))
        rsib_text = rsib.get_attribute('textContent')
        if rsib_text.strip() == '' and rsib_height > 2:
            debug(5, "no: sibling has no text but takes up space")
            return ''
        rsiblsib = par._children[par._children.index(el)+(i-1)]
        rsiblsib_bottom = rsiblsib.location['y'] + int(rsiblsib.get_attribute('offsetHeight'))
        gap = rsib.location['y'] - rsiblsib_bottom
        if gap > 20 or (gap > 10 and len(context) > 20):
            debug(5, "no: too far away (%s)", gap)
            return ''
        debug(5, "yes, expanding context")
        return rsib_text

    context = el.get_attribute('textContent')
    debug(5, "initial context: %s", context)
    for i in (1, 2, 3):
        more = context_right(i)
        if not more:
            break
        context += '\n' + more
    for i in (1, 2, 3, 4):
        more = context_left(i)
        if not more:
            break
        context = more + '\n' + context

    # tidy up slightly (mainly for easier testing):
    self.context = re.sub(r'\s*\n+\s*', r'\n', context).strip()
    return self.context
def do_post(balancer, content):
    balancer.read(strip_tags(content))
def test_strip_tags():
    src = "<html>\n<body><p>toto titi.</p>\n tata tutu.</html>"
    expected = "toto titi. tata tutu."
    assert_equals(util.strip_tags(src), expected)
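The test above also expects whitespace to be collapsed, so this project's helper presumably normalizes spacing after removing tags; a sketch under that assumption (strip_tags_and_collapse is a hypothetical name, not the real util.strip_tags):

import re

def strip_tags_and_collapse(html):
    text = re.sub(r'<[^>]*>', '', html)  # drop complete <...> tags
    return ' '.join(text.split())        # collapse newlines and runs of spaces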
def summary(self, length):
    text = util.strip_tags(self.content)
    return util.summarize(text, length)
as data_file:
    data = json.load(data_file)['docs']
    new_data = []
    for d in data:
        new = {}
        if 'categories' in d:
            cat = []
            for c in d['categories']:
                if 'name' in c:
                    cat.append(c['name'])
            new['rubrics'] = cat
        if 'headline' in d:
            new['headline'] = d['headline']
        if 'summary' in d:
            new['description'] = util.strip_tags(d['summary'])
        if 'address' in d:
            address = {}
            new['address'] = address
            if 'loc' in d['address']:
                point = dict(lat=d['address']['loc'][1], lng=d['address']['loc'][0])
                new['geometry'] = dict(point=point)
            address['street'] = d['address'].get('street')
            address['street_number'] = d['address'].get('number')
            address['street_type'] = d['address'].get('street_type')
            address['state'] = d['address'].get('state_name')
            address['country'] = 'Brazil'
            address['neighborhood'] = d['address'].get('neighborhood')
            address['city'] = d['address'].get('city')
def handle(self, *args, **options):
    Izsek.objects.all().delete()
    izseki = []
    for posnetek in Posnetek.objects.all():
        if posnetek.podnapisi:
            file_path = posnetek.podnapisi.path
            vseb = codecs.open(file_path, encoding='utf-8', errors='replace').read()
            stripped = strip_tags(vseb)
            # has_speakers = len(regex.findall(stripped)) > 0
            vsebina_split = sent_tokenize(stripped, language='slovene')
            # pprint(vsebina_split)
            last_cas = ""
            izsek = Izsek()
            izsek.posnetek = posnetek
            for split in vsebina_split:
                if split == "":
                    continue
                split = split.replace(u"WEBVTT", "")
                nov_cas = [b.split("-->") for b in split.split('\n') if "-->" in b]
                split_in = [b.strip() for b in split.split('\n') if b != "" and "-->" not in b]
                # pprint(nov_cas)
                if not last_cas:
                    nov_cas_a = nov_cas.pop(0)
                    last_cas = nov_cas_a[0]
                if not izsek.zacetek:
                    izsek.zacetek = last_cas
                sentence = " ".join(split_in).strip()
                if sentence.startswith((u"-", u"–")) or (len(sentence) + len(izsek.vsebina) > 500):
                    # if len(izsek.vsebina):
                    #     print izsek.zacetek, izsek.vsebina
                    izseki.append(izsek)
                    izsek = Izsek()
                    izsek.vsebina = sentence.upper()
                    izsek.zacetek = last_cas
                    izsek.posnetek = posnetek
                else:
                    izsek.vsebina += " " + sentence.upper()
                if nov_cas:
                    nov_cas_a = nov_cas.pop()
                    last_cas = nov_cas_a[0]
            if len(izsek.vsebina):
                izseki.append(izsek)
            # pprint(cas)
            if len(izseki) > 10000:
                Izsek.objects.bulk_create(izseki)
                izseki = []
    Izsek.objects.bulk_create(izseki)