def __init__(self): HTMLParser.__init__(self) self.withinlinkdiv = -1 self.current_value = 0 # 0 = nothing, 1 = votes, 2 = title this is used to # link the data and the tag self.tempdata = [0, '', '', ''] # votes, title, link temporary, comment link, before it is put # into the submission self.sublist = [] # sublist for the submissions
def __init__(self, ldomain, scandpth, lps): HTMLParser.__init__(self) self.url = ldomain self.db = {self.url: 1} self.node = [self.url] self.depth = scandpth self.max_span = lps self.links_found = 0
def __init__(self, idMembro, cvLattesXML): HTMLParser.__init__(self) # inicializacao obrigatoria self.idMembro = idMembro self.item = '' self.listaIDLattesColaboradores = [] self.listaFormacaoAcademica = [] self.listaProjetoDePesquisa = [] self.listaAreaDeAtuacao = [] self.listaIdioma = [] self.listaPremioOuTitulo = [] self.listaArtigoEmPeriodico = [] self.listaLivroPublicado = [] self.listaCapituloDeLivroPublicado = [] self.listaTextoEmJornalDeNoticia = [] self.listaTrabalhoCompletoEmCongresso = [] self.listaResumoExpandidoEmCongresso = [] self.listaResumoEmCongresso = [] self.listaArtigoAceito = [] self.listaApresentacaoDeTrabalho = [] self.listaOutroTipoDeProducaoBibliografica = [] self.listaSoftwareComPatente = [] self.listaSoftwareSemPatente = [] self.listaProdutoTecnologico = [] self.listaProcessoOuTecnica = [] self.listaTrabalhoTecnico = [] self.listaOutroTipoDeProducaoTecnica = [] self.listaProducaoArtistica = [] self.listaOASupervisaoDePosDoutorado = [] self.listaOATeseDeDoutorado = [] self.listaOADissertacaoDeMestrado = [] self.listaOAMonografiaDeEspecializacao = [] self.listaOATCC = [] self.listaOAIniciacaoCientifica = [] self.listaOAOutroTipoDeOrientacao = [] self.listaOCSupervisaoDePosDoutorado = [] self.listaOCTeseDeDoutorado = [] self.listaOCDissertacaoDeMestrado = [] self.listaOCMonografiaDeEspecializacao = [] self.listaOCTCC = [] self.listaOCIniciacaoCientifica = [] self.listaOCOutroTipoDeOrientacao = [] # inicializacao self.idLattes = '' self.url = '' self.foto = '' # feed it! # print cvLattesXML #.encode("utf8") self.feed(cvLattesXML)
def topTen(win, feedUrl, background, foreground): feed = urlopen(feedUrl).read() feedTitles = re.findall(r'<title>(.*?)</title>', feed) for index, title in enumerate(feedTitles[2:12]): parser = HTMLParser() text = parser.unescape(title) displayText = "{}. {}".format(index + 1, text) w = Label(win, text=displayText) w.config(bg=background, fg=foreground) w.pack() x = Label(win, text=feedUrl) x.config(bg=background, fg=foreground) x.pack() closure = lambda: save(feedTitles) Button(win, text="Save", command=closure).pack(side="right", expand=True, fill=X)
def run(self, edit): for s in self.view.sel(): if s.empty(): s = self.view.word(s) selected = unicode(self.view.substr(s)) import HTMLParser HTMLParser = HTMLParser.HTMLParser() selected = HTMLParser.unescape(selected) self.view.replace(edit, s, selected);
def top10(url, regex): #Download html code of web page html_code = download_HTML_code(url) #Find relevant data stored in html code _top10_ = findall(regex, html_code) #create a list that stores unicode string data top10_unescaped = [] h = HTMLParser() #Index of list counter = 0 for index in _top10_: counter= counter + 1 top10_unescaped.append(h.unescape(index)) #Dont take more than 10 elements to array if counter == 10: break #return top10 data list return top10_unescaped
def strip_tags(html): from HTMLParser import HTMLParser html=html.strip() html=html.strip("\n") result=[] parse=HTMLParser() parse.handle_data=result.append parse.feed(html) parse.close() return "".join(result)
def strip_tags(self, html): ''' 清洗标签 ''' html = html.strip() html = html.strip("\n") result = [] parser = HTMLParser() parser.handle_data = result.append parser.feed(html) parser.close() return ''.join(result).strip()
def __init__(self, rawDOIhtml): HTMLParser.__init__(self) self.dadosDaPublicacao = "" self.feed(rawDOIhtml)
u = urllib.urlopen(urlString) #print u.info() lParser.feed(u.read()) lParser.close() #--------------------------------- #this part displays all the links #--------------------------------- import urllib urllib.urlretrieve( 'http://www.comicsalliance.com/2011/03/11/best-art-ever-this-week-3-10-11/', '/tmp/CAmain.htm' ) from htmllib import HTMLParser #htmllib has been deprecated in favor or HTMLParser, HTMLParser has been renamed html.parser from formatter import NullFormatter parser= HTMLParser( NullFormatter( ) ) parser.feed( open( '/tmp/CAmain.htm' ).read( ) ) #import urlparse linecounter=0 for a in parser.anchorlist: # print urlparse.urljoin( 'http://python.org/', a ) if linecounter>105 and linecounter<145: print(a) linecounter+=1 """
def __init__(self): HTMLParser.__init__(self) self.started=True self.bad=False self.text=[] self.urls=[]
import time import logging import random import functools import os import tempfile import commonware.log import lockfile from polib import pofile from django.conf import settings from django.core.cache import get_cache log = commonware.log.getLogger('mdn.devmo.utils') htmlparser = HTMLParser.HTMLParser() def strings_are_translated(strings, locale): # http://stackoverflow.com/a/24339946/571420 pofile_path = os.path.join(settings.ROOT, 'locale', locale, 'LC_MESSAGES', 'messages.po') try: po = pofile(pofile_path) except IOError: # in case the file doesn't exist or couldn't be parsed return False all_strings_translated = True for string in strings: if not any(e for e in po if e.msgid == string and ( e.translated() and 'fuzzy' not in e.flags) and not e.obsolete): all_strings_translated = False
def unescape(self, string): try: pars = HTMLParser.HTMLParser() return pars.unescape(string) except: return string
def __init__(self): self.root = None self.tree = [] HTMLParser.__init__(self)
r4 = re.compile("title=[A-Z][_:a-zA-Z0-9]*") pagename = "Main_Page" if os.environ.has_key('QUERY_STRING'): qs = os.environ['QUERY_STRING'] m = r4.search(qs) if m is not None: pagename = qs[m.start() + 6:m.end()] if DEBUG: print 'pagename=' + pagename url = "http://localhost/wiki/index.php?title=" + pagename + "&action=edit" R = urllib.urlopen(url).read() R = R[r1.search(R).end():r2.search(R).start()] R = HTMLParser.HTMLParser().unescape(R).split('\n') if DEBUG: print R print '==========================' rdfState = 0 for L in R: m = r3.search(L) if rdfState == 0 and m is not None: rdfState = 1 elif rdfState == 1 and L == '<pre>': rdfState = 2 elif L == '</pre>': rdfState = 0 elif rdfState == 2: print L
def clean_tweet(tweet): more_stop_words = ['rt', 'cant','didnt','doesnt','dont','goes','isnt','hes','shes','thats','theres',\ 'theyre','wont','youll','youre','youve', 'br', 've', 're', 'vs', 'goes','isnt',\ 'hes', 'shes','thats','theres','theyre','wont','youll','youre','youve', 'br',\ 've', 're', 'vs', 'this', 'i', 'get','cant','didnt','doesnt','dont','goes','isnt','hes',\ 'shes','thats','theres','theyre','wont','youll','youre','youve', 'br', 've', 're', 'vs'] # start with the initial list and add the additional words to it. stoplist = nltk.corpus.stopwords.words('english') + more_stop_words # define list of codes to be dropped from document # carriage-returns, line-feeds, tabs codelist = ['\r', '\n', '\t'] # insert a space at the beginning and end of the tweet # tweet = ' ' + tweet + ' ' tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet) tweet = re.sub('http[^\\s]+', ' ', tweet) tweet = re.sub(r"\[", '', tweet) tweet = re.sub(r"\]", '', tweet) tweet = re.sub(r"'rt", '', tweet) tweet = re.sub(r'\'', '', tweet) tweet = re.sub(r'\'\,', '', tweet) tweet = re.sub(r'\,\'', '', tweet) tweet = re.sub('rt[^\\s]+', '', tweet) tweet = re.sub(r"' ,", '', tweet) tweet = re.sub(r"\' ,", '', tweet) tweet = re.sub(r", ',',", '', tweet) tweet = re.sub(r"\,", '', tweet) tweet = re.sub(r"\, \"\'\"\,", '', tweet) tweet = re.sub(r"\, \"\' \,\"\,", '', tweet) tweet = re.sub(r"\, \"\'\ \,\"\,", '', tweet) tweet = re.sub(r"\,\ \"\'\"\,", '', tweet) tweet = re.sub(r"\,", '', tweet) tweet = re.sub(r"\"", '', tweet) tweet = re.sub(r"\'", '', tweet) tweet = re.sub(r"\'\,", '', tweet) tweet = re.sub(r'"', '', tweet) tweet = re.sub(",", '', tweet) temp_tweet = re.sub('[^a-zA-Z]', ' ', tweet) # replace non-alphanumeric with space html_parser = HTMLParser.HTMLParser() tweet = html_parser.unescape(tweet) # temp_tweet = re.sub('\d', ' ', temp_tweet) for i in range(len(codelist)): stopstring = ' ' + codelist[i] + ' ' temp_tweet1 = re.sub(stopstring, ' ', temp_tweet) # convert uppercase to lowercase temp_tweet = temp_tweet1.lower() # replace single-character words with space temp_tweet = re.sub('\s.\s', ' ', temp_tweet) # replace selected character strings/stop-words with space for i in range(len(stoplist)): stopstring = ' ' + str(stoplist[i]) + ' ' temp_tweet = re.sub(stopstring, ' ', temp_tweet) # replace multiple blank characters with one blank character temp_tweet = re.sub('\s+', ' ', temp_tweet) return (temp_tweet)
import HTMLParser import sys import re import numpy as np #from New_Utils import * html_parser = HTMLParser.HTMLParser() reload(sys) sys.setdefaultencoding('utf8') #Dictionary for mapping contractions APPOSTOPHES={ "ain't": "is not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have", "he'll": "he will", "he'll've": "he he will have", "he's": "he is",
def getCarnegieGenerator(): """ Generator to return Carnegie Museum of Art paintings """ urls = [] # TODO: Reimplement based on https://collection.cmoa.org/?classification=paintings&page=2&perPage=10&withImage=0 # It's now JSON and contains images # First get the Painting # And let's get the paintings too apiurl = u'http://collection.cmoa.org/CollectionSearch.aspx/GetSearchResults' postjson = u'{"SearchText": "", "Nationality": "Any", "DateRange": "Any", "Classification": "%s", "Theme": "Any", "Department": "Any", "Location": "Any", "WithImages": "false", "WithVideo": "false", "WithAudio": "false", "TeenieHarris": "false", "SortOrder": "alpha-artist", "PageNumber": "%s", "NumberPerPage": "48", "PriorParams": "%s"}' #postjson = u'{"SearchText": "", "Nationality": "Any", "DateRange": "Any", "Classification": "%s", "Theme": "Any", "Department": "Any", "Location": "Any", "WithImages": "false", "WithVideo": "false", "WithAudio": "false", "TeenieHarris": "false", "SortOrder": "alpha-artist", "PageNumber": "%s", "NumberPerPage": "48", "PriorParams": "Y2xhc3NpZmljYXRpb249UGFpbnRpbmd8"}' referer = u'http://collection.cmoa.org/collection-search/' #for classification in [u'Painting', u'paintings']: #firsturl = u'http://collection.cmoa.org/collection-search/' htmlparser = HTMLParser.HTMLParser() session = requests.Session() searchPage = session.get(referer, verify=False) urlregex = u'\<a href\=\"(CollectionDetail\.aspx\?item\=\d+)\&' tosearch = [ #(u'Painting', 12, u'Y2xhc3NpZmljYXRpb249UGFpbnRpbmd8'), # 488, 48 per page (u'paintings', 14, u'Y2xhc3NpZmljYXRpb249cGFpbnRpbmdzfA9999'), # 605, 48 per page ] for (classification, endpage, priorsearch) in tosearch: for i in range(1, endpage): try: searchpage = session.post( apiurl, data=postjson % ( classification, i, priorsearch, ), headers={ 'X-Requested-With': 'XMLHttpRequest', 'referer': referer, u'Content-Type': u'application/json; charset=utf-8' }) except requests.exceptions.ConnectionError: pywikibot.output( u'Could not get the search page. Sleeping and trying again' ) time.sleep(60) searchpage = session.post( apiurl, data=postjson % ( classification, i, priorsearch, ), headers={ 'X-Requested-With': 'XMLHttpRequest', 'referer': referer, u'Content-Type': u'application/json; charset=utf-8' }) print apiurl #print postjson % (classification, i,) print searchpage.text searchjson = searchpage.json() matches = re.finditer(urlregex, searchjson.get(u'd')) for match in matches: metadata = {} url = u'http://collection.cmoa.org/%s' % (match.group(1), ) # Museum site probably doesn't like it when we go fast time.sleep(5) pywikibot.output(url) itempage = requests.get(url) metadata['url'] = url metadata['collectionqid'] = u'Q1043967' metadata['collectionshort'] = u'CMoA' metadata['locationqid'] = u'Q1043967' #No need to check, I'm actually searching for paintings. 
metadata['instanceofqid'] = u'Q3305213' titlecreatorregex = u'\<div id\=\"detail-data-container\"\>[\s\t\r\n]*\<hgroup class\=\"page-titles\"\>[\s\t\r\n]*\<h1 class\=\"italic\"\>(?P<title>[^\<]+)\<\/h1\>[\s\t\r\n]*\<h2 class\=\"sub1\"\>(?P<name>[^\<]+)\<\/h2\>[\s\t\r\n]*\<h2 class\=\"sub2\"\>(?P<date>[^\<]+)?\<\/h2\>' titlecreatormatch = re.search(titlecreatorregex, itempage.text) title = htmlparser.unescape( titlecreatormatch.group(u'title').strip()) name = htmlparser.unescape( titlecreatormatch.group(u'name').strip()) # Chop chop, in case we have very long titles if len(title) > 220: title = title[0:200] metadata['title'] = { u'en': title, } metadata['creatorname'] = name metadata['description'] = { u'en': u'painting by %s' % (name, ), u'nl': u'schilderij van %s' % (name, ), } if titlecreatormatch.group(u'date'): metadata['inception'] = htmlparser.unescape( titlecreatormatch.group(u'date').strip()) idregex = u'\<span class\=\"label\"\>Accession Number\<\/span\>[\s\t\r\n]*\<span class\=\"value\"\>([^\<]+)\<\/span\>' idmatch = re.search(idregex, itempage.text) metadata['idpid'] = u'P217' metadata['id'] = idmatch.group(1).strip() mediumregex = u'\<span class\=\"label\"\>Medium\<\/span\>[\s\t\r\n]*\<span class\=\"value\"\>oil on canvas\<\/span\>' mediummatch = re.search(mediumregex, itempage.text) if mediummatch: metadata['medium'] = u'oil on canvas' dimensionsregex = u'\<span class\=\"label\"\>Measurements\<\/span\>[\s\t\r\n]*\<span class\=\"value\"\>([^\<]+)\<\/span\>' dimensionsmatch = re.search(dimensionsregex, itempage.text) if dimensionsmatch: dimensiontext = dimensionsmatch.group(1).strip() regex_2d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) cm\)$' regex_3d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) x (?P<depth>\d+(\.\d+)?) cm\)$' match_2d = re.match(regex_2d, dimensiontext) match_3d = re.match(regex_3d, dimensiontext) if match_2d: metadata['heightcm'] = match_2d.group(u'height') metadata['widthcm'] = match_2d.group(u'width') elif match_3d: metadata['heightcm'] = match_3d.group(u'height') metadata['widthcm'] = match_3d.group(u'width') metadata['depthcm'] = match_3d.group(u'depth') # https://collection.cmoa.org/objects/b1357c35-c930-4e15-9d8a-e50bba6bd03a # https://cmoa-collection-images.s3.amazonaws.com/133250/sizes/1035976-840.jpg # https://cmoa-collection-images.s3.amazonaws.com/133250/1035976.jpg yield metadata
def unescape(self, text): if (sys.version_info[0] < 3): parser = HTMLParser.HTMLParser() else: parser = html.parser.HTMLParser() return (parser.unescape(text))
os.path.join(Addon.getAddonInfo('path'), r'resources', r'lib')) from BeautifulSoup import BeautifulSoup except: try: sys.path.insert( 0, os.path.join(Addon.getAddonInfo('path'), r'resources', r'lib')) from BeautifulSoup import BeautifulSoup except: sys.path.append(os.path.join(os.getcwd(), r'resources', r'lib')) from BeautifulSoup import BeautifulSoup icon = xbmc.translatePath( os.path.join(os.getcwd().replace(';', ''), 'icon.png')) import HTMLParser hpar = HTMLParser.HTMLParser() h = int(sys.argv[1]) def showMessage(heading, message, times=3000): xbmc.executebuiltin('XBMC.Notification("%s", "%s", %s, "%s")' % (heading, message, times, icon)) #---------- parameter/info structure ------------------------------------------- class Param: page = '1' genre = '' genre_name = '' max_page = 0
def _doSearch(self, search_strings, search_mode='eponly', epcount=0, age=0, epObj=None): results = [] items = {'Season': [], 'Episode': [], 'RSS': []} for mode in search_strings.keys(): logger.log(u"Search Mode: %s" % mode, logger.DEBUG) for search_string in search_strings[mode]: self.search_params.update({ 'q': search_string.encode('utf-8'), 'field': ('seeders', 'time_add')[mode == 'RSS'] }) if mode != 'RSS': logger.log(u"Search string: %s" % search_string, logger.DEBUG) try: searchURL = self.urls[('search', 'rss')[ mode == 'RSS']] + '?' + urlencode(self.search_params) logger.log(u"Search URL: %s" % searchURL, logger.DEBUG) data = self.getURL(searchURL) #data = self.getURL(self.urls[('search', 'rss')[mode == 'RSS']], params=self.search_params) if not data: logger.log("No data returned from provider", logger.DEBUG) continue if not data.startswith('<?xml'): logger.log( u'Expected xml but got something else, is your proxy failing?', logger.INFO) continue try: data = xmltodict.parse( HTMLParser.HTMLParser().unescape( data.encode('utf-8')).replace('&', '&')) except ExpatError as e: logger.log( u"Failed parsing provider. Traceback: %r\n%r" % (traceback.format_exc(), data), logger.ERROR) continue if not all([ data, 'rss' in data, 'channel' in data['rss'], 'item' in data['rss']['channel'] ]): logger.log(u"Malformed rss returned, skipping", logger.DEBUG) continue # https://github.com/martinblech/xmltodict/issues/111 entries = data['rss']['channel']['item'] entries = entries if isinstance(entries, list) else [entries] for item in entries: try: title = item['title'].decode('utf-8') # Use the torcache link kat provides, # unless it is not torcache or we are not using blackhole # because we want to use magnets if connecting direct to client # so that proxies work. download_url = item['enclosure']['@url'] if sickbeard.TORRENT_METHOD != "blackhole" or 'torcache' not in download_url: download_url = item['torrent:magnetURI'] seeders = int(item['torrent:seeds']) leechers = int(item['torrent:peers']) verified = bool(int(item['torrent:verified']) or 0) size = int(item['torrent:contentLength']) info_hash = item['torrent:infoHash'] #link = item['link'] except (AttributeError, TypeError, KeyError): continue try: pubdate = datetime.datetime.strptime( item['pubDate'], '%a, %d %b %Y %H:%M:%S +0000') except Exception: pubdate = datetime.datetime.today() if not all([title, download_url]): continue #Filter unseeded torrent if seeders < self.minseed or leechers < self.minleech: if mode != 'RSS': logger.log( u"Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})" .format(title, seeders, leechers), logger.DEBUG) continue if self.confirmed and not verified: if mode != 'RSS': logger.log( u"Found result " + title + " but that doesn't seem like a verified result so I'm ignoring it", logger.DEBUG) continue item = title, download_url, size, seeders, leechers if mode != 'RSS': logger.log(u"Found result: %s " % title, logger.DEBUG) items[mode].append(item) except Exception: logger.log( u"Failed parsing provider. Traceback: %r" % traceback.format_exc(), logger.ERROR) #For each search mode sort all the items by seeders if available items[mode].sort(key=lambda tup: tup[3], reverse=True) results += items[mode] return results
import HTMLParser import itertools import random import string import time import requests from pylons import app_globals as g from r2.lib.db import queries from r2.lib import amqp from r2.lib.utils import weighted_lottery, get_requests_resp_json from r2.models import Account, NotFound, register, Subreddit, Link, Comment unescape_htmlentities = HTMLParser.HTMLParser().unescape class TextGenerator(object): """A Markov Chain based text mimicker.""" def __init__(self, order=8): self.order = order self.starts = collections.Counter() self.start_lengths = collections.defaultdict(collections.Counter) self.models = [ collections.defaultdict(collections.Counter) for i in xrange(self.order) ] @staticmethod def _in_groups(input_iterable, n):
def clean_content(content): soup= BeautifulSoup(content) content = soup.text.strip() h = HTMLParser.HTMLParser() return h.unescape(content)
def replaceHTMLCodes(txt): txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt) txt = HTMLParser.HTMLParser().unescape(txt) txt = txt.replace(""", "\"") txt = txt.replace("&", "&") return txt
def __init__(self): HTMLParser.__init__(self) self.recording = 0 self.data = []
def replaceEscapeCodes(txt): txt = HTMLParser.HTMLParser().unescape(txt) return txt
# Outputs JSON to STDOUT. Run and save with: # ./run structure > structure.json # # options: # year: "uscprelim" (the default), or a specific year version of the Code (e.g. "2011") # title: Do only a specific title (e.g. "5", "5a", "25") # sections: Return a flat hierarchy of only titles and sections (no intervening layers) # debug: Output debug messages only, and no JSON output (dry run) # force: Force a re-download of the US Code for the given year (script defaults to caching if the directory for a year is present) import glob, re, lxml.html, json, sys, os import utils import HTMLParser pars = HTMLParser.HTMLParser() section_symbol = u'\xa7' def run(options): year = options.get("year", "uscprelim") # default to USCprelim # optional: don't print json out, just --debug information debug = options.get('debug', False) # optional: limit to a specific --title title = options.get("title", None) if title: # appendix cites may look like "5a" but we need "05a" to match the file if title.endswith("a"):
from functools import partial from future_builtins import filter, map, zip from multiprocessing.pool import ThreadPool from xml.sax.saxutils import escape, quoteattr # }}} USER_AGENT = 'calibre mirror' MR_URL = 'https://www.mobileread.com/forums/' IS_PRODUCTION = os.path.exists('/srv/plugins') WORKDIR = '/srv/plugins' if IS_PRODUCTION else '/t/plugins' PLUGINS = 'plugins.json.bz2' INDEX = MR_URL + 'showpost.php?p=1362767&postcount=1' # INDEX = 'file:///t/raw.html' IndexEntry = namedtuple('IndexEntry', 'name url donate history uninstall deprecated thread_id') u = HTMLParser.HTMLParser().unescape socket.setdefaulttimeout(30) def read(url, get_info=False): # {{{ if url.startswith("file://"): return urllib2.urlopen(url).read() opener = urllib2.build_opener() opener.addheaders = [ ('User-Agent', USER_AGENT), ('Accept-Encoding', 'gzip,deflate'), ] # Sporadic network failures in rackspace, so retry with random sleeps for i in range(10): try:
def filter_cases(request, domain, app_id, module_id): app = Application.get(app_id) module = app.get_module(module_id) delegation = request.GET.get('task-list') == 'true' auth_cookie = request.COOKIES.get('sessionid') suite_gen = SuiteGenerator(app) xpath = suite_gen.get_filter_xpath(module, delegation=delegation) extra_instances = [{ 'id': inst.id, 'src': inst.src } for inst in suite_gen.get_extra_instances(module)] # touchforms doesn't like this to be escaped xpath = HTMLParser.HTMLParser().unescape(xpath) if delegation: case_type = DELEGATION_STUB_CASE_TYPE else: case_type = module.case_type if xpath: # if we need to do a custom filter, send it to touchforms for processing additional_filters = { "properties/case_type": case_type, "footprint": True } helper = SessionDataHelper(domain, request.couch_user) result = helper.filter_cases(xpath, additional_filters, DjangoAuth(auth_cookie), extra_instances=extra_instances) if result.get('status', None) == 'error': return HttpResponseServerError( result.get("message", _("Something went wrong filtering your cases."))) case_ids = result.get("cases", []) else: # otherwise just use our built in api with the defaults case_ids = [ res.id for res in get_filtered_cases(domain, status=CASE_STATUS_OPEN, case_type=case_type, user_id=request.couch_user._id, ids_only=True) ] cases = [ CommCareCase.wrap(doc) for doc in iter_docs(CommCareCase.get_db(), case_ids) ] # refilter these because we might have accidentally included footprint cases # in the results from touchforms. this is a little hacky but the easiest # (quick) workaround. should be revisted when we optimize the case list. cases = filter(lambda c: c.type == case_type, cases) cases = [c.get_json(lite=True) for c in cases if c] parents = [] if delegation: for case in cases: parent_id = case['indices']['parent']['case_id'] parents.append(CommCareCase.get(parent_id)) return json_response({'cases': cases, 'parents': parents}) else: return json_response(cases)
# encoding: utf-8 import tweepy #https://github.com/tweepy/tweepy import csv import string import markovify import shelve import random import time import HTMLParser from datetime import datetime # READ THE README.MD! import credentials as creds h = HTMLParser.HTMLParser() #Twitter API credentials consumer_key = creds.consumer_key consumer_secret = creds.consumer_secret access_key = creds.access_key access_secret = creds.access_secret done_ids = shelve.open('parsed_ids') #authorize twitter, initialize tweepy auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_key, access_secret) api = tweepy.API(auth)
import difflib import SendEmail import re import HTMLParser current_file = open("Adoptable Animals.html", "r") new_file = open("Adoptable Animals Mod.html", "r") email = "" for line in difflib.unified_diff(str(current_file.readlines()), str(new_file.readlines())): if not re.match(r'-', line): email += str(line.strip('+-!')) name_list = HTMLParser.parse_html(email) SendEmail.authorize_and_send_message(str(name_list)) print(name_list)
import requests import xbmcgui import urllib2 import urllib import time import json import xbmc import sys import re _handle = int(sys.argv[1]) addon_id = 'plugin.video.cerebrozstrictlyhd' addon = xbmcaddon.Addon(id=addon_id) base = 'http://moviegrabber.tv' query_url = 'http://moviegrabber.tv/searchaskforapi/?id=%s' regulate = HTMLParser.HTMLParser() def RESOLVE(link): try: play_link = urlresolver.HostedMediaFile(link).resolve() except: play_link = link if 'False' in str(play_link): play_link = link play_item = xbmcgui.ListItem(path=play_link) play_item.setProperty('IsPlayable', 'true') xbmcplugin.setResolvedUrl(_handle, True, listitem=play_item) def cleanHex(s):
f = open('facebooktext.txt', 'w') f.write(html) f.close() matches = re.findall(r"<p>([a-zA-Z0-9 !-)(?.,;:'\"_+= -]*)", open('facebooktext.txt', 'r').read()) os.remove('C:\Python27\mockingbird\\facebooktext.txt') with open('facebook_dictionary.json', 'r') as fb: fb_dictionary = json.load(fb) unknown = open('unknown_facebook_words', 'a') f = open('facebookwords.txt', 'a') for match in matches: stripped_text = HTMLParser.HTMLParser().unescape(match) if stripped_text != '': f.write(stripped_text + ' xYx ') words = stripped_text.split() for word in words: word = str(word).lower().translate(None, '",!.?!@#$%^&*()_-:<>') POS = mapPOS(word) if POS: for char in list(POS): if char in fb_dictionary.keys(): fb_dictionary[char].append(word) else: fb_dictionary[char] = [] else: unknown.write(word + ', ')
def __init__(self): HTMLParser.__init__(self) self.links = []
def getPaintingGenerator(query=u''): ''' Doing a two step approach here. Could do one, but would be complicated * Loop over http://art.famsf.org/search?search_api_views_fulltext=&f[0]=field_art_class%3A684&page=0 - 20 and grab paintings * Grab data from paintings ''' # LOL, these nerds start at zero :-) baseurl = u'http://art.famsf.org/search?search_api_views_fulltext=&f[0]=field_art_class%%3A684&page=%s' htmlparser = HTMLParser.HTMLParser() # Have to restart in the end to do the 9 # 0 - 20 for i in range(8, 12): searchurl = baseurl % (i, ) print searchurl searchPage = urllib2.urlopen(searchurl) searchData = searchPage.read() # <div class="views-field views-field-title"> <span class="field-content"><a href="/nicolas-maes/portrait-man-19425">Portrait of a Man</a> itemregex = u'<div class="views-field views-field-title">\s*<span class="field-content"><a href="([^"]+)">([^<]+)</a>' for match in re.finditer(itemregex, searchData): url = u'http://art.famsf.org%s' % (match.group(1), ) title = htmlparser.unescape(unicode(match.group(2), "utf-8")) print url itemPage = urllib2.urlopen(url) itemData = itemPage.read() metadata = {} metadata['url'] = url metadata['title'] = title creatorregex = u'<span class="views-label">Artist: </span>\s*<div class="field-content"><a href="([^"]+)">([^<]+)</a></div>' dateeregex = u'<div class="views-field views-field-field-art-display-date">\s*<span class="views-label views-label-field-art-display-date">Date:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>' locationegex = u'<div class="views-field views-field-field-art-location-calculated">\s*<span class="views-label views-label-field-art-location-calculated">Location:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>' mediumregex = u'<div class="views-field views-field-field-art-media">\s*<span class="views-label views-label-field-art-media">Media:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>' idregex = u'<div class="views-field views-field-field-art-accession-number">\s*<span class="views-label views-label-field-art-accession-number">Accession Number:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>' acquisitiondateregex = u'<div class="views-field views-field-field-art-acquisition-date">\s*<span class="views-label views-label-field-art-acquisition-date">Acquisition Date:\s*</span>\s*<div class="field-content"><span class="date-display-single">(\d\d\d\d-\d\d-\d\d)</span></div>\s*</div>' creatormatch = re.search(creatorregex, itemData, flags=re.M) if creatormatch: metadata[u'creator'] = htmlparser.unescape( unicode(creatormatch.group(2), "utf-8")) else: # Creator not always available metadata[u'creator'] = u'anonymous' #titlematch = re.search(titleregex, itemData) #metadata[u'title']=htmlparser.unescape(unicode(titlematch.group(1), "utf-8")) locationmatch = re.search(locationegex, itemData) if locationmatch.group(1) == u'de Young': # Ok, it's on view at de Young so let's add that metadata[u'location'] = u'Q1470276' elif locationmatch.group(1) == u'Legion of Honor': # Ok, it's on view at the Legion of Honor so let's add that metadata[u'location'] = u'Q2468251' else: # Where? 
Let's add the main museum metadata[u'location'] = u'Q1416890' datematch = re.search(dateeregex, itemData) # Not always available if datematch: metadata[u'date'] = htmlparser.unescape( unicode(datematch.group(1), "utf-8")) mediummatch = re.search(mediumregex, itemData) # Not always available if mediummatch: metadata[u'medium'] = htmlparser.unescape( unicode(mediummatch.group(1), "utf-8")) idmatch = re.search(idregex, itemData) metadata[u'id'] = htmlparser.unescape( unicode(idmatch.group(1), "utf-8")) if u'?' in metadata[u'id']: continue acquisitiondatematch = re.search(acquisitiondateregex, itemData) if acquisitiondatematch: metadata[u'acquisitiondate'] = htmlparser.unescape( unicode(acquisitiondatematch.group(1), "utf-8")) yield metadata '''
def __init__(self): HTMLParser.__init__(self) self.article = Article("","")
这里还有一个例外情况: exception HTMLParser.HTMLParserError 当分析遇到 Error 时 HTMLParser 会抛出异常。该异常提供三个属性: msg , lineno and offset 。 HTMLParser 实例有如下的方法: HTMLParser.reset() 重置实例 . 所有未处理的数据都会丢失。在初始化时自动调用。 HTMLParser.feed(data) 给分析器喂食。在由完整元素构成的情况下工作;不完整数据情况下,会进行缓冲知道更多数据加进来或者 close() 被调用。 HTMLParser.close() 处理所有缓冲数据。这个方法可以被派生类重定义,以便在输入结束后处理额外的事情,重定义的版本也要调用 HTMLParser 基类的 close() 方法。
def normalizeTextForTagger(text): text = text.replace("&", "&") text = HTMLParser.HTMLParser().unescape(text) return text
def __init__(self, projectDirectory): HTMLParser.__init__(self) self.projectDirectory = Sourcerer.cleanDirecoryPath(projectDirectory)
def html_type_repl(matchobj): return HTMLParser.HTMLParser().unescape(matchobj.group())
def reset(self): HTMLParser.reset(self)
def getTePapaGenerator(): """ Generator to return Museum of New Zealand Te Papa Tongarewa paintings """ htmlparser = HTMLParser.HTMLParser() count = 12 basesearchurl = u'http://collections.tepapa.govt.nz/Search/GetObjectThumbnailsShowMoreForAdvanced/?scope=all&imagesOnly=False&downloadable=False&startIndex=%s&returnCount=%s&advanced=colClassification:"paintings"+colCollectionGroup:CH' for i in range(1, 1793, count): searchurl = basesearchurl % (i, count) searchPage = requests.get(searchurl) for iteminfo in searchPage.json(): metadata = {} url = u'http://collections.tepapa.govt.nz%s' % ( iteminfo.get('path'), ) # Museum site probably doesn't like it when we go fast # time.sleep(5) pywikibot.output(url) metadata['url'] = url metadata['collectionqid'] = u'Q915603' metadata['collectionshort'] = u'Te Papa' metadata['locationqid'] = u'Q915603' #No need to check, I'm actually searching for paintings. metadata['instanceofqid'] = u'Q3305213' title = iteminfo.get('title') # Chop chop, in case we have very long titles if len(title) > 220: title = title[0:200] metadata['title'] = { u'en': title, } name = iteminfo.get('colProProductionMakers') if not name: metadata['creatorqid'] = u'Q4233718' metadata['creatorname'] = u'anonymous' metadata['description'] = { u'nl': u'schilderij van anonieme schilder', u'en': u'painting by anonymous painter', } else: if u',' in name: (surname, sep, firstname) = name.partition(u',') name = u'%s %s' % ( firstname.strip(), surname.strip(), ) metadata['creatorname'] = name metadata['description'] = { u'nl': u'%s van %s' % ( u'schilderij', metadata.get('creatorname'), ), u'en': u'%s by %s' % ( u'painting', metadata.get('creatorname'), ), } metadata['inception'] = iteminfo.get('colProProductionDates') metadata['idpid'] = u'P217' metadata['id'] = iteminfo.get('colRegistrationNumber') # Not everything is in json, so some good old parsing itempage = requests.get(url) mediumregex = u'\<td class\=\"heading\"\>Medium summary\<\/td\>[\s\t\r\n]*\<td\>oil on canvas\<\/td\>' mediummatch = re.search(mediumregex, itempage.text) if mediummatch: metadata['medium'] = u'oil on canvas' dimensionsregex = u'\<td class\=\"heading\"\>Dimensions\<\/td\>[\s\t\r\n]*\<td\>[\s\t\r\n]*(Overall|Image):([^\<]+)[\s\t\r\n]*\<br \/\>' dimensionsmatch = re.search(dimensionsregex, itempage.text) if dimensionsmatch: dimensiontext = dimensionsmatch.group(2).strip() regex_2d = u'(?P<height>\d+)(mm)?\s*\(Height\)[\s\t\r\n]*x[\s\t\r\n]*(?P<width>\d+)(mm)?\s*\((Width|Length)\).*$' regex_3d = u'(?P<height>\d+)(mm)?\s*\(Height\)[\s\t\r\n]*x[\s\t\r\n]*(?P<width>\d+)(mm)?\s*\((Width|Length)\)[\s\t\r\n]*x[\s\t\r\n]*(?P<depth>\d+)(mm)?\s*\(Depth\).*$' match_2d = re.match(regex_2d, dimensiontext) match_3d = re.match(regex_3d, dimensiontext) if match_2d: metadata['heightcm'] = unicode( float(match_2d.group(u'height')) / 10) metadata['widthcm'] = unicode( float(match_2d.group(u'width')) / 10) elif match_3d: metadata['heightcm'] = unicode( float(match_3d.group(u'height')) / 10) metadata['widthcm'] = unicode( float(match_3d.group(u'width')) / 10) metadata['depthcm'] = unicode( float(match_3d.group(u'depth')) / 10) creditlineregex = u'\<td class\=\"heading\"\>Credit line\<\/td\>[\s\t\r\n]*\<td\>([^\<]+ (?P<year1>\d\d\d\d)|Purchased (?P<year2>\d\d\d\d) [^\<]+)\<\/td\>' creditlinematch = re.search(creditlineregex, itempage.text) if creditlinematch: if creditlinematch.group(u'year1'): metadata['acquisitiondate'] = creditlinematch.group( u'year1') elif creditlinematch.group(u'year2'): metadata['acquisitiondate'] = 
creditlinematch.group( u'year2') yield metadata
def _clean_text(self, text): text = HTMLParser.HTMLParser().unescape(text) text = re.sub('[\r\n\t]', '', text) text = re.sub('>\s+<', '><', text) return re.sub('\s+', ' ', text).strip()
def process_layout(layout_schema=None, interactions=None): # Load template and find 'body' for template appendation env = Environment() env.loader = FileSystemLoader(PORTAL_ROOT + '/templates') tmpl_unparsed = env.get_template('ion_ux.html').render() tmpl = ET.fromstring(tmpl_unparsed.encode('utf-8')) body_elmt = tmpl.find('body') # Fetch the layout schema layout_schema = LayoutApi.get_new_layout_schema() # Track resource types, metadata and widgets without processed sub-attributes resource_types = [] metadata_processed = [] exclude_sub_attributes = ['table_ooi', 'chart_ooi'] attribute_levels = [ 'level-zero', 'level-one', 'level-two', 'level-three', 'level-four', 'level-five', 'level-six' ] # -------------------------------------------------------------------------- # VIEWS # -------------------------------------------------------------------------- # Loop through defined views and build <script> templates with following heirarchy: # view -> groups -> blocks -> attributes -> sub-attributes. for view_id in DEFINED_VIEWS: view = layout_schema['spec']['elements'][view_id] script_elmt = _make_element(body_elmt, 'script', id=view_id, type='text/template') # heading_elmt = _make_element(script_elmt, 'div', css='row-fluid heading') v00_elmt = _make_element(script_elmt, 'div', css='v00 heading') content_elmt = _make_element(script_elmt, 'div', css='row-fluid') v01_elmt = _make_element(content_elmt, 'div', css='v01 span3') v02_elmt = _make_element(content_elmt, 'div', css='v02 span9') # -------------------------------------------------------------------------- # GROUPS # -------------------------------------------------------------------------- # Track groups on per view basis groups = {} # Loop through groups for gr_idx, gr_element in enumerate(view['embed']): group_elid = gr_element['elid'] group_link_id = group_elid + str(randint(0, 1000)) group_position = gr_element['pos'] group = layout_schema['spec']['elements'][group_elid] # Set the parent element for the group if group_position == 'V00': parent_elmt = v00_elmt elif group_position == 'V01': parent_elmt = v01_elmt else: parent_elmt = v02_elmt # LABEL OVERRIDES if gr_element.has_key('olabel'): print 'group label override:', group[ 'label'], '->', gr_element['olabel'], group_elid group_label = gr_element['olabel'] else: group_label = group['label'] # CHECK FOR TITLE BAR (V00), creates tabs for V01 and V02 groups if group_position == 'V00': group_elmt = parent_elmt else: if not group_position in groups.keys(): group_container_elmt = _make_element(parent_elmt, 'div', id=group_elid, css='group') group_ul_elmt = _make_element(group_container_elmt, 'ul', css='nav nav-tabs') group_block_container_elmt = _make_element( group_container_elmt, 'div', css='tab-content') groups.update({ group_position: { 'ul_elmt': group_ul_elmt, 'group_container_elmt': group_container_elmt, 'group_block_container_elmt': group_block_container_elmt } }) else: group_ul_elmt = groups[group_position]['ul_elmt'] group_block_container_elmt = groups[group_position][ 'group_block_container_elmt'] # <li>, <a> and group element group_li_elmt = _make_element(group_ul_elmt, 'li', css='') group_a_elmt = _make_element(group_li_elmt, 'a', href="#%s" % group_link_id, data_toggle='tab', content=group_label) group_elmt = _make_element(group_block_container_elmt, 'div', id=group_link_id, css='tab-pane row-fluid') # -------------------------------------------------------------------------- # BLOCKS # -------------------------------------------------------------------------- # Loop through 
blocks for bl_element in group['embed']: block_elid = bl_element['elid'] block_position = bl_element['pos'] block = layout_schema['spec']['elements'][block_elid] block_widget_id = block['wid'] block_widget = layout_schema['spec']['widgets'][ block_widget_id] block_widget_type = block_widget['name'] block_res_type = block['ie']['ie_name'] if block.has_key( 'ie') else '' if not block_res_type in resource_types: resource_types.append(block_res_type) # Set li class based on block_res_type if group_position != 'V00': li_css_class = group_li_elmt.get('class') if not block_res_type in li_css_class: li_css_class += ' %s' % block_res_type group_li_elmt.attrib['class'] = li_css_class # LABEL OVERRIDES if bl_element.has_key('olabel'): print 'block label override:', block[ 'label'], '->', bl_element['olabel'], block_elid block_label = bl_element['olabel'] else: block_label = block['label'] block_css_class = block_res_type # if not block_res_type in block_css_class: # block_css_class += ' %s' % block_res_type # BLOCK LAYOUT if block['embed']: for at_element in block['embed']: attribute = layout_schema['spec']['elements'][ at_element['elid']] attribute_widget_type = layout_schema['spec'][ 'widgets'][attribute['wid']]['name'] wide_container = True if attribute_widget_type in ( 'table_ooi', 'chart_ooi') else False if wide_container: block_container = _make_element(group_elmt, 'div', css='row-fluid') block_elmt = _make_element(block_container, 'div', style="display:none;", id=block_elid) block_css_class += ' span12' else: block_elmt = _make_element(group_elmt, 'div', style="display:none;", id=block_elid) block_css_class += ' block' # Greater than V01 if group_position not in ('V00', 'V01'): block_css_class += ' span3' # CHECK FOR TITLE BAR (V00) elif group_position == 'V00': block_css_class += ' row-fluid' block_elmt.attrib['class'] = block_css_class # SET GROUP HEADINGS if group_position != 'V00': # Hide table headers for now. 
if not attribute_widget_type == 'table_ooi': block_h3_elmt = _make_element(block_elmt, 'h3', content=block_label) if group_position == 'V00': block_container_elmt = block_elmt left_elmt = _make_element(block_container_elmt, 'div', css='span6 heading-left') right_elmt = _make_element(block_container_elmt, 'div', css='span6 heading-right') else: block_container_elmt = _make_element(block_elmt, 'div') # Attributes for at_element in block['embed']: attribute_elid = at_element['elid'] attribute_position = at_element['pos'] attribute_data_path = at_element['dpath'] attribute_level = at_element['olevel'] attribute_css = attribute_levels[int( attribute_level)] if attribute_level else '' attribute = layout_schema['spec']['elements'][ attribute_elid] attribute_widget_id = attribute['wid'] attribute_widget_type = layout_schema['spec'][ 'widgets'][attribute_widget_id]['name'] # LABEL OVERRIDES if at_element.has_key('olabel'): print 'attribute label override:', attribute[ 'label'], '->', at_element[ 'olabel'], attribute_elid attribute_label = at_element['olabel'] else: attribute_label = attribute['label'] if attribute_widget_type == 'image_ooi': image_class = layout_schema['spec']['graphics'][ attribute['gfx']]['name'] attribute_css += ' %s %s' % (attribute_widget_type, image_class) else: attribute_css += ' %s' % attribute_widget_type # CHECK FOR TITLE BAR if attribute_widget_type not in ( 'table_ooi', 'chart_ooi') and group_position != 'V00': block_container_elmt.set('class', 'content-wrapper') attribute_options = { 'id': attribute_elid, 'data-position': attribute_position, 'data-path': attribute_data_path, 'data-level': attribute_level, 'data-label': attribute_label, 'css': attribute_css } if group_position == 'V00': if attribute_position == 'B01' or attribute_position == 'B02': attribute_elmt = _make_element( left_elmt, 'div', **attribute_options) else: attribute_elmt = _make_element( right_elmt, 'div', **attribute_options) else: attribute_elmt = _make_element( block_container_elmt, 'div', **attribute_options) # FOR INTEGRATION # if UI_MODE == 'DEVELOPMENT': # attribute_elmt.text = 'Attribute: %s (%s) (%s) (%s) (%s)' % (attribute['label'], attribute['name'], attribute_elid, attribute_widget_type, attribute_position) # Generate metadata for nested elements, ex. 
tables and attribute groups if attribute_widget_type in ( 'table_ooi', 'attribute_group_ooi' ) and attribute_elid not in metadata_processed: metadata_processed.append(attribute_elid) metadata = [] for embedded_attribute in attribute['embed']: embedded_object = layout_schema['spec'][ 'elements'][embedded_attribute['elid']] embedded_widget_type = layout_schema['spec'][ 'widgets'][ embedded_attribute['wid']]['name'] # LABEL OVERRIDE if embedded_attribute.has_key('olabel'): print 'sub-attribute label override:', embedded_object[ 'label'], '->', embedded_attribute[ 'olabel'], attribute_elid embedded_object_label = embedded_attribute[ 'olabel'] else: embedded_object_label = embedded_object[ 'label'] embedded_info_level = embedded_attribute[ 'olevel'] if embedded_info_level: embedded_info_level_index = int( embedded_info_level) metadata_items = [ embedded_widget_type, embedded_object_label, embedded_attribute['dpath'], embedded_attribute['pos'], embedded_info_level, attribute_levels[embedded_info_level_index] ] if attribute_widget_type == 'attribute_group_ooi': meta_elmt_id = 'ATTRIBUTE_GROUP_' + attribute_elid metadata_items.append( embedded_attribute['elid']) metadata_items.append( embedded_attribute['dpath']) elif attribute_widget_type == 'table_ooi': meta_elmt_id = 'TABLE_' + attribute_elid metadata.append(metadata_items) # Append metadata to body as a JSON script meta_elmt = ET.SubElement(body_elmt, 'script') meta_elmt.set('id', meta_elmt_id) meta_elmt.text = "var %s=%s" % ( meta_elmt_id, json.dumps(metadata)) layout_elmt = ET.SubElement(body_elmt, 'script') layout_elmt.set('id', 'layout') layout_elmt.text = "var LAYOUT=%s;" % json.dumps(layout_schema) resource_types_elmt = ET.SubElement(body_elmt, 'script') resource_types_elmt.set('id', 'resource_types') resource_types_elmt.text = "var RESOURCE_TYPES=%s" % json.dumps( resource_types) init_script_elmt = ET.Element('script') init_script_elmt.set('type', 'text/javascript') init_script_elmt.text = "$(function(){initialize_app();});" body_elmt.append(init_script_elmt) tmpl = ET.tostring(tmpl) tmpl = '<!DOCTYPE html>\n' + tmpl h = HTMLParser.HTMLParser() return h.unescape(tmpl)
def __init__(self): HTMLParser.__init__(self) # http://stackoverflow.com/a/9698750 self.start_title=0 self.title = '' self.stop_title=0
def unescape(html): global html_parser if not html_parser: html_parser = HTMLParser.HTMLParser() return html_parser.unescape(html)
def __init__(self, idMembro, cvLattesHTML): HTMLParser.__init__(self) # inicializacao obrigatoria self.idMembro = idMembro self.sexo = 'Masculino' self.nomeCompleto = u'[Nome-nao-identificado]' self.item = '' self.issn = '' self.listaIDLattesColaboradores = [] self.listaFormacaoAcademica = [] self.listaProjetoDePesquisa = [] self.listaAreaDeAtuacao = [] self.listaIdioma = [] self.listaPremioOuTitulo = [] self.listaArtigoEmPeriodico = [] self.listaLivroPublicado = [] self.listaCapituloDeLivroPublicado = [] self.listaTextoEmJornalDeNoticia = [] self.listaTrabalhoCompletoEmCongresso = [] self.listaResumoExpandidoEmCongresso = [] self.listaResumoEmCongresso = [] self.listaArtigoAceito = [] self.listaApresentacaoDeTrabalho = [] self.listaOutroTipoDeProducaoBibliografica = [] self.listaSoftwareComPatente = [] self.listaSoftwareSemPatente = [] self.listaProdutoTecnologico = [] self.listaProcessoOuTecnica = [] self.listaTrabalhoTecnico = [] self.listaOutroTipoDeProducaoTecnica = [] self.listaPatente = [] self.listaProgramaComputador = [] self.listaDesenhoIndustrial = [] self.listaProducaoArtistica = [] self.listaOASupervisaoDePosDoutorado = [] self.listaOATeseDeDoutorado = [] self.listaOADissertacaoDeMestrado = [] self.listaOAMonografiaDeEspecializacao = [] self.listaOATCC = [] self.listaOAIniciacaoCientifica = [] self.listaOAOutroTipoDeOrientacao = [] self.listaOCSupervisaoDePosDoutorado = [] self.listaOCTeseDeDoutorado = [] self.listaOCDissertacaoDeMestrado = [] self.listaOCMonografiaDeEspecializacao = [] self.listaOCTCC = [] self.listaOCIniciacaoCientifica = [] self.listaOCOutroTipoDeOrientacao = [] self.listaParticipacaoEmEvento = [] self.listaOrganizacaoDeEvento = [] # inicializacao para evitar a busca exaustiva de algumas palavras-chave self.salvarAtualizacaoCV = 1 self.salvarFoto = 1 self.procurarCabecalho = 0 self.achouGrupo = 0 self.doi = '' self.relevante = 0 self.idOrientando = '' self.complemento = '' # contornamos alguns erros do HTML da Plataforma Lattes cvLattesHTML = cvLattesHTML.replace("<![CDATA[","") cvLattesHTML = cvLattesHTML.replace("]]>","") cvLattesHTML = cvLattesHTML.replace("<x<","<x<") cvLattesHTML = cvLattesHTML.replace("<X<","<X<") # feed it! try: cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities':1}) except UnicodeDecodeError, e: # For some reason, pytidylib fails to decode, whereas the # original html content converts perfectly manually. print e cvLattesHTML, errors = tidy_document(cvLattesHTML.encode('utf-8'), options={'numeric-entities':1}) document = document.decode('utf-8')
def push_message(request): if request.method != "POST": raise Http404 title = request.POST.get('title', None) body = request.POST.get('body', None) url_args = request.POST.get('url_args', '') account_key = request.POST.get('account_key', None) account_keys = request.POST.getlist("account_keys", None) scheduled_at = request.POST.get('scheduled_at', None) if not scheduled_at or len(scheduled_at) == 0: scheduled_at = None segments = request.POST.getlist('send_to_segments', None) if not segments or len(segments) == 0: segments = None segments_string = request.POST.get('send_to_segments_string', None) if segments_string: temp_segments = segments_string.split(",") if len(temp_segments): segments = temp_segments if not title: raise Exception("Submitted title is empty. Body: " + body) if not body: raise Exception("Submitted body is empty. Title: " + title) if not account_key and not account_keys: raise Exception("Submitted Account Key is empty. Title: " + title) if scheduled_at: scheduled_at = datetime.strptime(scheduled_at, '%m/%d/%Y %H:%M %p') custom = request.POST.get('custom', False) if custom: custom = True image = request.FILES.get('image', None) h = HTMLParser.HTMLParser() title = h.unescape(title) title = title.encode('utf-8', 'ignore').strip(' \n\r') truncate_title = lambda data: len(data) > 40 and data[:40] + '...' or data title = truncate_title(title) body = h.unescape(body) body = body.encode('utf-8', 'ignore').strip(' \n\r') truncate_body = lambda data: len(data) > 100 and data[:100] + '...' or data body = truncate_body(body) should_push = False comment = '' command_path = settings.SUBPROCESS_COMMAND_PATH if account_key: try: profile = ClientProfile.objects.get(account_key=account_key, status='active') try: plan = Plan.objects.exclude(type=plans.NONE).exclude( status='expired').filter( user=profile.user, status='active').latest('created_at') sent_notifications = PushMessage.objects.sent_notifications_count( account_key=account_key) should_push = True if sent_notifications >= plan.number_of_notifications: should_push = False comment = 'Notifications number for plan exceeded.' except Plan.DoesNotExist: comment = 'No price plan for user_id: ' + str(profile.user.id) except ClientProfile.DoesNotExist: comment = 'No user for this account key or profile is not active.' if not should_push: try: website = Website.objects.get(account_key=account_key) comment = '' should_push = True except Website.DoesNotExist: comment = 'No user for this account key or profile is not active or no website cluster.' 
new_message = PushMessage.objects.create(title=title, body=body, url_args=url_args, account_key=account_key, custom=custom, comment=comment, scheduled_at=scheduled_at, image=image) if segments: for segment in Segment.objects.filter(id__in=segments): new_message.segments.add(segment) new_message.save() if should_push and scheduled_at: should_push = False if should_push: # subprocess for async execution subprocess.Popen("sleep 10; python " + command_path + " " + str(new_message.id), shell=True) elif account_keys: profiles = ClientProfile.objects.filter(account_key__in=account_keys, status='active') print(profiles) for p in profiles: notif = PushMessage.objects.create(title=title, body=body, url_args=url_args, account_key=p.account_key, custom=custom, comment=comment, scheduled_at=scheduled_at, image=image) print(notif) print(notif.id) if segments: for segment in Segment.objects.filter(id__in=segments): notif.segments.add(segment) notif.save() if not scheduled_at: subprocess.Popen("sleep 10; python " + command_path + " " + str(notif.id), shell=True) websites = Website.objects.filter(account_key__in=account_keys) for w in websites: notif = PushMessage.objects.create(title=title, body=body, url_args=url_args, account_key=w.account_key, custom=custom, comment=comment, scheduled_at=scheduled_at, image=image) if segments: for segment in Segment.objects.filter(id__in=segments): notif.segments.add(segment) notif.save() if not scheduled_at: subprocess.Popen("sleep 10; python " + command_path + " " + str(notif.id), shell=True) return render_to_response('pushmonkey/pushed.html')
# Module-level imports needed by this function; helpers referenced below
# (getCookieJar, saveCookieJar, getEpocTime, getEpocTime2, javascriptUnEscape,
# doEval, doEvalFunction, askCaptcha, getCookiesString, NoRedirection, profile)
# are defined elsewhere in the addon.
import os
import re
import traceback
import urllib
import urllib2
import cookielib


def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):  # 0,1,2 = URL, regexOnly, CookieJarOnly
    # Find every $doregex[name] placeholder in the url.
    doRegexs = re.compile(r'\$doregex\[([^\]]*)\]').findall(url)
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            m = regexs[k]
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse an existing jar
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'], cookieJar, True, True, cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            if cookieJarParam:
                if cookieJar is None:
                    # create the cookie jar, optionally loading it from a file
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split('open[')[1].split(']')[0]
                    cookieJar = getCookieJar(cookie_jar_file)
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    saveCookieJar(cookieJar, cookie_jar_file)

            # Resolve nested $doregex references in the other fields first.
            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs, m['page'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg
            if 'setcookie' in m and m['setcookie'] and '$doregex' in m['setcookie']:
                m['setcookie'] = getRegexParsed(regexs, m['setcookie'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m['appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs, m['appendcookie'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs, m['post'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs, m['rawpost'], cookieJar, recursiveCall=True, cachedPages=cachedPages, rawPost=True)
            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$', getEpocTime())
            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$', getEpocTime2())

            link = ''
            if m['page'] and m['page'] in cachedPages and 'ignorecache' not in m and not forCookieJarOnly:
                # reuse the cached page
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith('http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$', getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$', getEpocTime2())
                    # Ignoring cache; the page spec may carry extra headers after '|'.
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

                    # proxy = urllib2.ProxyHandler({('https' if proxytouse[:5] == "https" else "http"): proxytouse})
                    # opener = urllib2.build_opener(proxy)
                    # urllib2.install_opener(opener)
                    current_proxies = urllib2.ProxyHandler(urllib2.getproxies())

                    req = urllib2.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        if pageUrl[:5] == "https":
                            proxy = urllib2.ProxyHandler({'https': proxytouse})
                        else:
                            proxy = urllib2.ProxyHandler({'http': proxytouse})
                        opener = urllib2.build_opener(proxy)
                        urllib2.install_opener(opener)

                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1')
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        # append "domain:name=value; ..." entries to the cookie jar
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(
                                version=0, name=n, value=v, port=None,
                                port_specified=False, domain=w,
                                domain_specified=False, domain_initial_dot=False,
                                path='/', path_specified=True, secure=False,
                                expires=None, discard=True, comment=None,
                                comment_url=None, rest={'HttpOnly': None},
                                rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if cookieJar is not None:
                        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                        opener = urllib2.build_opener(
                            cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                        urllib2.install_opener(opener)
                        if 'noredirect' in m:
                            opener = urllib2.build_opener(
                                cookie_handler, NoRedirection,
                                urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                            urllib2.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib2.build_opener(
                            NoRedirection, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                        urllib2.install_opener(opener)

                    if 'connection' in m:
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib2.build_opener(keepalive_handler)
                        urllib2.install_opener(opener)

                    post = None
                    if 'post' in m:
                        postData = m['post']
                        # ($LiveStreamRecaptcha support is disabled here; the original
                        #  code replaced the placeholder with the recaptcha challenge,
                        #  response and id fields returned by processRecaptcha.)
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib.urlencode(post)

                    if 'rawpost' in m:
                        post = m['rawpost']
                        # (same disabled $LiveStreamRecaptcha handling as above,
                        #  in query-string form)

                    link = ''
                    try:
                        if post:
                            response = urllib2.urlopen(req, post)
                        else:
                            response = urllib2.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        if 'proxy' in m and current_proxies is not None:
                            urllib2.install_opener(urllib2.build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        # print link  # this would dump the whole page to the log
                        if 'includeheaders' in m:
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(b) + '\n'
                            link += '$$HEADERS_END$$:'
                        response.close()
                    except:
                        pass
                    cachedPages[m['page']] = link

                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '', cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs, m['expres'], cookieJar, recursiveCall=True, cachedPages=cachedPages)

            if not m['expres'] == '':
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    url = url.replace("$doregex[" + k + "]", val)
                elif m['expres'].startswith('$pyFunction:') or '#$pyFunction' in m['expres']:
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1], link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']:
                        return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar
                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs
                    val = ''
                    if not link == '':
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] is None:
                        val = m['expres']
                    if rawPost:
                        val = urllib.quote_plus(val)
                    if 'htmlunescape' in m:
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
            else:
                url = url.replace("$doregex[" + k + "]", '')

    # Final global substitutions on the resolved url.
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())
    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))

    if recursiveCall:
        return url
    if url == "":
        return
    else:
        return url, setresolved
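# Hedged usage sketch for getRegexParsed. The entry name 'streamurl', the
# sample page URL and the pattern below are illustrative only; the function
# itself just requires each regexs entry to carry at least 'page' and
# 'expres', with the other keys ('referer', 'post', 'cookiejar', ...) optional.
regexs = {
    'streamurl': {
        'page': 'http://example.com/embed/12345',   # page to fetch
        'expres': r'file\s*:\s*"([^"]+)"',          # group(1) becomes the value
        'referer': 'http://example.com/',           # optional request header
    }
}

# Every $doregex[name] placeholder in the url is replaced by the value
# extracted with regexs[name]['expres'] from the fetched page.
resolved = getRegexParsed(regexs, 'plugin://player/?url=$doregex[streamurl]')
if resolved:
    url, setresolved = resolved
    print url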
import urllib
import re
import threading
import HTMLParser

concmd = ['/load_blacklist']

blacklist_lock = threading.Lock()
blacklist = None
html_unescape = HTMLParser.HTMLParser().unescape


def load_blacklist():
    # Read blacklist.txt and compile one anchored regex per non-empty line.
    global blacklist, blacklist_lock
    blacklist_lock.acquire()
    try:
        blacklist = []
        f = open("blacklist.txt", 'r')
        for line in f:
            line = line.rstrip('\n')
            if line != '':
                blacklist.append(re.compile('^' + line + '$'))
        f.close()
    finally:
        blacklist_lock.release()


def matchprotocol(string, protocol):
    # True if `string` starts with `protocol` (e.g. "http://").
    return len(protocol) <= len(string) and string[:len(protocol)] == protocol
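# Hypothetical helper built on the module above: the name is_blacklisted, the
# parameter, and the sample patterns are assumptions, not part of the original
# code, which does not show where the compiled blacklist is consulted.
def is_blacklisted(text):
    blacklist_lock.acquire()
    try:
        if blacklist is None:
            return False
        return any(pattern.match(text) for pattern in blacklist)
    finally:
        blacklist_lock.release()

# blacklist.txt might contain one pattern per line, e.g.:
#   spam.*
#   .*bot[0-9]+
# load_blacklist() anchors each pattern as ^...$ before compiling it.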
def __init__(self, idMembro, cvLattesHTML):
    HTMLParser.__init__(self)  # required base-class initialization

    self.idMembro = idMembro
    self.sexo = 'Masculino'
    self.item = ''

    self.listaIDLattesColaboradores = []
    self.listaFormacaoAcademica = []
    self.listaProjetoDePesquisa = []
    self.listaAreaDeAtuacao = []
    self.listaIdioma = []
    self.listaPremioOuTitulo = []
    self.listaArtigoEmPeriodico = []
    self.listaLivroPublicado = []
    self.listaCapituloDeLivroPublicado = []
    self.listaTextoEmJornalDeNoticia = []
    self.listaTrabalhoCompletoEmCongresso = []
    self.listaResumoExpandidoEmCongresso = []
    self.listaResumoEmCongresso = []
    self.listaArtigoAceito = []
    self.listaApresentacaoDeTrabalho = []
    self.listaOutroTipoDeProducaoBibliografica = []
    self.listaSoftwareComPatente = []
    self.listaSoftwareSemPatente = []
    self.listaProdutoTecnologico = []
    self.listaProcessoOuTecnica = []
    self.listaTrabalhoTecnico = []
    self.listaOutroTipoDeProducaoTecnica = []
    self.listaProducaoArtistica = []
    self.listaOASupervisaoDePosDoutorado = []
    self.listaOATeseDeDoutorado = []
    self.listaOADissertacaoDeMestrado = []
    self.listaOAMonografiaDeEspecializacao = []
    self.listaOATCC = []
    self.listaOAIniciacaoCientifica = []
    self.listaOAOutroTipoDeOrientacao = []
    self.listaOCSupervisaoDePosDoutorado = []
    self.listaOCTeseDeDoutorado = []
    self.listaOCDissertacaoDeMestrado = []
    self.listaOCMonografiaDeEspecializacao = []
    self.listaOCTCC = []
    self.listaOCIniciacaoCientifica = []
    self.listaOCOutroTipoDeOrientacao = []
    self.listaParticipacaoEmEvento = []
    self.listaOrganizacaoDeEvento = []

    # flags initialized to avoid an exhaustive search for certain keywords
    self.salvarAtualizacaoCV = 1
    self.salvarFoto = 1
    self.procurarCabecalho = 0
    self.achouGrupo = 0
    self.doi = ''
    self.relevante = 0
    self.idOrientando = ''

    # work around some errors in the HTML produced by the Lattes platform
    cvLattesHTML = cvLattesHTML.replace("<![CDATA[", "")
    cvLattesHTML = cvLattesHTML.replace("]]>", "")

    # feed it!
    cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities': 1})
    # print errors
    # print cvLattesHTML.encode("utf8")

    ## earlier attempt (incorrect: not predictable)
    # options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
    # cvLattesHTML = str(tidy.parseString(cvLattesHTML, **options)).decode("utf8")

    self.feed(cvLattesHTML)
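# Minimal sketch of the tidy step above in isolation, assuming tidy_document is
# pytidylib's function (as the options dict in the constructor suggests). The
# sample markup is made up.
from tidylib import tidy_document

raw = "<html><body><b>unclosed & malformed<p>fragment</body></html>"
cleaned, errors = tidy_document(raw, options={'numeric-entities': 1})
# `cleaned` is well-formed markup that is safe to hand to HTMLParser.feed();
# `errors` lists the problems tidy had to repair.
print cleaned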
def feed(self, data):
    HTMLParser.feed(self, data)
    return self.root
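# A minimal, hypothetical parser illustrating the pattern this feed() override
# relies on: the subclass builds a tree while parsing and exposes it as
# self.root, so feed() can return the finished tree to the caller. The
# TreeBuilder class is an assumption for illustration (it ignores void tags
# such as <br>); it is not part of the original code.
from HTMLParser import HTMLParser

class TreeBuilder(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.root = {'tag': None, 'children': []}
        self.stack = [self.root]

    def handle_starttag(self, tag, attrs):
        node = {'tag': tag, 'attrs': dict(attrs), 'children': []}
        self.stack[-1]['children'].append(node)
        self.stack.append(node)

    def handle_endtag(self, tag):
        if len(self.stack) > 1:
            self.stack.pop()

    def feed(self, data):
        HTMLParser.feed(self, data)
        return self.root

tree = TreeBuilder().feed("<div><p>hello</p></div>")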