Example 1
    def __init__(self):
        HTMLParser.__init__(self)
        self.withinlinkdiv = -1
        self.current_value = 0  # 0 = nothing, 1 = votes, 2 = title; used to link the data to the tag
        self.tempdata = [0, '', '', '']  # votes, title, link, comment link, before it is put into the submission
        self.sublist = []  # list of the submissions
Example 2
    def __init__(self, ldomain, scandpth, lps):
        HTMLParser.__init__(self)
        self.url = ldomain
        self.db = {self.url: 1}
        self.node = [self.url]
 
        self.depth = scandpth 
        self.max_span = lps 
        self.links_found = 0
Example 3
	def __init__(self, idMembro, cvLattesXML):
		HTMLParser.__init__(self)

		# mandatory initialization
		self.idMembro = idMembro

		self.item = ''
		self.listaIDLattesColaboradores = []
		self.listaFormacaoAcademica = []
		self.listaProjetoDePesquisa = []
		self.listaAreaDeAtuacao = []
		self.listaIdioma = []
		self.listaPremioOuTitulo = []

		self.listaArtigoEmPeriodico = []
		self.listaLivroPublicado = []
		self.listaCapituloDeLivroPublicado = []
		self.listaTextoEmJornalDeNoticia = []
		self.listaTrabalhoCompletoEmCongresso = []
		self.listaResumoExpandidoEmCongresso = []
		self.listaResumoEmCongresso = []
		self.listaArtigoAceito = []
		self.listaApresentacaoDeTrabalho = []
		self.listaOutroTipoDeProducaoBibliografica = []

		self.listaSoftwareComPatente = []
		self.listaSoftwareSemPatente = []
		self.listaProdutoTecnologico = []
		self.listaProcessoOuTecnica = []
		self.listaTrabalhoTecnico = []
		self.listaOutroTipoDeProducaoTecnica = []
		self.listaProducaoArtistica = []

		self.listaOASupervisaoDePosDoutorado = []
		self.listaOATeseDeDoutorado = []
		self.listaOADissertacaoDeMestrado = []
		self.listaOAMonografiaDeEspecializacao = []
		self.listaOATCC = []
		self.listaOAIniciacaoCientifica = []
		self.listaOAOutroTipoDeOrientacao = []

		self.listaOCSupervisaoDePosDoutorado = []
		self.listaOCTeseDeDoutorado = []
		self.listaOCDissertacaoDeMestrado = []
		self.listaOCMonografiaDeEspecializacao = []
		self.listaOCTCC = []
		self.listaOCIniciacaoCientifica = []
		self.listaOCOutroTipoDeOrientacao = []

		# initialization
		self.idLattes = ''
		self.url      = ''
		self.foto     = ''

		# feed it!
		# print cvLattesXML #.encode("utf8")
		self.feed(cvLattesXML)
Example 4
def topTen(win, feedUrl, background, foreground):
    feed = urlopen(feedUrl).read()
    feedTitles = re.findall(r'<title>(.*?)</title>', feed)
    for index, title in enumerate(feedTitles[2:12]):
        parser = HTMLParser()
        text = parser.unescape(title)
        displayText = "{}. {}".format(index + 1, text)
        w = Label(win, text=displayText)
        w.config(bg=background, fg=foreground)
        w.pack()
    x = Label(win, text=feedUrl)
    x.config(bg=background, fg=foreground)
    x.pack()
    closure = lambda: save(feedTitles)
    Button(win, text="Save", command=closure).pack(side="right", expand=True, fill=X)
Example 5
	def run(self, edit):
		for s in self.view.sel():
			if s.empty():
				s = self.view.word(s)

			selected = unicode(self.view.substr(s))
			import HTMLParser
			parser = HTMLParser.HTMLParser()
			selected = parser.unescape(selected)
			self.view.replace(edit, s, selected)
Example 6
def top10(url, regex):
    #Download html code of web page
    html_code = download_HTML_code(url)

    #Find relevant data stored in html code
    _top10_ = findall(regex, html_code)

    #create a list that stores unicode string data
    top10_unescaped = []
    h = HTMLParser()
    #Index of list
    counter = 0
    for index in _top10_:
        counter= counter + 1

        top10_unescaped.append(h.unescape(index))
        # Don't take more than 10 elements in the list
        if counter == 10:
            break
    #return top10 data list
    return top10_unescaped
Example 7
def strip_tags(html):
    from HTMLParser import HTMLParser
    html=html.strip()
    html=html.strip("\n")
    result=[]
    parse=HTMLParser()
    parse.handle_data=result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example 8
    def strip_tags(self, html):
        '''
        Strip the HTML tags from the given markup.
        '''
        html = html.strip()
        html = html.strip("\n")
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(html)
        parser.close()
        return ''.join(result).strip()
Example 9
	def __init__(self, rawDOIhtml):
		HTMLParser.__init__(self)
		self.dadosDaPublicacao = ""
		self.feed(rawDOIhtml)
Example 10
u = urllib.urlopen(urlString)
#print u.info()

lParser.feed(u.read())
lParser.close()

#---------------------------------
#this part displays all the links
#---------------------------------

import urllib
urllib.urlretrieve( 'http://www.comicsalliance.com/2011/03/11/best-art-ever-this-week-3-10-11/', '/tmp/CAmain.htm' )
from htmllib import HTMLParser  # htmllib has been deprecated in favor of HTMLParser; HTMLParser has been renamed html.parser in Python 3
from formatter import NullFormatter
parser= HTMLParser( NullFormatter( ) )
parser.feed( open( '/tmp/CAmain.htm' ).read( ) )


#import urlparse

linecounter=0
for a in parser.anchorlist:
#    print urlparse.urljoin( 'http://python.org/', a )
    if linecounter>105 and linecounter<145:
        print(a)
    linecounter+=1



"""
Example 11
    def __init__(self):
        HTMLParser.__init__(self)
        self.started = True
        self.bad = False
        self.text = []
        self.urls = []
Example 12
import time
import logging
import random
import functools
import os
import tempfile
import HTMLParser

import commonware.log
import lockfile
from polib import pofile

from django.conf import settings
from django.core.cache import get_cache

log = commonware.log.getLogger('mdn.devmo.utils')
htmlparser = HTMLParser.HTMLParser()


def strings_are_translated(strings, locale):
    # http://stackoverflow.com/a/24339946/571420
    pofile_path = os.path.join(settings.ROOT, 'locale', locale, 'LC_MESSAGES',
                               'messages.po')
    try:
        po = pofile(pofile_path)
    except IOError:  # in case the file doesn't exist or couldn't be parsed
        return False
    all_strings_translated = True
    for string in strings:
        if not any(e for e in po if e.msgid == string and (
                e.translated() and 'fuzzy' not in e.flags) and not e.obsolete):
                all_strings_translated = False
    return all_strings_translated
Example 13
    def unescape(self, string):
        try:
            pars = HTMLParser.HTMLParser()
            return pars.unescape(string)
        except:
            return string
Example 14
    def __init__(self):
        self.root = None
        self.tree = []
        HTMLParser.__init__(self)
Example 15
r4 = re.compile("title=[A-Z][_:a-zA-Z0-9]*")

pagename = "Main_Page"
if os.environ.has_key('QUERY_STRING'):
    qs = os.environ['QUERY_STRING']
    m = r4.search(qs)
    if m is not None:
        pagename = qs[m.start() + 6:m.end()]

if DEBUG:
    print 'pagename=' + pagename

url = "http://localhost/wiki/index.php?title=" + pagename + "&action=edit"
R = urllib.urlopen(url).read()
R = R[r1.search(R).end():r2.search(R).start()]
R = HTMLParser.HTMLParser().unescape(R).split('\n')
if DEBUG:
    print R
    print '=========================='

rdfState = 0
for L in R:
    m = r3.search(L)
    if rdfState == 0 and m is not None:
        rdfState = 1
    elif rdfState == 1 and L == '<pre>':
        rdfState = 2
    elif L == '</pre>':
        rdfState = 0
    elif rdfState == 2:
        print L
Example 16
def clean_tweet(tweet):

    more_stop_words = ['rt', 'cant','didnt','doesnt','dont','goes','isnt','hes','shes','thats','theres',\
       'theyre','wont','youll','youre','youve', 'br', 've', 're', 'vs', 'goes','isnt',\
       'hes', 'shes','thats','theres','theyre','wont','youll','youre','youve', 'br',\
                      've', 're', 'vs', 'this', 'i', 'get','cant','didnt','doesnt','dont','goes','isnt','hes',\
       'shes','thats','theres','theyre','wont','youll','youre','youve', 'br', 've', 're', 'vs']

    # start with the initial list and add the additional words to it.
    stoplist = nltk.corpus.stopwords.words('english') + more_stop_words

    # define list of codes to be dropped from document
    # carriage-returns, line-feeds, tabs
    codelist = ['\r', '\n', '\t']

    # insert a space at the beginning and end of the tweet
    # tweet = ' ' + tweet + ' '

    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    tweet = re.sub('http[^\\s]+', ' ', tweet)
    tweet = re.sub(r"\[", '', tweet)
    tweet = re.sub(r"\]", '', tweet)
    tweet = re.sub(r"'rt", '', tweet)
    tweet = re.sub(r'\'', '', tweet)
    tweet = re.sub(r'\'\,', '', tweet)
    tweet = re.sub(r'\,\'', '', tweet)
    tweet = re.sub('rt[^\\s]+', '', tweet)
    tweet = re.sub(r"' ,", '', tweet)
    tweet = re.sub(r"\' ,", '', tweet)
    tweet = re.sub(r", ',',", '', tweet)
    tweet = re.sub(r"\,", '', tweet)
    tweet = re.sub(r"\, \"\'\"\,", '', tweet)
    tweet = re.sub(r"\, \"\' \,\"\,", '', tweet)
    tweet = re.sub(r"\, \"\'\ \,\"\,", '', tweet)
    tweet = re.sub(r"\,\ \"\'\"\,", '', tweet)
    tweet = re.sub(r"\,", '', tweet)
    tweet = re.sub(r"\"", '', tweet)
    tweet = re.sub(r"\'", '', tweet)
    tweet = re.sub(r"\'\,", '', tweet)
    tweet = re.sub(r'"', '', tweet)
    tweet = re.sub(",", '', tweet)

    temp_tweet = re.sub('[^a-zA-Z]', ' ',
                        tweet)  # replace non-alphanumeric with space
    html_parser = HTMLParser.HTMLParser()
    tweet = html_parser.unescape(tweet)
    # temp_tweet = re.sub('\d', '  ', temp_tweet)

    for i in range(len(codelist)):
        stopstring = ' ' + codelist[i] + '  '
        temp_tweet1 = re.sub(stopstring, '  ', temp_tweet)

    # convert uppercase to lowercase
    temp_tweet = temp_tweet1.lower()

    # replace single-character words with space
    temp_tweet = re.sub('\s.\s', ' ', temp_tweet)

    # replace selected character strings/stop-words with space
    for i in range(len(stoplist)):
        stopstring = ' ' + str(stoplist[i]) + ' '
        temp_tweet = re.sub(stopstring, ' ', temp_tweet)

    # replace multiple blank characters with one blank character
    temp_tweet = re.sub('\s+', ' ', temp_tweet)

    return (temp_tweet)
Example 17
import HTMLParser
import sys
import re
import numpy as np
#from New_Utils import *

html_parser = HTMLParser.HTMLParser()
reload(sys)
sys.setdefaultencoding('utf8')
#Dictionary for mapping contractions
APPOSTOPHES={
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
Example 18
def getCarnegieGenerator():
    """
    Generator to return Carnegie Museum of Art paintings
    """

    urls = []

    # TODO: Reimplement based on https://collection.cmoa.org/?classification=paintings&page=2&perPage=10&withImage=0
    # It's now JSON and contains images

    # First get the Painting

    # And let's get the paintings too

    apiurl = u'http://collection.cmoa.org/CollectionSearch.aspx/GetSearchResults'
    postjson = u'{"SearchText": "", "Nationality": "Any", "DateRange": "Any", "Classification": "%s", "Theme": "Any", "Department": "Any", "Location": "Any", "WithImages": "false", "WithVideo": "false", "WithAudio": "false", "TeenieHarris": "false", "SortOrder": "alpha-artist", "PageNumber": "%s", "NumberPerPage": "48", "PriorParams": "%s"}'
    #postjson = u'{"SearchText": "", "Nationality": "Any", "DateRange": "Any", "Classification": "%s", "Theme": "Any", "Department": "Any", "Location": "Any", "WithImages": "false", "WithVideo": "false", "WithAudio": "false", "TeenieHarris": "false", "SortOrder": "alpha-artist", "PageNumber": "%s", "NumberPerPage": "48", "PriorParams": "Y2xhc3NpZmljYXRpb249UGFpbnRpbmd8"}'
    referer = u'http://collection.cmoa.org/collection-search/'
    #for classification in [u'Painting', u'paintings']:
    #firsturl = u'http://collection.cmoa.org/collection-search/'

    htmlparser = HTMLParser.HTMLParser()

    session = requests.Session()
    searchPage = session.get(referer, verify=False)

    urlregex = u'\<a href\=\"(CollectionDetail\.aspx\?item\=\d+)\&'

    tosearch = [  #(u'Painting', 12, u'Y2xhc3NpZmljYXRpb249UGFpbnRpbmd8'), # 488, 48 per page
        (u'paintings', 14,
         u'Y2xhc3NpZmljYXRpb249cGFpbnRpbmdzfA9999'),  # 605, 48 per page
    ]
    for (classification, endpage, priorsearch) in tosearch:
        for i in range(1, endpage):
            try:
                searchpage = session.post(
                    apiurl,
                    data=postjson % (
                        classification,
                        i,
                        priorsearch,
                    ),
                    headers={
                        'X-Requested-With': 'XMLHttpRequest',
                        'referer': referer,
                        u'Content-Type': u'application/json; charset=utf-8'
                    })
            except requests.exceptions.ConnectionError:
                pywikibot.output(
                    u'Could not get the search page. Sleeping and trying again'
                )
                time.sleep(60)
                searchpage = session.post(
                    apiurl,
                    data=postjson % (
                        classification,
                        i,
                        priorsearch,
                    ),
                    headers={
                        'X-Requested-With': 'XMLHttpRequest',
                        'referer': referer,
                        u'Content-Type': u'application/json; charset=utf-8'
                    })
            print apiurl
            #print postjson % (classification, i,)
            print searchpage.text
            searchjson = searchpage.json()
            matches = re.finditer(urlregex, searchjson.get(u'd'))
            for match in matches:
                metadata = {}
                url = u'http://collection.cmoa.org/%s' % (match.group(1), )

                # Museum site probably doesn't like it when we go fast
                time.sleep(5)

                pywikibot.output(url)

                itempage = requests.get(url)
                metadata['url'] = url

                metadata['collectionqid'] = u'Q1043967'
                metadata['collectionshort'] = u'CMoA'
                metadata['locationqid'] = u'Q1043967'

                #No need to check, I'm actually searching for paintings.
                metadata['instanceofqid'] = u'Q3305213'

                titlecreatorregex = u'\<div id\=\"detail-data-container\"\>[\s\t\r\n]*\<hgroup class\=\"page-titles\"\>[\s\t\r\n]*\<h1 class\=\"italic\"\>(?P<title>[^\<]+)\<\/h1\>[\s\t\r\n]*\<h2 class\=\"sub1\"\>(?P<name>[^\<]+)\<\/h2\>[\s\t\r\n]*\<h2 class\=\"sub2\"\>(?P<date>[^\<]+)?\<\/h2\>'
                titlecreatormatch = re.search(titlecreatorregex, itempage.text)

                title = htmlparser.unescape(
                    titlecreatormatch.group(u'title').strip())
                name = htmlparser.unescape(
                    titlecreatormatch.group(u'name').strip())

                # Chop chop, in case we have very long titles
                if len(title) > 220:
                    title = title[0:200]
                metadata['title'] = {
                    u'en': title,
                }
                metadata['creatorname'] = name
                metadata['description'] = {
                    u'en': u'painting by %s' % (name, ),
                    u'nl': u'schilderij van %s' % (name, ),
                }

                if titlecreatormatch.group(u'date'):
                    metadata['inception'] = htmlparser.unescape(
                        titlecreatormatch.group(u'date').strip())

                idregex = u'\<span class\=\"label\"\>Accession Number\<\/span\>[\s\t\r\n]*\<span class\=\"value\"\>([^\<]+)\<\/span\>'
                idmatch = re.search(idregex, itempage.text)
                metadata['idpid'] = u'P217'
                metadata['id'] = idmatch.group(1).strip()

                mediumregex = u'\<span class\=\"label\"\>Medium\<\/span\>[\s\t\r\n]*\<span class\=\"value\"\>oil on canvas\<\/span\>'
                mediummatch = re.search(mediumregex, itempage.text)
                if mediummatch:
                    metadata['medium'] = u'oil on canvas'

                dimensionsregex = u'\<span class\=\"label\"\>Measurements\<\/span\>[\s\t\r\n]*\<span class\=\"value\"\>([^\<]+)\<\/span\>'
                dimensionsmatch = re.search(dimensionsregex, itempage.text)

                if dimensionsmatch:
                    dimensiontext = dimensionsmatch.group(1).strip()
                    regex_2d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) cm\)$'
                    regex_3d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) x (?P<depth>\d+(\.\d+)?) cm\)$'
                    match_2d = re.match(regex_2d, dimensiontext)
                    match_3d = re.match(regex_3d, dimensiontext)
                    if match_2d:
                        metadata['heightcm'] = match_2d.group(u'height')
                        metadata['widthcm'] = match_2d.group(u'width')
                    elif match_3d:
                        metadata['heightcm'] = match_3d.group(u'height')
                        metadata['widthcm'] = match_3d.group(u'width')
                        metadata['depthcm'] = match_3d.group(u'depth')

                # https://collection.cmoa.org/objects/b1357c35-c930-4e15-9d8a-e50bba6bd03a
                # https://cmoa-collection-images.s3.amazonaws.com/133250/sizes/1035976-840.jpg
                # https://cmoa-collection-images.s3.amazonaws.com/133250/1035976.jpg

                yield metadata
Example 19
    def unescape(self, text):
        if sys.version_info[0] < 3:
            parser = HTMLParser.HTMLParser()
        else:
            parser = html.parser.HTMLParser()
        return parser.unescape(text)
Example 20
try:
    sys.path.insert(
        0, os.path.join(Addon.getAddonInfo('path'), r'resources', r'lib'))
    from BeautifulSoup import BeautifulSoup
except:
    try:
        sys.path.insert(
            0, os.path.join(Addon.getAddonInfo('path'), r'resources', r'lib'))
        from BeautifulSoup import BeautifulSoup
    except:
        sys.path.append(os.path.join(os.getcwd(), r'resources', r'lib'))
        from BeautifulSoup import BeautifulSoup
        icon = xbmc.translatePath(
            os.path.join(os.getcwd().replace(';', ''), 'icon.png'))

import HTMLParser

hpar = HTMLParser.HTMLParser()

h = int(sys.argv[1])


def showMessage(heading, message, times=3000):
    xbmc.executebuiltin('XBMC.Notification("%s", "%s", %s, "%s")' %
                        (heading, message, times, icon))


#---------- parameter/info structure -------------------------------------------
class Param:
    page = '1'
    genre = ''
    genre_name = ''
    max_page = 0
Example 21
    def _doSearch(self,
                  search_strings,
                  search_mode='eponly',
                  epcount=0,
                  age=0,
                  epObj=None):
        results = []
        items = {'Season': [], 'Episode': [], 'RSS': []}

        for mode in search_strings.keys():
            logger.log(u"Search Mode: %s" % mode, logger.DEBUG)
            for search_string in search_strings[mode]:

                self.search_params.update({
                    'q':
                    search_string.encode('utf-8'),
                    'field': ('seeders', 'time_add')[mode == 'RSS']
                })

                if mode != 'RSS':
                    logger.log(u"Search string: %s" % search_string,
                               logger.DEBUG)

                try:
                    searchURL = self.urls[('search', 'rss')[
                        mode == 'RSS']] + '?' + urlencode(self.search_params)
                    logger.log(u"Search URL: %s" % searchURL, logger.DEBUG)
                    data = self.getURL(searchURL)
                    #data = self.getURL(self.urls[('search', 'rss')[mode == 'RSS']], params=self.search_params)
                    if not data:
                        logger.log("No data returned from provider",
                                   logger.DEBUG)
                        continue

                    if not data.startswith('<?xml'):
                        logger.log(
                            u'Expected xml but got something else, is your proxy failing?',
                            logger.INFO)
                        continue

                    try:
                        data = xmltodict.parse(
                            HTMLParser.HTMLParser().unescape(
                                data.encode('utf-8')).replace('&', '&amp;'))
                    except ExpatError as e:
                        logger.log(
                            u"Failed parsing provider. Traceback: %r\n%r" %
                            (traceback.format_exc(), data), logger.ERROR)
                        continue

                    if not all([
                            data, 'rss' in data, 'channel' in data['rss'],
                            'item' in data['rss']['channel']
                    ]):
                        logger.log(u"Malformed rss returned, skipping",
                                   logger.DEBUG)
                        continue

                    # https://github.com/martinblech/xmltodict/issues/111
                    entries = data['rss']['channel']['item']
                    entries = entries if isinstance(entries,
                                                    list) else [entries]

                    for item in entries:
                        try:
                            title = item['title'].decode('utf-8')

                            # Use the torcache link kat provides,
                            # unless it is not torcache or we are not using blackhole
                            # because we want to use magnets if connecting direct to client
                            # so that proxies work.
                            download_url = item['enclosure']['@url']
                            if sickbeard.TORRENT_METHOD != "blackhole" or 'torcache' not in download_url:
                                download_url = item['torrent:magnetURI']

                            seeders = int(item['torrent:seeds'])
                            leechers = int(item['torrent:peers'])
                            verified = bool(int(item['torrent:verified']) or 0)
                            size = int(item['torrent:contentLength'])

                            info_hash = item['torrent:infoHash']
                            #link = item['link']

                        except (AttributeError, TypeError, KeyError):
                            continue

                        try:
                            pubdate = datetime.datetime.strptime(
                                item['pubDate'], '%a, %d %b %Y %H:%M:%S +0000')
                        except Exception:
                            pubdate = datetime.datetime.today()

                        if not all([title, download_url]):
                            continue

                        #Filter unseeded torrent
                        if seeders < self.minseed or leechers < self.minleech:
                            if mode != 'RSS':
                                logger.log(
                                    u"Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})"
                                    .format(title, seeders,
                                            leechers), logger.DEBUG)
                            continue

                        if self.confirmed and not verified:
                            if mode != 'RSS':
                                logger.log(
                                    u"Found result " + title +
                                    " but that doesn't seem like a verified result so I'm ignoring it",
                                    logger.DEBUG)
                            continue

                        item = title, download_url, size, seeders, leechers
                        if mode != 'RSS':
                            logger.log(u"Found result: %s " % title,
                                       logger.DEBUG)

                        items[mode].append(item)

                except Exception:
                    logger.log(
                        u"Failed parsing provider. Traceback: %r" %
                        traceback.format_exc(), logger.ERROR)

            #For each search mode sort all the items by seeders if available
            items[mode].sort(key=lambda tup: tup[3], reverse=True)

            results += items[mode]

        return results
Example 22
import HTMLParser
import collections
import itertools
import random
import string
import time

import requests

from pylons import app_globals as g

from r2.lib.db import queries
from r2.lib import amqp
from r2.lib.utils import weighted_lottery, get_requests_resp_json
from r2.models import Account, NotFound, register, Subreddit, Link, Comment

unescape_htmlentities = HTMLParser.HTMLParser().unescape


class TextGenerator(object):
    """A Markov Chain based text mimicker."""
    def __init__(self, order=8):
        self.order = order
        self.starts = collections.Counter()
        self.start_lengths = collections.defaultdict(collections.Counter)
        self.models = [
            collections.defaultdict(collections.Counter)
            for i in xrange(self.order)
        ]

    @staticmethod
    def _in_groups(input_iterable, n):
Example 23
def clean_content(content):
    soup = BeautifulSoup(content)
    content = soup.text.strip()
    h = HTMLParser.HTMLParser()
    return h.unescape(content)
Example 24
def replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
Example 25
    def __init__(self):
        HTMLParser.__init__(self)
        self.recording = 0
        self.data = []
Example 26
def replaceEscapeCodes(txt):
    txt = HTMLParser.HTMLParser().unescape(txt)
    return txt
Example 27
# Outputs JSON to STDOUT. Run and save with:
#  ./run structure > structure.json
#
# options:
#   year: "uscprelim" (the default), or a specific year version of the Code (e.g. "2011")
#   title: Do only a specific title (e.g. "5", "5a", "25")
#   sections: Return a flat hierarchy of only titles and sections (no intervening layers)
#   debug: Output debug messages only, and no JSON output (dry run)
#   force: Force a re-download of the US Code for the given year (script defaults to caching if the directory for a year is present)

import glob, re, lxml.html, json, sys, os

import utils

import HTMLParser
pars = HTMLParser.HTMLParser()

section_symbol = u'\xa7'


def run(options):
    year = options.get("year", "uscprelim")  # default to USCprelim

    # optional: don't print json out, just --debug information
    debug = options.get('debug', False)

    # optional: limit to a specific --title
    title = options.get("title", None)
    if title:
        # appendix cites may look like "5a" but we need "05a" to match the file
        if title.endswith("a"):
Example 28
from functools import partial
from future_builtins import filter, map, zip
from multiprocessing.pool import ThreadPool
from xml.sax.saxutils import escape, quoteattr
# }}}

USER_AGENT = 'calibre mirror'
MR_URL = 'https://www.mobileread.com/forums/'
IS_PRODUCTION = os.path.exists('/srv/plugins')
WORKDIR = '/srv/plugins' if IS_PRODUCTION else '/t/plugins'
PLUGINS = 'plugins.json.bz2'
INDEX = MR_URL + 'showpost.php?p=1362767&postcount=1'
# INDEX = 'file:///t/raw.html'

IndexEntry = namedtuple('IndexEntry', 'name url donate history uninstall deprecated thread_id')
u = HTMLParser.HTMLParser().unescape

socket.setdefaulttimeout(30)


def read(url, get_info=False):  # {{{
    if url.startswith("file://"):
        return urllib2.urlopen(url).read()
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-Agent', USER_AGENT),
        ('Accept-Encoding', 'gzip,deflate'),
    ]
    # Sporadic network failures in rackspace, so retry with random sleeps
    for i in range(10):
        try:
Example 29
def filter_cases(request, domain, app_id, module_id):
    app = Application.get(app_id)
    module = app.get_module(module_id)
    delegation = request.GET.get('task-list') == 'true'
    auth_cookie = request.COOKIES.get('sessionid')

    suite_gen = SuiteGenerator(app)
    xpath = suite_gen.get_filter_xpath(module, delegation=delegation)
    extra_instances = [{
        'id': inst.id,
        'src': inst.src
    } for inst in suite_gen.get_extra_instances(module)]

    # touchforms doesn't like this to be escaped
    xpath = HTMLParser.HTMLParser().unescape(xpath)
    if delegation:
        case_type = DELEGATION_STUB_CASE_TYPE
    else:
        case_type = module.case_type

    if xpath:
        # if we need to do a custom filter, send it to touchforms for processing
        additional_filters = {
            "properties/case_type": case_type,
            "footprint": True
        }

        helper = SessionDataHelper(domain, request.couch_user)
        result = helper.filter_cases(xpath,
                                     additional_filters,
                                     DjangoAuth(auth_cookie),
                                     extra_instances=extra_instances)
        if result.get('status', None) == 'error':
            return HttpResponseServerError(
                result.get("message",
                           _("Something went wrong filtering your cases.")))

        case_ids = result.get("cases", [])
    else:
        # otherwise just use our built in api with the defaults
        case_ids = [
            res.id
            for res in get_filtered_cases(domain,
                                          status=CASE_STATUS_OPEN,
                                          case_type=case_type,
                                          user_id=request.couch_user._id,
                                          ids_only=True)
        ]

    cases = [
        CommCareCase.wrap(doc)
        for doc in iter_docs(CommCareCase.get_db(), case_ids)
    ]
    # refilter these because we might have accidentally included footprint cases
    # in the results from touchforms. this is a little hacky but the easiest
    # (quick) workaround. should be revisited when we optimize the case list.
    cases = filter(lambda c: c.type == case_type, cases)
    cases = [c.get_json(lite=True) for c in cases if c]
    parents = []
    if delegation:
        for case in cases:
            parent_id = case['indices']['parent']['case_id']
            parents.append(CommCareCase.get(parent_id))
        return json_response({'cases': cases, 'parents': parents})
    else:
        return json_response(cases)
Example 30
# encoding: utf-8

import tweepy  #https://github.com/tweepy/tweepy
import csv
import string
import markovify
import shelve
import random
import time
import HTMLParser
from datetime import datetime

# READ THE README.MD!
import credentials as creds

h = HTMLParser.HTMLParser()

#Twitter API credentials
consumer_key = creds.consumer_key
consumer_secret = creds.consumer_secret
access_key = creds.access_key
access_secret = creds.access_secret

done_ids = shelve.open('parsed_ids')

#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

Example 31
import difflib
import SendEmail
import re
import HTMLParser


current_file = open("Adoptable Animals.html", "r")
new_file = open("Adoptable Animals Mod.html", "r")

email = ""
for line in difflib.unified_diff(str(current_file.readlines()), str(new_file.readlines())):
     if not re.match(r'-', line):
          email += str(line.strip('+-!'))

name_list = HTMLParser.parse_html(email)
SendEmail.authorize_and_send_message(str(name_list))

print(name_list)
Example 32
import requests
import xbmcgui
import xbmcplugin
import xbmcaddon
import urlresolver
import urllib2
import urllib
import time
import json
import xbmc
import sys
import re
import HTMLParser

_handle = int(sys.argv[1])
addon_id = 'plugin.video.cerebrozstrictlyhd'
addon = xbmcaddon.Addon(id=addon_id)
base = 'http://moviegrabber.tv'
query_url = 'http://moviegrabber.tv/searchaskforapi/?id=%s'
regulate = HTMLParser.HTMLParser()


def RESOLVE(link):
    try:
        play_link = urlresolver.HostedMediaFile(link).resolve()
    except:
        play_link = link
    if 'False' in str(play_link): play_link = link

    play_item = xbmcgui.ListItem(path=play_link)
    play_item.setProperty('IsPlayable', 'true')
    xbmcplugin.setResolvedUrl(_handle, True, listitem=play_item)


def cleanHex(s):
Example 33
f = open('facebooktext.txt', 'w')
f.write(html)
f.close()

matches = re.findall(r"<p>([a-zA-Z0-9 !-)(?.,;:'\"_+= -]*)",
                     open('facebooktext.txt', 'r').read())
os.remove('C:\Python27\mockingbird\\facebooktext.txt')

with open('facebook_dictionary.json', 'r') as fb:
    fb_dictionary = json.load(fb)

unknown = open('unknown_facebook_words', 'a')
f = open('facebookwords.txt', 'a')
for match in matches:
    stripped_text = HTMLParser.HTMLParser().unescape(match)
    if stripped_text != '':
        f.write(stripped_text + ' xYx ')
        words = stripped_text.split()
        for word in words:
            word = str(word).lower().translate(None, '",!.?!@#$%^&*()_-:<>')
            POS = mapPOS(word)
            if POS:
                for char in list(POS):
                    if char in fb_dictionary.keys():
                        fb_dictionary[char].append(word)
                    else:
                        fb_dictionary[char] = []
            else:
                unknown.write(word + ', ')
Example 34
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
Example 35
def getPaintingGenerator(query=u''):
    '''

    Doing a two step approach here. Could do one, but would be complicated
    * Loop over http://art.famsf.org/search?search_api_views_fulltext=&f[0]=field_art_class%3A684&page=0 - 20 and grab paintings
    * Grab data from paintings
    '''
    # LOL, these nerds start at zero :-)
    baseurl = u'http://art.famsf.org/search?search_api_views_fulltext=&f[0]=field_art_class%%3A684&page=%s'

    htmlparser = HTMLParser.HTMLParser()

    # Have to restart in the end to do the 9
    # 0 - 20
    for i in range(8, 12):
        searchurl = baseurl % (i, )
        print searchurl
        searchPage = urllib2.urlopen(searchurl)
        searchData = searchPage.read()
        # <div class="views-field views-field-title">        <span class="field-content"><a href="/nicolas-maes/portrait-man-19425">Portrait of a Man</a>

        itemregex = u'<div class="views-field views-field-title">\s*<span class="field-content"><a href="([^"]+)">([^<]+)</a>'

        for match in re.finditer(itemregex, searchData):
            url = u'http://art.famsf.org%s' % (match.group(1), )
            title = htmlparser.unescape(unicode(match.group(2), "utf-8"))
            print url

            itemPage = urllib2.urlopen(url)
            itemData = itemPage.read()

            metadata = {}
            metadata['url'] = url
            metadata['title'] = title

            creatorregex = u'<span class="views-label">Artist: </span>\s*<div class="field-content"><a href="([^"]+)">([^<]+)</a></div>'

            dateeregex = u'<div class="views-field views-field-field-art-display-date">\s*<span class="views-label views-label-field-art-display-date">Date:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>'
            locationegex = u'<div class="views-field views-field-field-art-location-calculated">\s*<span class="views-label views-label-field-art-location-calculated">Location:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>'
            mediumregex = u'<div class="views-field views-field-field-art-media">\s*<span class="views-label views-label-field-art-media">Media:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>'
            idregex = u'<div class="views-field views-field-field-art-accession-number">\s*<span class="views-label views-label-field-art-accession-number">Accession Number:\s*</span>\s*<div class="field-content">([^<]+)</div>\s*</div>'
            acquisitiondateregex = u'<div class="views-field views-field-field-art-acquisition-date">\s*<span class="views-label views-label-field-art-acquisition-date">Acquisition Date:\s*</span>\s*<div class="field-content"><span class="date-display-single">(\d\d\d\d-\d\d-\d\d)</span></div>\s*</div>'

            creatormatch = re.search(creatorregex, itemData, flags=re.M)
            if creatormatch:
                metadata[u'creator'] = htmlparser.unescape(
                    unicode(creatormatch.group(2), "utf-8"))
            else:
                # Creator not always available
                metadata[u'creator'] = u'anonymous'

            #titlematch = re.search(titleregex, itemData)
            #metadata[u'title']=htmlparser.unescape(unicode(titlematch.group(1), "utf-8"))

            locationmatch = re.search(locationegex, itemData)
            if locationmatch.group(1) == u'de Young':
                # Ok, it's on view at de Young so let's add that
                metadata[u'location'] = u'Q1470276'
            elif locationmatch.group(1) == u'Legion of Honor':
                # Ok, it's on view at the Legion of Honor so let's add that
                metadata[u'location'] = u'Q2468251'
            else:
                # Where? Let's add the main museum
                metadata[u'location'] = u'Q1416890'

            datematch = re.search(dateeregex, itemData)
            # Not always available
            if datematch:
                metadata[u'date'] = htmlparser.unescape(
                    unicode(datematch.group(1), "utf-8"))

            mediummatch = re.search(mediumregex, itemData)
            # Not always available
            if mediummatch:
                metadata[u'medium'] = htmlparser.unescape(
                    unicode(mediummatch.group(1), "utf-8"))

            idmatch = re.search(idregex, itemData)
            metadata[u'id'] = htmlparser.unescape(
                unicode(idmatch.group(1), "utf-8"))
            if u'?' in metadata[u'id']:
                continue

            acquisitiondatematch = re.search(acquisitiondateregex, itemData)
            if acquisitiondatematch:
                metadata[u'acquisitiondate'] = htmlparser.unescape(
                    unicode(acquisitiondatematch.group(1), "utf-8"))

            yield metadata
Example 36
    def __init__(self):
        HTMLParser.__init__(self)
        self.article = Article("", "")
 

One exception is also defined here:

exception HTMLParser.HTMLParseError

Raised by HTMLParser when it encounters an error while parsing. The exception provides three attributes: msg, lineno and offset.

HTMLParser instances have the following methods:

HTMLParser.reset()

Reset the instance. All unprocessed data is lost. This is called implicitly at instantiation time.

HTMLParser.feed(data)

Feed some text to the parser. It is processed insofar as it consists of complete elements; incomplete data is buffered until more data is fed in or close() is called.

HTMLParser.close()

Force processing of all buffered data. This method can be redefined in a derived class to do additional things at the end of the input, but the redefined version should always call the close() method of the HTMLParser base class.
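For illustration, the feed()/close() cycle described above can be exercised with a minimal subclass like the sketch below (Python 2, matching the examples in this collection; the LinkCollector class name and the sample markup are invented for this example):

import HTMLParser

class LinkCollector(HTMLParser.HTMLParser):
    # Hypothetical example subclass: collect the href of every <a> tag seen while parsing.
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

parser = LinkCollector()
parser.feed('<p>See <a href="http://example.com">this page</a>.</p>')
parser.close()  # force processing of any buffered data
print parser.links  # ['http://example.com']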
Example 38
def normalizeTextForTagger(text):
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text
Example 39
    def __init__(self, projectDirectory):
        HTMLParser.__init__(self)
        self.projectDirectory = Sourcerer.cleanDirecoryPath(projectDirectory)
Example 40
def html_type_repl(matchobj):
    return HTMLParser.HTMLParser().unescape(matchobj.group())
Example 41
    def reset(self):
        HTMLParser.reset(self)
Example 42
def getTePapaGenerator():
    """
    Generator to return Museum of New Zealand Te Papa Tongarewa paintings
    """

    htmlparser = HTMLParser.HTMLParser()

    count = 12
    basesearchurl = u'http://collections.tepapa.govt.nz/Search/GetObjectThumbnailsShowMoreForAdvanced/?scope=all&imagesOnly=False&downloadable=False&startIndex=%s&returnCount=%s&advanced=colClassification:"paintings"+colCollectionGroup:CH'

    for i in range(1, 1793, count):
        searchurl = basesearchurl % (i, count)
        searchPage = requests.get(searchurl)
        for iteminfo in searchPage.json():
            metadata = {}
            url = u'http://collections.tepapa.govt.nz%s' % (
                iteminfo.get('path'), )

            # Museum site probably doesn't like it when we go fast
            # time.sleep(5)
            pywikibot.output(url)

            metadata['url'] = url

            metadata['collectionqid'] = u'Q915603'
            metadata['collectionshort'] = u'Te Papa'
            metadata['locationqid'] = u'Q915603'

            #No need to check, I'm actually searching for paintings.
            metadata['instanceofqid'] = u'Q3305213'

            title = iteminfo.get('title')

            # Chop chop, in case we have very long titles
            if len(title) > 220:
                title = title[0:200]
            metadata['title'] = {
                u'en': title,
            }
            name = iteminfo.get('colProProductionMakers')

            if not name:
                metadata['creatorqid'] = u'Q4233718'
                metadata['creatorname'] = u'anonymous'
                metadata['description'] = {
                    u'nl': u'schilderij van anonieme schilder',
                    u'en': u'painting by anonymous painter',
                }
            else:
                if u',' in name:
                    (surname, sep, firstname) = name.partition(u',')
                    name = u'%s %s' % (
                        firstname.strip(),
                        surname.strip(),
                    )
                metadata['creatorname'] = name

                metadata['description'] = {
                    u'nl':
                    u'%s van %s' % (
                        u'schilderij',
                        metadata.get('creatorname'),
                    ),
                    u'en':
                    u'%s by %s' % (
                        u'painting',
                        metadata.get('creatorname'),
                    ),
                }

            metadata['inception'] = iteminfo.get('colProProductionDates')

            metadata['idpid'] = u'P217'
            metadata['id'] = iteminfo.get('colRegistrationNumber')

            # Not everything is in json, so some good old parsing
            itempage = requests.get(url)

            mediumregex = u'\<td class\=\"heading\"\>Medium summary\<\/td\>[\s\t\r\n]*\<td\>oil on canvas\<\/td\>'
            mediummatch = re.search(mediumregex, itempage.text)
            if mediummatch:
                metadata['medium'] = u'oil on canvas'

            dimensionsregex = u'\<td class\=\"heading\"\>Dimensions\<\/td\>[\s\t\r\n]*\<td\>[\s\t\r\n]*(Overall|Image):([^\<]+)[\s\t\r\n]*\<br \/\>'

            dimensionsmatch = re.search(dimensionsregex, itempage.text)

            if dimensionsmatch:
                dimensiontext = dimensionsmatch.group(2).strip()
                regex_2d = u'(?P<height>\d+)(mm)?\s*\(Height\)[\s\t\r\n]*x[\s\t\r\n]*(?P<width>\d+)(mm)?\s*\((Width|Length)\).*$'
                regex_3d = u'(?P<height>\d+)(mm)?\s*\(Height\)[\s\t\r\n]*x[\s\t\r\n]*(?P<width>\d+)(mm)?\s*\((Width|Length)\)[\s\t\r\n]*x[\s\t\r\n]*(?P<depth>\d+)(mm)?\s*\(Depth\).*$'
                match_2d = re.match(regex_2d, dimensiontext)
                match_3d = re.match(regex_3d, dimensiontext)
                if match_2d:
                    metadata['heightcm'] = unicode(
                        float(match_2d.group(u'height')) / 10)
                    metadata['widthcm'] = unicode(
                        float(match_2d.group(u'width')) / 10)
                elif match_3d:
                    metadata['heightcm'] = unicode(
                        float(match_3d.group(u'height')) / 10)
                    metadata['widthcm'] = unicode(
                        float(match_3d.group(u'width')) / 10)
                    metadata['depthcm'] = unicode(
                        float(match_3d.group(u'depth')) / 10)

            creditlineregex = u'\<td class\=\"heading\"\>Credit line\<\/td\>[\s\t\r\n]*\<td\>([^\<]+ (?P<year1>\d\d\d\d)|Purchased (?P<year2>\d\d\d\d) [^\<]+)\<\/td\>'
            creditlinematch = re.search(creditlineregex, itempage.text)

            if creditlinematch:
                if creditlinematch.group(u'year1'):
                    metadata['acquisitiondate'] = creditlinematch.group(
                        u'year1')
                elif creditlinematch.group(u'year2'):
                    metadata['acquisitiondate'] = creditlinematch.group(
                        u'year2')

            yield metadata
Example 43
    def _clean_text(self, text):
        text = HTMLParser.HTMLParser().unescape(text)
        text = re.sub('[\r\n\t]', '', text)
        text = re.sub('>\s+<', '><', text)
        return re.sub('\s+', ' ', text).strip()
Example 44
    def process_layout(layout_schema=None, interactions=None):

        # Load the template and find 'body' to append generated elements to
        env = Environment()
        env.loader = FileSystemLoader(PORTAL_ROOT + '/templates')
        tmpl_unparsed = env.get_template('ion_ux.html').render()
        tmpl = ET.fromstring(tmpl_unparsed.encode('utf-8'))
        body_elmt = tmpl.find('body')

        # Fetch the layout schema
        layout_schema = LayoutApi.get_new_layout_schema()

        # Track resource types, metadata and widgets without processed sub-attributes
        resource_types = []
        metadata_processed = []
        exclude_sub_attributes = ['table_ooi', 'chart_ooi']
        attribute_levels = [
            'level-zero', 'level-one', 'level-two', 'level-three',
            'level-four', 'level-five', 'level-six'
        ]

        # --------------------------------------------------------------------------
        # VIEWS
        # --------------------------------------------------------------------------

        # Loop through defined views and build <script> templates with the following hierarchy:
        # view -> groups -> blocks -> attributes -> sub-attributes.
        for view_id in DEFINED_VIEWS:
            view = layout_schema['spec']['elements'][view_id]

            script_elmt = _make_element(body_elmt,
                                        'script',
                                        id=view_id,
                                        type='text/template')

            # heading_elmt = _make_element(script_elmt, 'div', css='row-fluid heading')
            v00_elmt = _make_element(script_elmt, 'div', css='v00 heading')

            content_elmt = _make_element(script_elmt, 'div', css='row-fluid')
            v01_elmt = _make_element(content_elmt, 'div', css='v01 span3')
            v02_elmt = _make_element(content_elmt, 'div', css='v02 span9')

            # --------------------------------------------------------------------------
            # GROUPS
            # --------------------------------------------------------------------------

            # Track groups on per view basis
            groups = {}

            # Loop through groups
            for gr_idx, gr_element in enumerate(view['embed']):
                group_elid = gr_element['elid']
                group_link_id = group_elid + str(randint(0, 1000))
                group_position = gr_element['pos']
                group = layout_schema['spec']['elements'][group_elid]

                # Set the parent element for the group
                if group_position == 'V00':
                    parent_elmt = v00_elmt
                elif group_position == 'V01':
                    parent_elmt = v01_elmt
                else:
                    parent_elmt = v02_elmt

                # LABEL OVERRIDES
                if gr_element.has_key('olabel'):
                    print 'group label override:', group[
                        'label'], '->', gr_element['olabel'], group_elid
                    group_label = gr_element['olabel']
                else:
                    group_label = group['label']

                # CHECK FOR TITLE BAR (V00), creates tabs for V01 and V02 groups
                if group_position == 'V00':
                    group_elmt = parent_elmt
                else:
                    if not group_position in groups.keys():
                        group_container_elmt = _make_element(parent_elmt,
                                                             'div',
                                                             id=group_elid,
                                                             css='group')
                        group_ul_elmt = _make_element(group_container_elmt,
                                                      'ul',
                                                      css='nav nav-tabs')
                        group_block_container_elmt = _make_element(
                            group_container_elmt, 'div', css='tab-content')

                        groups.update({
                            group_position: {
                                'ul_elmt':
                                group_ul_elmt,
                                'group_container_elmt':
                                group_container_elmt,
                                'group_block_container_elmt':
                                group_block_container_elmt
                            }
                        })
                    else:
                        group_ul_elmt = groups[group_position]['ul_elmt']
                        group_block_container_elmt = groups[group_position][
                            'group_block_container_elmt']

                    # <li>, <a> and group element
                    group_li_elmt = _make_element(group_ul_elmt, 'li', css='')
                    group_a_elmt = _make_element(group_li_elmt,
                                                 'a',
                                                 href="#%s" % group_link_id,
                                                 data_toggle='tab',
                                                 content=group_label)
                    group_elmt = _make_element(group_block_container_elmt,
                                               'div',
                                               id=group_link_id,
                                               css='tab-pane row-fluid')

                # --------------------------------------------------------------------------
                # BLOCKS
                # --------------------------------------------------------------------------

                # Loop through blocks
                for bl_element in group['embed']:
                    block_elid = bl_element['elid']
                    block_position = bl_element['pos']
                    block = layout_schema['spec']['elements'][block_elid]

                    block_widget_id = block['wid']
                    block_widget = layout_schema['spec']['widgets'][
                        block_widget_id]
                    block_widget_type = block_widget['name']
                    block_res_type = block['ie']['ie_name'] if block.has_key(
                        'ie') else ''

                    if not block_res_type in resource_types:
                        resource_types.append(block_res_type)

                    # Set li class based on block_res_type
                    if group_position != 'V00':
                        li_css_class = group_li_elmt.get('class')
                        if not block_res_type in li_css_class:
                            li_css_class += ' %s' % block_res_type
                            group_li_elmt.attrib['class'] = li_css_class

                    # LABEL OVERRIDES
                    if bl_element.has_key('olabel'):
                        print 'block label override:', block[
                            'label'], '->', bl_element['olabel'], block_elid
                        block_label = bl_element['olabel']
                    else:
                        block_label = block['label']

                    block_css_class = block_res_type

                    # if not block_res_type in block_css_class:
                    #     block_css_class += ' %s' % block_res_type

                    # BLOCK LAYOUT

                    if block['embed']:
                        for at_element in block['embed']:
                            attribute = layout_schema['spec']['elements'][
                                at_element['elid']]
                            attribute_widget_type = layout_schema['spec'][
                                'widgets'][attribute['wid']]['name']

                            wide_container = True if attribute_widget_type in (
                                'table_ooi', 'chart_ooi') else False

                    if wide_container:
                        block_container = _make_element(group_elmt,
                                                        'div',
                                                        css='row-fluid')
                        block_elmt = _make_element(block_container,
                                                   'div',
                                                   style="display:none;",
                                                   id=block_elid)
                        block_css_class += ' span12'
                    else:
                        block_elmt = _make_element(group_elmt,
                                                   'div',
                                                   style="display:none;",
                                                   id=block_elid)
                        block_css_class += ' block'

                        # Greater than V01
                        if group_position not in ('V00', 'V01'):
                            block_css_class += ' span3'
                        # CHECK FOR TITLE BAR (V00)
                        elif group_position == 'V00':
                            block_css_class += ' row-fluid'

                    block_elmt.attrib['class'] = block_css_class

                    # SET GROUP HEADINGS
                    if group_position != 'V00':
                        # Hide table headers for now.
                        if not attribute_widget_type == 'table_ooi':
                            block_h3_elmt = _make_element(block_elmt,
                                                          'h3',
                                                          content=block_label)
                    if group_position == 'V00':
                        block_container_elmt = block_elmt
                        left_elmt = _make_element(block_container_elmt,
                                                  'div',
                                                  css='span6 heading-left')
                        right_elmt = _make_element(block_container_elmt,
                                                   'div',
                                                   css='span6 heading-right')
                    else:
                        block_container_elmt = _make_element(block_elmt, 'div')

                    # Attributes
                    for at_element in block['embed']:
                        attribute_elid = at_element['elid']
                        attribute_position = at_element['pos']
                        attribute_data_path = at_element['dpath']
                        attribute_level = at_element['olevel']
                        attribute_css = attribute_levels[int(
                            attribute_level)] if attribute_level else ''
                        attribute = layout_schema['spec']['elements'][
                            attribute_elid]
                        attribute_widget_id = attribute['wid']
                        attribute_widget_type = layout_schema['spec'][
                            'widgets'][attribute_widget_id]['name']

                        # LABEL OVERRIDES
                        if at_element.has_key('olabel'):
                            print 'attribute label override:', attribute[
                                'label'], '->', at_element[
                                    'olabel'], attribute_elid
                            attribute_label = at_element['olabel']
                        else:
                            attribute_label = attribute['label']

                        if attribute_widget_type == 'image_ooi':
                            image_class = layout_schema['spec']['graphics'][
                                attribute['gfx']]['name']
                            attribute_css += ' %s %s' % (attribute_widget_type,
                                                         image_class)
                        else:
                            attribute_css += ' %s' % attribute_widget_type

                        # CHECK FOR TITLE BAR
                        if attribute_widget_type not in (
                                'table_ooi',
                                'chart_ooi') and group_position != 'V00':
                            block_container_elmt.set('class',
                                                     'content-wrapper')

                        attribute_options = {
                            'id': attribute_elid,
                            'data-position': attribute_position,
                            'data-path': attribute_data_path,
                            'data-level': attribute_level,
                            'data-label': attribute_label,
                            'css': attribute_css
                        }

                        if group_position == 'V00':
                            if attribute_position == 'B01' or attribute_position == 'B02':
                                attribute_elmt = _make_element(
                                    left_elmt, 'div', **attribute_options)
                            else:
                                attribute_elmt = _make_element(
                                    right_elmt, 'div', **attribute_options)
                        else:
                            attribute_elmt = _make_element(
                                block_container_elmt, 'div',
                                **attribute_options)

                        # FOR INTEGRATION
                        # if UI_MODE == 'DEVELOPMENT':
                        #     attribute_elmt.text = 'Attribute: %s (%s) (%s) (%s) (%s)' % (attribute['label'], attribute['name'], attribute_elid, attribute_widget_type, attribute_position)

                        # Generate metadata for nested elements, ex. tables and attribute groups
                        if attribute_widget_type in (
                                'table_ooi', 'attribute_group_ooi'
                        ) and attribute_elid not in metadata_processed:
                            metadata_processed.append(attribute_elid)
                            metadata = []
                            for embedded_attribute in attribute['embed']:
                                embedded_object = layout_schema['spec'][
                                    'elements'][embedded_attribute['elid']]
                                embedded_widget_type = layout_schema['spec'][
                                    'widgets'][
                                        embedded_attribute['wid']]['name']

                                # LABEL OVERRIDE
                                if 'olabel' in embedded_attribute:
                                    print 'sub-attribute label override:', embedded_object[
                                        'label'], '->', embedded_attribute[
                                            'olabel'], attribute_elid
                                    embedded_object_label = embedded_attribute[
                                        'olabel']
                                else:
                                    embedded_object_label = embedded_object[
                                        'label']

                                embedded_info_level = embedded_attribute[
                                    'olevel']
                                embedded_info_level_css = attribute_levels[int(
                                    embedded_info_level)] if embedded_info_level else ''

                                metadata_items = [
                                    embedded_widget_type,
                                    embedded_object_label,
                                    embedded_attribute['dpath'],
                                    embedded_attribute['pos'],
                                    embedded_info_level,
                                    embedded_info_level_css
                                ]
                                if attribute_widget_type == 'attribute_group_ooi':
                                    meta_elmt_id = 'ATTRIBUTE_GROUP_' + attribute_elid
                                    metadata_items.append(
                                        embedded_attribute['elid'])
                                    metadata_items.append(
                                        embedded_attribute['dpath'])
                                elif attribute_widget_type == 'table_ooi':
                                    meta_elmt_id = 'TABLE_' + attribute_elid

                                metadata.append(metadata_items)

                            # Append metadata to body as a JSON script
                            meta_elmt = ET.SubElement(body_elmt, 'script')
                            meta_elmt.set('id', meta_elmt_id)
                            meta_elmt.text = "var %s=%s" % (
                                meta_elmt_id, json.dumps(metadata))

        layout_elmt = ET.SubElement(body_elmt, 'script')
        layout_elmt.set('id', 'layout')
        layout_elmt.text = "var LAYOUT=%s;" % json.dumps(layout_schema)

        resource_types_elmt = ET.SubElement(body_elmt, 'script')
        resource_types_elmt.set('id', 'resource_types')
        resource_types_elmt.text = "var RESOURCE_TYPES=%s" % json.dumps(
            resource_types)

        init_script_elmt = ET.Element('script')
        init_script_elmt.set('type', 'text/javascript')
        init_script_elmt.text = "$(function(){initialize_app();});"
        body_elmt.append(init_script_elmt)

        tmpl = ET.tostring(tmpl)
        tmpl = '<!DOCTYPE html>\n' + tmpl

        h = HTMLParser.HTMLParser()
        return h.unescape(tmpl)
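
The builder above leans on a _make_element helper that is not part of this excerpt. A minimal sketch of what such a helper might look like, assuming it wraps ET.SubElement and maps the css/content keywords onto the class attribute and element text (the None guard is an addition of this sketch):

import xml.etree.ElementTree as ET

def _make_element(parent, tag, content=None, css=None, **attrs):
    # Create a child element, optionally set its text, map 'css' onto the
    # HTML 'class' attribute, and pass the remaining options (id, style,
    # data-*) straight through as attributes.
    elmt = ET.SubElement(parent, tag)
    if content is not None:
        elmt.text = content
    if css is not None:
        elmt.set('class', css)
    for name, value in attrs.items():
        if value is not None:
            elmt.set(name, value)
    return elmt
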
Esempio n. 45
0
 def __init__(self):        
     HTMLParser.__init__(self) # http://stackoverflow.com/a/9698750
     self.start_title=0
     self.title = ''
     self.stop_title=0
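
The fields above only set up state; a complete title extractor would pair them with handler methods. A minimal self-contained sketch, assuming the goal is to capture the text of the first <title> element (the class name and handlers are illustrative, not part of the original snippet):

import HTMLParser

class TitleParser(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.start_title = 0
        self.title = ''
        self.stop_title = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'title' and not self.stop_title:
            self.start_title = 1

    def handle_data(self, data):
        if self.start_title and not self.stop_title:
            self.title += data

    def handle_endtag(self, tag):
        if tag == 'title' and self.start_title:
            self.start_title = 0
            self.stop_title = 1

p = TitleParser()
p.feed('<html><head><title>Example Domain</title></head></html>')
print p.title  # -> Example Domain
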
Esempio n. 46
0
def unescape(html):
    global html_parser
    if not html_parser: html_parser = HTMLParser.HTMLParser()
    return html_parser.unescape(html)
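
This helper assumes a module-level html_parser variable initialized to None (and an import HTMLParser) earlier in the file; a minimal usage sketch under that assumption:

import HTMLParser

html_parser = None  # created lazily on the first unescape() call

print unescape('Fish &amp; Chips &lt;&gt;')  # -> Fish & Chips <>
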
Esempio n. 47
0
	def __init__(self, idMembro, cvLattesHTML):
		HTMLParser.__init__(self)

		# mandatory initialization
		self.idMembro = idMembro
		self.sexo = 'Masculino'
		self.nomeCompleto = u'[Nome-nao-identificado]'

		self.item = ''
		self.issn = ''
		self.listaIDLattesColaboradores = []
		self.listaFormacaoAcademica = []
		self.listaProjetoDePesquisa = []
		self.listaAreaDeAtuacao = []
		self.listaIdioma = []
		self.listaPremioOuTitulo = []

		self.listaArtigoEmPeriodico = []
		self.listaLivroPublicado = []
		self.listaCapituloDeLivroPublicado = []
		self.listaTextoEmJornalDeNoticia = []
		self.listaTrabalhoCompletoEmCongresso = []
		self.listaResumoExpandidoEmCongresso = []
		self.listaResumoEmCongresso = []
		self.listaArtigoAceito = []
		self.listaApresentacaoDeTrabalho = []
		self.listaOutroTipoDeProducaoBibliografica = []

		self.listaSoftwareComPatente = []
		self.listaSoftwareSemPatente = []
		self.listaProdutoTecnologico = []
		self.listaProcessoOuTecnica = []
		self.listaTrabalhoTecnico = []
		self.listaOutroTipoDeProducaoTecnica = []

		self.listaPatente = []
		self.listaProgramaComputador = []
		self.listaDesenhoIndustrial = []
				
		self.listaProducaoArtistica = []

		self.listaOASupervisaoDePosDoutorado = []
		self.listaOATeseDeDoutorado = []
		self.listaOADissertacaoDeMestrado = []
		self.listaOAMonografiaDeEspecializacao = []
		self.listaOATCC = []
		self.listaOAIniciacaoCientifica = []
		self.listaOAOutroTipoDeOrientacao = []

		self.listaOCSupervisaoDePosDoutorado = []
		self.listaOCTeseDeDoutorado = []
		self.listaOCDissertacaoDeMestrado = []
		self.listaOCMonografiaDeEspecializacao = []
		self.listaOCTCC = []
		self.listaOCIniciacaoCientifica = []
		self.listaOCOutroTipoDeOrientacao = []

		self.listaParticipacaoEmEvento = []
		self.listaOrganizacaoDeEvento = []


		# initialization to avoid exhaustively searching for some keywords
		self.salvarAtualizacaoCV = 1 
		self.salvarFoto = 1
		self.procurarCabecalho = 0
		self.achouGrupo = 0
		self.doi = ''
		self.relevante = 0
		self.idOrientando = ''
		self.complemento = ''

		# work around some errors in the HTML produced by the Lattes platform
		cvLattesHTML = cvLattesHTML.replace("<![CDATA[","")
		cvLattesHTML = cvLattesHTML.replace("]]>","")
		cvLattesHTML = cvLattesHTML.replace("<x<","&lt;x&lt;")
		cvLattesHTML = cvLattesHTML.replace("<X<","&lt;X&lt;")

		# feed it!
		try:
			cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities':1})
		except UnicodeDecodeError, e:
			# For some reason, pytidylib fails to decode, whereas the
			# original html content converts perfectly manually.
			print e
			cvLattesHTML, errors = tidy_document(cvLattesHTML.encode('utf-8'), options={'numeric-entities':1})
			cvLattesHTML = cvLattesHTML.decode('utf-8')
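
Neither the class name nor any calling code appears in this excerpt; assuming the class is used like the sibling Lattes parsers in this collection, instantiation would look roughly like this (the class name and file name are placeholders):

# Hypothetical usage; 'ParserLattesHTML' and the file name are placeholders.
html = open('cv_lattes.html').read().decode('utf-8')
parser = ParserLattesHTML('membro-0', html)
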
Esempio n. 48
0
def push_message(request):
    if request.method != "POST":
        raise Http404
    title = request.POST.get('title', None)
    body = request.POST.get('body', None)
    url_args = request.POST.get('url_args', '')
    account_key = request.POST.get('account_key', None)
    account_keys = request.POST.getlist("account_keys", None)
    scheduled_at = request.POST.get('scheduled_at', None)
    if not scheduled_at or len(scheduled_at) == 0:
        scheduled_at = None
    segments = request.POST.getlist('send_to_segments', None)
    if not segments or len(segments) == 0:
        segments = None
    segments_string = request.POST.get('send_to_segments_string', None)
    if segments_string:
        temp_segments = segments_string.split(",")
        if len(temp_segments):
            segments = temp_segments
    if not title:
        raise Exception("Submitted title is empty. Body: " + body)
    if not body:
        raise Exception("Submitted body is empty. Title: " + title)
    if not account_key and not account_keys:
        raise Exception("Submitted Account Key is empty. Title: " + title)
    if scheduled_at:
        # NOTE: '%p' has no effect when the hour is parsed with '%H' (24-hour);
        # use '%I' if the submitted value is 12-hour time.
        scheduled_at = datetime.strptime(scheduled_at, '%m/%d/%Y %H:%M %p')
    custom = request.POST.get('custom', False)
    if custom:
        custom = True
    image = request.FILES.get('image', None)
    h = HTMLParser.HTMLParser()
    title = h.unescape(title)
    title = title.encode('utf-8', 'ignore').strip(' \n\r')
    truncate_title = lambda data: len(data) > 40 and data[:40] + '...' or data
    title = truncate_title(title)
    body = h.unescape(body)
    body = body.encode('utf-8', 'ignore').strip(' \n\r')
    truncate_body = lambda data: len(data) > 100 and data[:100] + '...' or data
    body = truncate_body(body)

    should_push = False
    comment = ''
    command_path = settings.SUBPROCESS_COMMAND_PATH
    if account_key:
        try:
            profile = ClientProfile.objects.get(account_key=account_key,
                                                status='active')
            try:
                plan = Plan.objects.exclude(type=plans.NONE).exclude(
                    status='expired').filter(
                        user=profile.user,
                        status='active').latest('created_at')
                sent_notifications = PushMessage.objects.sent_notifications_count(
                    account_key=account_key)
                should_push = True
                if sent_notifications >= plan.number_of_notifications:
                    should_push = False
                    comment = 'Notifications number for plan exceeded.'
            except Plan.DoesNotExist:
                comment = 'No price plan for user_id: ' + str(profile.user.id)
        except ClientProfile.DoesNotExist:
            comment = 'No user for this account key or profile is not active.'
        if not should_push:
            try:
                website = Website.objects.get(account_key=account_key)
                comment = ''
                should_push = True
            except Website.DoesNotExist:
                comment = 'No user for this account key or profile is not active or no website cluster.'
        new_message = PushMessage.objects.create(title=title,
                                                 body=body,
                                                 url_args=url_args,
                                                 account_key=account_key,
                                                 custom=custom,
                                                 comment=comment,
                                                 scheduled_at=scheduled_at,
                                                 image=image)
        if segments:
            for segment in Segment.objects.filter(id__in=segments):
                new_message.segments.add(segment)
                new_message.save()
        if should_push and scheduled_at:
            should_push = False
        if should_push:
            # subprocess for async execution
            subprocess.Popen("sleep 10; python " + command_path + " " +
                             str(new_message.id),
                             shell=True)
    elif account_keys:
        profiles = ClientProfile.objects.filter(account_key__in=account_keys,
                                                status='active')
        print(profiles)
        for p in profiles:
            notif = PushMessage.objects.create(title=title,
                                               body=body,
                                               url_args=url_args,
                                               account_key=p.account_key,
                                               custom=custom,
                                               comment=comment,
                                               scheduled_at=scheduled_at,
                                               image=image)
            print(notif)
            print(notif.id)
            if segments:
                for segment in Segment.objects.filter(id__in=segments):
                    notif.segments.add(segment)
                    notif.save()
            if not scheduled_at:
                subprocess.Popen("sleep 10; python " + command_path + " " +
                                 str(notif.id),
                                 shell=True)
        websites = Website.objects.filter(account_key__in=account_keys)
        for w in websites:
            notif = PushMessage.objects.create(title=title,
                                               body=body,
                                               url_args=url_args,
                                               account_key=w.account_key,
                                               custom=custom,
                                               comment=comment,
                                               scheduled_at=scheduled_at,
                                               image=image)
            if segments:
                for segment in Segment.objects.filter(id__in=segments):
                    notif.segments.add(segment)
                    notif.save()
            if not scheduled_at:
                subprocess.Popen("sleep 10; python " + command_path + " " +
                                 str(notif.id),
                                 shell=True)
    return render_to_response('pushmonkey/pushed.html')
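
A minimal way to exercise this view, assuming it is routed at /push_message/ (the URL, account key and test setup are placeholders, not part of the snippet):

# Hypothetical smoke test; the URL pattern and account_key are placeholders.
from django.test import Client

c = Client()
response = c.post('/push_message/', {
    'title': 'Hello &amp; welcome',     # unescaped by the view, truncated to 40 chars
    'body': 'First notification body',  # truncated to 100 chars
    'account_key': 'demo-account-key',
})
print response.status_code  # renders pushmonkey/pushed.html on success
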
Esempio n. 49
0
def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):  #0,1,2 = URL, regexOnly, CookieJarOnly
    #cachedPages = {}
    #print 'url',url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    #        print 'doRegexs',doRegexs,regexs
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            #print 'processing ' ,k
            m = regexs[k]
            #print m
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                #print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            #print 'm[cookiejar]',m['cookiejar'],cookieJar
            if cookieJarParam:
                if cookieJar == None:
                    #print 'create cookie jar'
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split(
                            'open[')[1].split(']')[0]
#                            print 'cookieJar from file name',cookie_jar_file

                    cookieJar = getCookieJar(cookie_jar_file)
                    #                        print 'cookieJar from file',cookieJar
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                    #import cookielib
                    #cookieJar = cookielib.LWPCookieJar()
                    #print 'cookieJar new',cookieJar
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(
                        ']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    #                        print 'complete_path',complete_path
                    saveCookieJar(cookieJar, cookie_jar_file)

            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs,
                                    m['page'],
                                    cookieJar,
                                    recursiveCall=True,
                                    cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m[
                    'setcookie']:
                m['setcookie'] = getRegexParsed(regexs,
                                                m['setcookie'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[
                    'appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs,
                                                   m['appendcookie'],
                                                   cookieJar,
                                                   recursiveCall=True,
                                                   cachedPages=cachedPages)

            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs,
                                           m['post'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
#                    print 'post is now',m['post']

            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs,
                                              m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
                #print 'rawpost is now',m['rawpost']

            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$',
                                                    getEpocTime())

            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$',
                                                    getEpocTime2())

            link = ''
            if m['page'] and m[
                    'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly == False:
                #print 'using cache page',m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith(
                        'http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$',
                                                      getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$',
                                                      getEpocTime2())

                    #print 'Ingoring Cache',m['page']
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

#                            if
#                            proxy = urllib2.ProxyHandler({ ('https' ? proxytouse[:5]=="https":"http") : proxytouse})
#                            opener = urllib2.build_opener(proxy)
#                            urllib2.install_opener(opener)

#                        import urllib2
#                        print 'urllib2.getproxies',urllib2.getproxies()
                    current_proxies = urllib2.ProxyHandler(
                        urllib2.getproxies())

                    #print 'getting pageUrl',pageUrl
                    req = urllib2.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        #                            print 'proxytouse',proxytouse
                        #                            urllib2.getproxies= lambda: {}
                        if pageUrl[:5] == "https":
                            proxy = urllib2.ProxyHandler({'https': proxytouse})
                            #req.set_proxy(proxytouse, 'https')
                        else:
                            proxy = urllib2.ProxyHandler({'http': proxytouse})
                            #req.set_proxy(proxytouse, 'http')
                        opener = urllib2.build_opener(proxy)
                        urllib2.install_opener(opener)

                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        #                            print 'adding cookie',m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        #                            print 'appending cookie to cookiejar',m['appendcookie']
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(version=0,
                                                  name=n,
                                                  value=v,
                                                  port=None,
                                                  port_specified=False,
                                                  domain=w,
                                                  domain_specified=False,
                                                  domain_initial_dot=False,
                                                  path='/',
                                                  path_specified=True,
                                                  secure=False,
                                                  expires=None,
                                                  discard=True,
                                                  comment=None,
                                                  comment_url=None,
                                                  rest={'HttpOnly': None},
                                                  rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if not cookieJar == None:
                        #                            print 'cookieJarVal',cookieJar
                        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                        opener = urllib2.build_opener(
                            cookie_handler, urllib2.HTTPBasicAuthHandler(),
                            urllib2.HTTPHandler())
                        opener = urllib2.install_opener(opener)
                        #                            print 'noredirect','noredirect' in m

                        if 'noredirect' in m:
                            opener = urllib2.build_opener(
                                cookie_handler, NoRedirection,
                                urllib2.HTTPBasicAuthHandler(),
                                urllib2.HTTPHandler())
                            opener = urllib2.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib2.build_opener(
                            NoRedirection, urllib2.HTTPBasicAuthHandler(),
                            urllib2.HTTPHandler())
                        opener = urllib2.install_opener(opener)

                    if 'connection' in m:
                        #                            print '..........................connection//////.',m['connection']
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib2.build_opener(keepalive_handler)
                        urllib2.install_opener(opener)

                    #print 'after cookie jar'
                    post = None

                    if 'post' in m:
                        postData = m['post']
                        #if '$LiveStreamRecaptcha' in postData:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib.urlencode(post)

                    if 'rawpost' in m:
                        post = m['rawpost']
                        #if '$LiveStreamRecaptcha' in post:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #       post=post.replace('$LiveStreamRecaptcha','&manual_recaptcha_challenge_field='+captcha_challenge+'&recaptcha_response_field='+catpcha_word+'&id='+idfield)
                    link = ''
                    try:

                        if post:
                            response = urllib2.urlopen(req, post)
                        else:
                            response = urllib2.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        if 'proxy' in m and not current_proxies is None:
                            urllib2.install_opener(
                                urllib2.build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        #print repr(link)
                        #print link This just print whole webpage in LOG
                        if 'includeheaders' in m:
                            #link+=str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(
                                    b) + '\n'
                            link += '$$HEADERS_END$$:'

#                        print link

                        response.close()
                    except:
                        pass
                    cachedPages[m['page']] = link
                    #print link
                    #print 'store link for',m['page'],forCookieJarOnly

                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '',
                                     cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs,
                                             m['expres'],
                                             cookieJar,
                                             recursiveCall=True,
                                             cachedPages=cachedPages)

            if not m['expres'] == '':
                #print 'doing it ',m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    #print 'url and val',url,val
                    url = url.replace("$doregex[" + k + "]", val)

                elif m['expres'].startswith(
                        '$pyFunction:') or '#$pyFunction' in m['expres']:
                    #print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1],
                                     link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']: return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar

                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs

                    val = ''
                    if not link == '':
                        #print 'link',link
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] == None:
                        val = m['expres']

                    if rawPost:
                        #                            print 'rawpost'
                        val = urllib.quote_plus(val)
                    if 'htmlunescape' in m:
                        #val=urllib.unquote_plus(val)
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                    #print 'ur',url
                    #return val
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())

    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))

    if recursiveCall: return url
    #print 'final url',repr(url)
    if url == "":
        return
    else:
        return url, setresolved
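
The expected shape of the regexs argument can be inferred from the keys the function reads ('page', 'expres', plus optional headers and flags); a hedged example with placeholder values:

# Hypothetical input: scrape a stream URL from an embed page.
regexs = {
    'src': {
        'page': 'http://example.com/embed/12345',  # page to download
        'expres': r"file\s*:\s*'([^']+)'",         # regex with one capture group
        'referer': 'http://example.com/',          # optional extra request header
        'htmlunescape': 'true',                    # optional post-processing flag
    }
}
url = 'plugin://video/play/$doregex[src]'
result = getRegexParsed(regexs, url)
if result:
    resolved_url, setresolved = result
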
Esempio n. 50
0
import urllib
import re
import threading
import HTMLParser

concmd = ['/load_blacklist']

blacklist_lock = threading.Lock()
blacklist = None

html_unescape = HTMLParser.HTMLParser().unescape

def load_blacklist():
	global blacklist, blacklist_lock
	blacklist_lock.acquire()
	blacklist = []
	
	f = open("blacklist.txt", 'r')
	
	for line in f:
		while line != '' and  line[-1] == '\n':
			line = line[:-1]
		if line != '':
			blacklist.append(re.compile('^' + line + '$'))
	
	f.close()
	blacklist_lock.release()

def matchprotocol(string, protocol):
	return len(protocol) <= len(string) and string[:len(protocol)] == protocol
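
A minimal usage sketch for the helpers above, assuming blacklist.txt holds one regular expression per line:

# Hypothetical usage (reads blacklist without the lock; fine for a single-threaded sketch).
load_blacklist()

link = html_unescape('http://example.com/?a=1&amp;b=2')
allowed = matchprotocol(link, 'http://') and not any(p.match(link) for p in blacklist)
print 'allowed:', allowed
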
Esempio n. 51
0
	def __init__(self, idMembro, cvLattesHTML):
		HTMLParser.__init__(self)

		# mandatory initialization
		self.idMembro = idMembro
		self.sexo = 'Masculino'

		self.item = ''
		self.listaIDLattesColaboradores = []
		self.listaFormacaoAcademica = []
		self.listaProjetoDePesquisa = []
		self.listaAreaDeAtuacao = []
		self.listaIdioma = []
		self.listaPremioOuTitulo = []

		self.listaArtigoEmPeriodico = []
		self.listaLivroPublicado = []
		self.listaCapituloDeLivroPublicado = []
		self.listaTextoEmJornalDeNoticia = []
		self.listaTrabalhoCompletoEmCongresso = []
		self.listaResumoExpandidoEmCongresso = []
		self.listaResumoEmCongresso = []
		self.listaArtigoAceito = []
		self.listaApresentacaoDeTrabalho = []
		self.listaOutroTipoDeProducaoBibliografica = []

		self.listaSoftwareComPatente = []
		self.listaSoftwareSemPatente = []
		self.listaProdutoTecnologico = []
		self.listaProcessoOuTecnica = []
		self.listaTrabalhoTecnico = []
		self.listaOutroTipoDeProducaoTecnica = []
		self.listaProducaoArtistica = []

		self.listaOASupervisaoDePosDoutorado = []
		self.listaOATeseDeDoutorado = []
		self.listaOADissertacaoDeMestrado = []
		self.listaOAMonografiaDeEspecializacao = []
		self.listaOATCC = []
		self.listaOAIniciacaoCientifica = []
		self.listaOAOutroTipoDeOrientacao = []

		self.listaOCSupervisaoDePosDoutorado = []
		self.listaOCTeseDeDoutorado = []
		self.listaOCDissertacaoDeMestrado = []
		self.listaOCMonografiaDeEspecializacao = []
		self.listaOCTCC = []
		self.listaOCIniciacaoCientifica = []
		self.listaOCOutroTipoDeOrientacao = []

		self.listaParticipacaoEmEvento = []
		self.listaOrganizacaoDeEvento = []


		# initialization to avoid exhaustively searching for some keywords
		self.salvarAtualizacaoCV = 1 
		self.salvarFoto = 1
		self.procurarCabecalho = 0
		self.achouGrupo = 0
		self.doi = ''
		self.relevante = 0
		self.idOrientando = ''

		# work around some errors in the HTML produced by the Lattes platform
		cvLattesHTML = cvLattesHTML.replace("<![CDATA[","")
		cvLattesHTML = cvLattesHTML.replace("]]>","")

		# feed it!
		cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities':1})
		#print errors
		#print cvLattesHTML.encode("utf8")

		## wrong attempt (not predictable)
		# options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
		# cvLattesHTML = str(tidy.parseString(cvLattesHTML, **options)).decode("utf8")

		self.feed(cvLattesHTML)
Esempio n. 52
0
 def feed(self, data):
     HTMLParser.feed(self, data)
     return self.root
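
self.root is never assigned in this fragment; a minimal sketch of the kind of tree-building parser this feed() override would belong to (the node layout is illustrative, not part of the original):

import HTMLParser

class TreeBuilder(HTMLParser.HTMLParser):
    # Illustrative parser: builds a nested (tag, attrs, children) tree so that
    # feed() can hand back self.root, as in the snippet above.
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.root = ('document', [], [])
        self.stack = [self.root]

    def handle_starttag(self, tag, attrs):
        node = (tag, attrs, [])
        self.stack[-1][2].append(node)
        self.stack.append(node)

    def handle_endtag(self, tag):
        if len(self.stack) > 1:
            self.stack.pop()

    def handle_data(self, data):
        self.stack[-1][2].append(data)

    def feed(self, data):
        HTMLParser.HTMLParser.feed(self, data)
        return self.root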