def setIndex():
    from MyHTMLParser import MyHTMLParser
    import os
    HTMLlist = []
    nobody = [
        "index.html", "index1.html", "index2.html", "index3.html", "index4.html",
        "ru3.html", "ru2.html", "ru1.html", "rut.html", "rus.html", "ru.html"
    ]
    for (parent, d, f) in os.walk(NorroenDyrd.mirror):
        for fn in f:
            if fn in nobody:
                continue
            elif fn.find(".html") == -1:
                continue
            elif os.path.join(parent, fn) in HTMLlist:
                continue
            else:
                HTMLlist.append(os.path.join(parent, fn))
    html = []
    for h in HTMLlist:
        entry = {}
        with open(h, "r", encoding="utf-8") as f:
            html = f.readlines()
        parser = MyHTMLParser()
        for i in html:
            parser.feed(i)
        entry["path"] = h.replace(NorroenDyrd.mirror, NorroenDyrd.base)
        entry["text"] = parser.plaintext
        entry["title"] = parser.title
        NorroenDyrd.index.append(entry)
        del parser
class Inverter():
    #global doc_id, term_count
    def __init__(self, config):
        self.config = config
        self.ht = myhashtable(config)
        self.htmlparser = MyHTMLParser(self.config, self.ht)
        self.start_batch_processing()
        self.write_file_map()
        self.ht.write_posting_file(term_count)
        self.ht.write_hash_table()

    def start_batch_processing(self):
        file_id = 0
        for in_file in os.listdir(self.config['str_src_dir']):
            #if in_file not in ['medium.html','simple.html']: continue  # for testing
            with open(self.config['str_src_dir'] + in_file, 'r') as f:
                doc_id[file_id] = in_file
                term_count[file_id] = 0
                self.htmlparser.feed(f.read(), file_id)
            file_id += 1

    # writing doc_id <--> doc_name file
    def write_file_map(self):
        # writing document id file
        with open(self.config['str_dst_dir'] + self.config['str_doc_id_file_name'], 'wb+') as f:
            for did, txt in doc_id.iteritems():
                f.write('{0:0>{1}d} {2:<{3}s}\n'.format(
                    did, self.config['file_id_encoding_len'],
                    txt, self.config['file_name_len']))
def f(idx, q, r):
    path = "data%s" % (idx)
    os.makedirs(path)
    while True:
        item = q.get()
        if item.item_type == ITEM_QUIT:
            break
        count = 0
        localQueue = Queue()
        current = item.data
        while True:
            print current
            fo = urlopen(current)
            data = fo.read()
            name = "%s/%s" % (path, count)
            fw = open(name, "w")
            count = count + 1
            fw.write(data)
            fw.close()
            fo.close()
            p = MyHTMLParser()
            try:
                p.feed(data)
            except:
                pass
            for href in p.hrefs:
                print item.data, ": ", href
            try:
                current = localQueue.get_nowait()
            except:
                break
def create_journey_instructions(steps):
    parser = MyHTMLParser()  # HTML parser for directions API data
    instruct = ""
    for step in steps:
        parser.feed(step['html_instructions'])
        instruct += parser.get_data() + ">>>>>"
    print(instruct)
    return instruct
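# The snippets here only show call sites; the MyHTMLParser class itself is never
# defined. As a rough, hypothetical sketch (an assumption, not the original
# implementation), a text-extracting parser with the get_data() method used by
# create_journey_instructions() above could look like this:
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Strips tags and accumulates the plain text passed to feed()."""

    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        # Called by HTMLParser for the text between tags.
        self._chunks.append(data)

    def get_data(self):
        # Return everything fed so far as one plain-text string.
        return "".join(self._chunks)


# Example: feeding "Turn <b>left</b>" and calling get_data() yields "Turn left".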
def table_maker(pd_row):
    # read data from html-like file
    h = MyHTMLParser()
    h.feed(pd_row['data'])
    soup = soupparser()
    p = soup.handle_data(pd_row)
    dividendpershare = [h.type_dividendpershare, h.asofdate_dividendpershare, h.reporttype_dividendpershare,
                        h.period_dividendpershare, h.currency_dividendpershare, p.data_dividendpershare,
                        h.exdate_dividendpershare, h.recorddate_dividendpershare, h.paydate_dividendpershare,
                        h.declarationdate_dividendpershare]
    totalrevenue = [h.type_totalrevenue, h.asofdate_totalrevenue, h.reporttype_totalrevenue,
                    h.period_totalrevenue, h.currency_totalrevenue, p.data_totalrevenue,
                    h.exdate_totalrevenue, h.recorddate_totalrevenue, h.paydate_totalrevenue,
                    h.declarationdate_totalrevenue]
    dividend = [h.type_dividend, h.asofdate_dividend, h.reporttype_dividend, h.period_dividend,
                h.currency_dividend, p.data_dividend, h.exdate_dividend, h.recorddate_dividend,
                h.paydate_dividend, h.declarationdate_dividend]
    eps = [h.type_eps, h.asofdate_eps, h.reporttype_eps, h.period_eps, h.currency_eps, p.data_eps,
           h.exdate_eps, h.recorddate_eps, h.paydate_eps, h.declarationdate_eps]

    # sort data and make it into a dataframe
    names = ['type', 'asofdate', 'reporttype', 'period', 'currency', 'data',
             'exdate', 'recorddate', 'paydate', 'declarationdate']

    def make_dataframe(list1):
        dict1 = {names[i]: list1[i] for i in range(10)}
        dataframe1 = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dict1.items()]))
        dataframe1 = dataframe1.fillna(method='ffill')
        return dataframe1

    dividendpershare_dataframe = make_dataframe(dividendpershare)
    totalrevenue_dataframe = make_dataframe(totalrevenue)
    dividend_dataframe = make_dataframe(dividend)
    eps_dataframe = make_dataframe(eps)
    table1 = pd.concat([dividendpershare_dataframe, totalrevenue_dataframe,
                        dividend_dataframe, eps_dataframe], axis=0, ignore_index=True)
    reqId1 = [pd_row['reqId']] * len(table1['type'])
    table1['reqId'] = pd.Series(np.array(reqId1), index=table1.index)

    # format each column to put into sql
    table1['type'] = table1['type'].astype(str)
    table1['reporttype'] = table1['reporttype'].astype(str)
    table1['period'] = table1['period'].astype(str)
    table1['asofdate'] = pd.to_datetime(table1['asofdate'])
    table1['exdate'] = pd.to_datetime(table1['exdate'])
    table1['recorddate'] = pd.to_datetime(table1['recorddate'])
    table1['paydate'] = pd.to_datetime(table1['paydate'])
    table1['declarationdate'] = pd.to_datetime(table1['declarationdate'])

    # drop duplicate rows
    table1 = table1.drop_duplicates()
    return table1
def downloadPictures():
    curl = MyCurl()
    curl.set_url(MyUriEncode.getUrl())
    buffer = BytesIO()
    buffer = curl.set_buffer(buffer)
    curl.exec()
    curl.close()
    body = buffer.getvalue().decode('utf-8')
    parser = MyHTMLParser()
    parser.feed(body)
def getListOfPokemonPages():
    pokemonListAddress = "http://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
    pokeListResponse = urllib.request.urlopen(pokemonListAddress)
    pokeListPage = str(pokeListResponse.read())
    parser = MyHTMLParser()
    parser.feed(pokeListPage)
    baseBulbapediaAdress = "http://bulbapedia.bulbagarden.net"
    for i, link in enumerate(parser.pokeListParser.pokemonURLs):
        parser.pokeListParser.pokemonURLs[i] = baseBulbapediaAdress + link
    return parser.pokeListParser.pokemonURLs
def crawl(self, depth, frontier):
    if depth > self.maxdepth:
        return
    nextLevelFrontier = list()
    for url in frontier:
        # only parse when the number of crawled pages does not exceed the maximum
        if len(self.crawledlist) < self.numPages and url not in self.crawledlist:
            # pass in the URL and create the request
            request = req.Request(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
                })
            try:
                time.sleep(1)
                # send the request to the url and get the response
                data = req.urlopen(request).read().decode("utf-8")
                parser = MyHTMLParser()
                parser.feed(data)
                # Handling Nonetype
                if self.record(url, depth):
                    self.create_web_file(data, len(self.crawledlist))
                    print(url)
                    print("Finished:", len(self.crawledlist), "files")
                    print("current depth: ", depth)
                nextLevelFrontier += parser.urls
            # catch HTTP errors raised while fetching the page
            except urllib.error.HTTPError as err:
                # handling page not found error
                if err.code == 404:
                    continue
                else:
                    raise
    self.crawl(depth + 1, nextLevelFrontier)
def dataFetcher(self, jobname=None):
    global masterList
    global masterDictionary
    global jobName
    global prePost
    # method overloading: fall back to the instance's jobName when none is given
    if jobname is None:
        mydatafetcher = URLCreator(jobname=self.jobName)
    else:
        mydatafetcher = URLCreator(jobname=jobname)
    contents = mydatafetcher.loadUrl()
    parser = MyHTMLParser()
    root = parser.feed(contents)
    postdependencyList = list(set(parser.postList))
    predependencyList = list(set(parser.preList))
    # print('pre:', predependencyList)
    return predependencyList, postdependencyList
def image_src(flickr_description):
    p = MyHTMLParser()
    p.feed(flickr_description)
    return p.src
def getImageTagFromHtml(file):
    # feed() must be called on an instance, not on the class itself
    parser = MyHTMLParser()
    parser.feed(file)
    return parser.map
        fname, fext = os.path.splitext(file)
        if any(fext in s for s in filetype):
            filelist.append(os.path.join(rootpath, file))
    return filelist


def getImageTagFromHtml(file):
    parser = MyHTMLParser()
    parser.feed(file)
    return parser.map


def getImageAltTextByImageName(image):
    return


if __name__ == "__main__":
    files = getAllHtmlFiles(rootpath, filetype)
    for file in files:
        fileobject = open(file, 'r')
        data = fileobject.read()
        htmlParse = MyHTMLParser()
        htmlParse.feed(data)
# grab links from html stdin input and canonicalize them
# spit them out on newlines for processing
from MyHTMLParser import MyHTMLParser
import fileinput
import sys
import argparse

argparser = argparse.ArgumentParser(description='Parse an html dump of an IA page for matching links')
#argparser.add_argument('--path', dest='path',
#                       default='/',
#                       help='path to match in links')
args = argparser.parse_args()

parser = MyHTMLParser()
html = sys.stdin.read()
parser.feed(html)
for link in parser.get_details():
    print link
def backupPrivateMessages(myTuenti, email, password):
    printStarting('private messages')
    print '| Fetching the identifiers of your private messages'
    print '| (this will take a while)'
    messages = myTuenti.getInbox(0)
    totalMessages = int(messages[0]['num_threads'])
    keys = []
    maxFill = len(str(totalMessages))
    iters = totalMessages / 10.0
    if math.fmod(iters, 1) != 0.0:
        iters += 1
    iters = int(iters)
    for i in range(0, iters):
        messages = myTuenti.getInbox(i)
        for message in messages[0]['threads']:
            keys.append(message['key'])
        sleep(0.5)
    s = requests.Session()
    r = s.get('https://m.tuenti.com/?m=Login', verify=False)
    csrf = re.findall('name="csrf" value="(.*?)"', r.text)[0]
    data = {
        'csrf': csrf,
        'tuentiemailaddress': email,
        'password': password,
        'remember': 1
    }
    s.post('https://m.tuenti.com/?m=Login&f=process_login', data)
    r = s.get("https://m.tuenti.com/?m=Profile&func=my_profile", verify=False)
    if r.text.find('email') != -1:
        print '| Incorrect e-mail or password'
        raw_input('| Press ENTER to continue')
        return
    rootPath = os.getcwd()
    theJoinPath = os.path.join(rootPath, 'privados')
    if not os.path.exists(theJoinPath):
        print '| Creating the directory where the private messages will be stored...'
        os.makedirs(theJoinPath)
        print '| Directory created'
    os.chdir(theJoinPath)
    counter = 0
    parser = MyHTMLParser()
    for key in keys:
        counter += 1
        percent = 100 * counter / totalMessages
        print '| [' + str(percent) + '%] Downloading message ' + \
            str(counter) + ' of ' + str(totalMessages) + '...'
        urlName = 'https://m.tuenti.com/?m=messaging&func=view_thread&thread_id='
        urlName += key + '&box=inbox&view_full=1'
        r = s.get(urlName, verify=False)
        sleep(0.5)
        parser.setFile(string.zfill(counter, maxFill))
        parser.feed(r.text)
    os.chdir(rootPath)
print path


if __name__ == "__main__":
    urlStr = raw_input("url:")
    urlList = urlStr.split("/")
    pathdir = urlList[len(urlList) - 1]
    v = HttpClient()
    value = v.Get(urlStr, urlStr)
    #r1 = re.compile(r"http://\S*\.jpe*g")
    from MyHTMLParser import MyHTMLParser
    parser = MyHTMLParser()
    value = value.decode('gbk').encode('utf-8')
    print value
    parser.feed(value, "input")
    nodes = parser.get_nodes()
    print nodes
    srcList = []
    for node in nodes:
        for attr in node["attrs"]:
            if attr == "src":
                srcList.append(node["attrs"][attr])
                #print node["attrs"][attr]
    threads = []
    i = 1
    j = len(srcList)
    for t in srcList:
        threads.append(getMyimg(i, j))
)
req.add_header('Referer', 'http://gupiao.jd.com/find/12195')
req.add_header('Host', 'gupiao.jd.com')
req.add_header('Origin', 'http://gupiao.jd.com')
req.add_header(
    'Cookie',
    'TrackID=1zjctpUkfXiPPpd2-FlJw52fq9gkx9v0WGqH_4sECdaGDpJ8D_58Bqx-Bx4HQsVMYTsT5X4AEec9ZtKVXPzJEMA; pinId=EX7C17pLL2_bXrUjzBWQTQ; __jdv=204210054|direct|-|none|-|1531620946230; _jrda=3; sec_flag=e125e94ccd30d095203da363b24adad3; sec_addr=c0a8006c; wlfstk_smdl=uj4fvqhhhqq66p2ddnrgf4vw8a2cggkb; 3AB9D23F7A4B3C9B=XG5I3N4FBWQZLN7HPAC56MKB755NV4K4D6CA6ICAOGCMBJBKMFJPJFYCRFOUFX7YP4IHFLD3YJJESRXWWTFXSHEVFM; __jda=204210054.1495960752486274042302.NaN.1525092662.1531620946.23; __jdb=204210054.10.1495960752486274042302|23.1531620946; __jdc=204210054; __jdu=1495960752486274042302; _jrdb=1531621024187'
)
req.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
response = urllib2.urlopen(req)
string = response.read().replace("\n", "").replace("\t", "").replace(" ", "").replace("%", "")
# print string
htmlParser = MyHTMLParser()
htmlParser.feed(string)
# convert the parser object to JSON
parserDict = htmlParser.__dict__
try:
    parserDict.pop('interesting')
    parserDict.pop('lasttag')
    parserDict.pop('lineno')
    parserDict.pop('offset')
    parserDict.pop('cdata_elem')
    parserDict.pop('rawdata')
    parserDict.pop('_HTMLParser__starttag_text')
    parserDict.pop('index')
    # parserDict['buyNumStart'] = "100.00"
    # parserDict['buyNumEnd'] = "0.00"
# coding=utf-8
import re
import urllib2
from MyHTMLParser import MyHTMLParser

url = 'http://ru.dhgate.com/'

if __name__ == "__main__":
    data = urllib2.urlopen(url).read()
    hp = MyHTMLParser()
    hp.feed(data)
    hp.close()
    for link in hp.links:
        print link
        a = urllib2.urlopen(link).getcode()
        print a, link
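# The MyHTMLParser used by the dhgate snippet above is only imported, never shown.
# A plausible sketch (an assumption, not the original code) of a link collector that
# exposes the .links list iterated over there; written for Python 3, whereas the
# snippet itself is Python 2 (urllib2, print statement):
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Collects the href attribute of every <a> tag seen by feed()."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag's attributes.
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)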
# MyHTMLParser
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print("encountered a start tag: ", tag)

    def handle_endtag(self, tag):
        print("encountered an end tag : ", tag)

    def handle_data(self, data):
        print("encountered some data is : ", data)


parser = MyHTMLParser()
parser.feed('<html><head><title>test</title></head>'
            '<body><h1>parse me!</h1></body></html>')
'''
Current driver: a simple HTML processor which builds a DOM tree.
'''
from MyHTMLParser import MyHTMLParser

mockData = "<html><head><title>This is the</title></head><body><h1>This<br /><>is the <span>header</span></h1></body></html>"

cursor = MyHTMLParser()
cursor.feed(mockData)  # builds tree
cursor.printTree()     # prints tree!


def findElementsByClass(className):
    return cursor.findElementsByClass(className)