def setIndex():
    from MyHTMLParser import MyHTMLParser
    import os
    HTMLlist = []
    nobody = [
        "index.html", "index1.html", "index2.html", "index3.html", "index4.html",
        "ru3.html", "ru2.html", "ru1.html", "rut.html", "rus.html", "ru.html"
    ]
    for (parent, d, f) in os.walk(NorroenDyrd.mirror):
        for fn in f:
            if fn in nobody:
                continue
            elif fn.find(".html") == -1:
                continue
            elif os.path.join(parent, fn) in HTMLlist:
                continue
            else:
                HTMLlist.append(os.path.join(parent, fn))
    html = []
    for h in HTMLlist:
        entry = {}
        with open(h, "r", encoding="utf-8") as f:
            html = f.readlines()
        parser = MyHTMLParser()
        for i in html:
            parser.feed(i)
        entry["path"] = h.replace(NorroenDyrd.mirror, NorroenDyrd.base)
        entry["text"] = parser.plaintext
        entry["title"] = parser.title
        NorroenDyrd.index.append(entry)
        del parser
class Inverter():
    #global doc_id, term_count
    def __init__(self, config):
        self.config = config
        self.ht = myhashtable(config)
        self.htmlparser = MyHTMLParser(self.config, self.ht)
        self.start_batch_processing()
        self.write_file_map()
        self.ht.write_posting_file(term_count)
        self.ht.write_hash_table()

    def start_batch_processing(self):
        file_id = 0
        for in_file in os.listdir(self.config['str_src_dir']):
            #if in_file not in ['medium.html','simple.html']: continue  # for testing
            with open(self.config['str_src_dir'] + in_file, 'r') as f:
                doc_id[file_id] = in_file
                term_count[file_id] = 0
                self.htmlparser.feed(f.read(), file_id)
            file_id += 1

    # writing doc_id <--> doc_name file
    def write_file_map(self):
        # writing document id file
        with open(self.config['str_dst_dir'] + self.config['str_doc_id_file_name'], 'wb+') as f:
            for did, txt in doc_id.iteritems():
                f.write('{0:0>{1}d} {2:<{3}s}\n'.format(
                    did, self.config['file_id_encoding_len'],
                    txt, self.config['file_name_len']))
def f(idx, q, r):
    path = "data%s" % (idx)
    os.makedirs(path)
    while True:
        item = q.get()
        if item.item_type == ITEM_QUIT:
            break
        count = 0
        localQueue = Queue()
        current = item.data
        while True:
            print current
            fo = urlopen(current)
            data = fo.read()
            name = "%s/%s" % (path, count)
            fw = open(name, "w")
            count = count + 1
            fw.write(data)
            fw.close()
            fo.close()
            p = MyHTMLParser()
            try:
                p.feed(data)
            except:
                pass
            for href in p.hrefs:
                print item.data, ": ", href
            try:
                current = localQueue.get_nowait()
            except:
                break
def create_journey_instructions(steps):
    parser = MyHTMLParser()  # HTML parser for directions API data
    instruct = ""
    for step in steps:
        parser.feed(step['html_instructions'])
        instruct += parser.get_data() + ">>>>>"
    print(instruct)
    return instruct
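# The snippets here only show call sites; the MyHTMLParser class itself is never
# defined. As a rough, hypothetical sketch (an assumption, not the original
# implementation), a text-extracting parser with the get_data() method used by
# create_journey_instructions() above could look like this:
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Strips tags and accumulates the plain text passed to feed()."""

    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        # Called by HTMLParser for the text between tags.
        self._chunks.append(data)

    def get_data(self):
        # Return everything fed so far as one plain-text string.
        return "".join(self._chunks)


# Example: feeding "Turn <b>left</b>" and calling get_data() yields "Turn left".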
def table_maker(pd_row):
    # read data from html-like file
    h = MyHTMLParser()
    h.feed(pd_row['data'])
    soup = soupparser()
    p = soup.handle_data(pd_row)
    dividendpershare = [h.type_dividendpershare, h.asofdate_dividendpershare, h.reporttype_dividendpershare,
                        h.period_dividendpershare, h.currency_dividendpershare, p.data_dividendpershare,
                        h.exdate_dividendpershare, h.recorddate_dividendpershare, h.paydate_dividendpershare,
                        h.declarationdate_dividendpershare]
    totalrevenue = [h.type_totalrevenue, h.asofdate_totalrevenue, h.reporttype_totalrevenue,
                    h.period_totalrevenue, h.currency_totalrevenue, p.data_totalrevenue,
                    h.exdate_totalrevenue, h.recorddate_totalrevenue, h.paydate_totalrevenue,
                    h.declarationdate_totalrevenue]
    dividend = [h.type_dividend, h.asofdate_dividend, h.reporttype_dividend, h.period_dividend,
                h.currency_dividend, p.data_dividend, h.exdate_dividend, h.recorddate_dividend,
                h.paydate_dividend, h.declarationdate_dividend]
    eps = [h.type_eps, h.asofdate_eps, h.reporttype_eps, h.period_eps, h.currency_eps, p.data_eps,
           h.exdate_eps, h.recorddate_eps, h.paydate_eps, h.declarationdate_eps]

    # sort data and make it into a dataframe
    names = ['type', 'asofdate', 'reporttype', 'period', 'currency', 'data',
             'exdate', 'recorddate', 'paydate', 'declarationdate']

    def make_dataframe(list1):
        dict1 = {names[i]: list1[i] for i in range(10)}
        dataframe1 = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dict1.items()]))
        dataframe1 = dataframe1.fillna(method='ffill')
        return dataframe1

    dividendpershare_dataframe = make_dataframe(dividendpershare)
    totalrevenue_dataframe = make_dataframe(totalrevenue)
    dividend_dataframe = make_dataframe(dividend)
    eps_dataframe = make_dataframe(eps)
    table1 = pd.concat([dividendpershare_dataframe, totalrevenue_dataframe,
                        dividend_dataframe, eps_dataframe], axis=0, ignore_index=True)
    reqId1 = [pd_row['reqId']] * len(table1['type'])
    table1['reqId'] = pd.Series(np.array(reqId1), index=table1.index)

    # format each column to put into sql
    table1['type'] = table1['type'].astype(str)
    table1['reporttype'] = table1['reporttype'].astype(str)
    table1['period'] = table1['period'].astype(str)
    table1['asofdate'] = pd.to_datetime(table1['asofdate'])
    table1['exdate'] = pd.to_datetime(table1['exdate'])
    table1['recorddate'] = pd.to_datetime(table1['recorddate'])
    table1['paydate'] = pd.to_datetime(table1['paydate'])
    table1['declarationdate'] = pd.to_datetime(table1['declarationdate'])

    # drop duplicate rows
    table1 = table1.drop_duplicates()
    return table1
def downloadPictures():
    curl = MyCurl()
    curl.set_url(MyUriEncode.getUrl())
    buffer = BytesIO()
    buffer = curl.set_buffer(buffer)
    curl.exec()
    curl.close()
    body = buffer.getvalue().decode('utf-8')
    parser = MyHTMLParser()
    parser.feed(body)
def getListOfPokemonPages():
    pokemonListAddress = "http://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
    pokeListResponse = urllib.request.urlopen(pokemonListAddress)
    pokeListPage = str(pokeListResponse.read())
    parser = MyHTMLParser()
    parser.feed(pokeListPage)
    baseBulbapediaAdress = "http://bulbapedia.bulbagarden.net"
    for i, link in enumerate(parser.pokeListParser.pokemonURLs):
        parser.pokeListParser.pokemonURLs[i] = baseBulbapediaAdress + link
    return parser.pokeListParser.pokemonURLs
def crawl(self, depth, frontier):
    if depth > self.maxdepth:
        return
    nextLevelFrontier = list()
    for url in frontier:
        # only parse when the number of crawled pages does not exceed the maximum
        if len(self.crawledlist) < self.numPages and url not in self.crawledlist:
            # pass in the URL and create the request
            request = req.Request(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
                })
            try:
                time.sleep(1)
                # send the request to the url and get the response
                data = req.urlopen(request).read().decode("utf-8")
                parser = MyHTMLParser()
                parser.feed(data)
                # Handling Nonetype
                if self.record(url, depth):
                    self.create_web_file(data, len(self.crawledlist))
                    print(url)
                    print("Finished:", len(self.crawledlist), "files")
                    print("current depth: ", depth)
                nextLevelFrontier += parser.urls
            # catch HTTP errors raised while fetching the page
            except urllib.error.HTTPError as err:
                # handling page not found error
                if err.code == 404:
                    continue
                else:
                    raise
    self.crawl(depth + 1, nextLevelFrontier)
def dataFetcher(self, jobname=None):
    global masterList
    global masterDictionary
    global jobName
    global prePost
    # method overloading: fall back to the instance's jobName when none is given
    if jobname is None:
        mydatafetcher = URLCreator(jobname=self.jobName)
    else:
        mydatafetcher = URLCreator(jobname=jobname)
    contents = mydatafetcher.loadUrl()
    parser = MyHTMLParser()
    root = parser.feed(contents)
    postdependencyList = list(set(parser.postList))
    predependencyList = list(set(parser.preList))
    # print('pre:', predependencyList)
    return predependencyList, postdependencyList
def image_src(flickr_description):
    p = MyHTMLParser()
    p.feed(flickr_description)
    return p.src
def getImageTagFromHtml(file):
    # feed() must be called on an instance, not on the class itself
    parser = MyHTMLParser()
    parser.feed(file)
    return parser.map
        fname, fext = os.path.splitext(file)
        if any(fext in s for s in filetype):
            filelist.append(os.path.join(rootpath, file))
    return filelist


def getImageTagFromHtml(file):
    parser = MyHTMLParser()
    parser.feed(file)
    return parser.map


def getImageAltTextByImageName(image):
    return


if __name__ == "__main__":
    files = getAllHtmlFiles(rootpath, filetype)
    for file in files:
        fileobject = open(file, 'r')
        data = fileobject.read()
        htmlParse = MyHTMLParser()
        htmlParse.feed(data)
# grab links from html stdin input and canonicalize them
# spit them out on newlines for processing
from MyHTMLParser import MyHTMLParser
import fileinput
import sys
import argparse

argparser = argparse.ArgumentParser(description='Parse an html dump of an IA page for matching links')
#argparser.add_argument('--path', dest='path',
#                       default='/',
#                       help='path to match in links')
args = argparser.parse_args()

parser = MyHTMLParser()
html = sys.stdin.read()
parser.feed(html)
for link in parser.get_details():
    print link
def backupPrivateMessages(myTuenti, email, password):
    printStarting('private messages')
    print '| Fetching the identifiers of your private messages'
    print '| (this will take a while)'
    messages = myTuenti.getInbox(0)
    totalMessages = int(messages[0]['num_threads'])
    keys = []
    maxFill = len(str(totalMessages))
    iters = totalMessages / 10.0
    if math.fmod(iters, 1) != 0.0:
        iters += 1
    iters = int(iters)
    for i in range(0, iters):
        messages = myTuenti.getInbox(i)
        for message in messages[0]['threads']:
            keys.append(message['key'])
        sleep(0.5)
    s = requests.Session()
    r = s.get('https://m.tuenti.com/?m=Login', verify=False)
    csrf = re.findall('name="csrf" value="(.*?)"', r.text)[0]
    data = {
        'csrf': csrf,
        'tuentiemailaddress': email,
        'password': password,
        'remember': 1
    }
    s.post('https://m.tuenti.com/?m=Login&f=process_login', data)
    r = s.get("https://m.tuenti.com/?m=Profile&func=my_profile", verify=False)
    if r.text.find('email') != -1:
        print '| Incorrect e-mail or password'
        raw_input('| Press ENTER to continue')
        return
    rootPath = os.getcwd()
    theJoinPath = os.path.join(rootPath, 'privados')
    if not os.path.exists(theJoinPath):
        print '| Creating the directory where the private messages will be stored...'
        os.makedirs(theJoinPath)
        print '| Directory created'
    os.chdir(theJoinPath)
    counter = 0
    parser = MyHTMLParser()
    for key in keys:
        counter += 1
        percent = 100 * counter / totalMessages
        print '| [' + str(percent) + '%] Downloading message ' + \
            str(counter) + ' of ' + str(totalMessages) + '...'
        urlName = 'https://m.tuenti.com/?m=messaging&func=view_thread&thread_id='
        urlName += key + '&box=inbox&view_full=1'
        r = s.get(urlName, verify=False)
        sleep(0.5)
        parser.setFile(string.zfill(counter, maxFill))
        parser.feed(r.text)
    os.chdir(rootPath)
print path


if __name__ == "__main__":
    urlStr = raw_input("url:")
    urlList = urlStr.split("/")
    pathdir = urlList[len(urlList) - 1]
    v = HttpClient()
    value = v.Get(urlStr, urlStr)
    #r1 = re.compile(r"http://\S*\.jpe*g")
    from MyHTMLParser import MyHTMLParser
    parser = MyHTMLParser()
    value = value.decode('gbk').encode('utf-8')
    print value
    parser.feed(value, "input")
    nodes = parser.get_nodes()
    print nodes
    srcList = []
    for node in nodes:
        for attr in node["attrs"]:
            if attr == "src":
                srcList.append(node["attrs"][attr])
                #print node["attrs"][attr]
    threads = []
    i = 1
    j = len(srcList)
    for t in srcList:
        threads.append(getMyimg(i, j))
)
req.add_header('Referer', 'http://gupiao.jd.com/find/12195')
req.add_header('Host', 'gupiao.jd.com')
req.add_header('Origin', 'http://gupiao.jd.com')
req.add_header(
    'Cookie',
    'TrackID=1zjctpUkfXiPPpd2-FlJw52fq9gkx9v0WGqH_4sECdaGDpJ8D_58Bqx-Bx4HQsVMYTsT5X4AEec9ZtKVXPzJEMA; pinId=EX7C17pLL2_bXrUjzBWQTQ; __jdv=204210054|direct|-|none|-|1531620946230; _jrda=3; sec_flag=e125e94ccd30d095203da363b24adad3; sec_addr=c0a8006c; wlfstk_smdl=uj4fvqhhhqq66p2ddnrgf4vw8a2cggkb; 3AB9D23F7A4B3C9B=XG5I3N4FBWQZLN7HPAC56MKB755NV4K4D6CA6ICAOGCMBJBKMFJPJFYCRFOUFX7YP4IHFLD3YJJESRXWWTFXSHEVFM; __jda=204210054.1495960752486274042302.NaN.1525092662.1531620946.23; __jdb=204210054.10.1495960752486274042302|23.1531620946; __jdc=204210054; __jdu=1495960752486274042302; _jrdb=1531621024187'
)
req.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
response = urllib2.urlopen(req)
string = response.read().replace("\n", "").replace("\t", "").replace(" ", "").replace("%", "")
# print string
htmlParser = MyHTMLParser()
htmlParser.feed(string)
# convert the parser object to JSON
parserDict = htmlParser.__dict__
try:
    parserDict.pop('interesting')
    parserDict.pop('lasttag')
    parserDict.pop('lineno')
    parserDict.pop('offset')
    parserDict.pop('cdata_elem')
    parserDict.pop('rawdata')
    parserDict.pop('_HTMLParser__starttag_text')
    parserDict.pop('index')
    # parserDict['buyNumStart'] = "100.00"
    # parserDict['buyNumEnd'] = "0.00"
# coding=utf-8
import re
import urllib2
from MyHTMLParser import MyHTMLParser

url = 'http://ru.dhgate.com/'

if __name__ == "__main__":
    data = urllib2.urlopen(url).read()
    hp = MyHTMLParser()
    hp.feed(data)
    hp.close()
    for link in hp.links:
        print link
        a = urllib2.urlopen(link).getcode()
        print a, link
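# The MyHTMLParser used by the dhgate snippet above is only imported, never shown.
# A plausible sketch (an assumption, not the original code) of a link collector that
# exposes the .links list iterated over there; written for Python 3, whereas the
# snippet itself is Python 2 (urllib2, print statement):
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Collects the href attribute of every <a> tag seen by feed()."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag's attributes.
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)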
# MyHTMLParser
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print("encountered a start tag: ", tag)

    def handle_endtag(self, tag):
        print("encountered an end tag : ", tag)

    def handle_data(self, data):
        print("encountered some data is : ", data)


parser = MyHTMLParser()
parser.feed('<html><head><title>test</title></head>'
            '<body><h1>parse me!</h1></body></html>')
'''
Current driver: a simple HTML processor which builds a DOM tree.
'''
from MyHTMLParser import MyHTMLParser

mockData = "<html><head><title>This is the</title></head><body><h1>This<br /><>is the <span>header</span></h1></body></html>"

cursor = MyHTMLParser()
cursor.feed(mockData)  # builds tree
cursor.printTree()     # prints tree!


def findElementsByClass(className):
    return cursor.findElementsByClass(className)