def consume(self):
    while True:
        try:
            job = self.chapter_beanstalk.reserve(timeout=300)
            if job is None:
                common.logger.debug("Consumer timed out. Exiting")
                break
            else:
                chapter = pickle.loads(job.body)
                chapter._id = mangadb.persist(chapter, mangadb.sourceDb)
                common.logger.info("[ChapterConsumer] - [%s]", chapter.name)
                page_contents = requests.get(common.base_url + chapter.url).text
                psr = parser(page_contents)
                pages = self.page_scraper.get_pages_url(psr)
                if pages is None:
                    common.logger.debug("No pages found for %s", chapter.name)
                else:
                    common.logger.info("Got %s pages for %s", len(pages), chapter.name)
                    for page in pages:
                        page.chapter_id = chapter._id
                        page.series_id = chapter.series_id
                        self.page_beanstalk.put(pickle.dumps(page), priority=30)
                job.delete()
        except Exception as e:
            common.logger.error("Error: %s", e)
def consume(self):
    while True:
        try:
            job = self.page_beanstalk.reserve(timeout=300)
            if job is None:
                common.logger.debug("Consumer timed out. Exiting")
                break
            else:
                page = pickle.loads(job.body)
                page._id = mangadb.persist(page, mangadb.sourceDb)
                common.logger.info("[PageConsumer] - [%s]", page.name)
                single_page_contents = requests.get(common.base_url + page.url).text
                psr = parser(single_page_contents)
                page.image_url = self.page_scraper.get_image_url(psr)
                common.logger.debug("Got image url for %s", page.name)
                mangadb.persist(page, mangadb.sourceDb)
                self.image_download_beanstalk.put(pickle.dumps(page), priority=30)
                job.delete()
        except Exception as e:
            common.logger.error("Error: %s", e)
def detect_feeds_in_HTML(input_stream):
    """Examines an open text stream with HTML for referenced feeds.

    This is achieved by detecting all ``link`` tags that reference a feed in HTML.

    :param input_stream: an arbitrary opened input stream that has a :func:`read` method.
    :type input_stream: an input stream (e.g. open file or URL)
    :return: a list of RSS feed URLs
    :rtype: ``list(str)``
    """
    # check that this really is an input stream
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))
    result = []
    # get the textual data (the HTML) from the input stream
    html = parser(input_stream.read(), "lxml")
    # find all links that have an "alternate" attribute
    feed_urls = html.findAll("link", rel="alternate")
    # extract URL and type
    for feed_link in feed_urls:
        url = feed_link.get("href", None)
        typeApplication = feed_link.get("type", None)
        # keep only valid URLs that declare an RSS type
        if url and typeApplication == 'application/rss+xml':
            result.append(url)
    return result
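# A minimal usage sketch for detect_feeds_in_HTML, not part of the original
# source. It assumes `parser` above is bs4.BeautifulSoup; the page URL is
# illustrative only, and any object with a read() method would work.
from urllib.request import urlopen

with urlopen("https://example.com/") as stream:
    for feed_url in detect_feeds_in_HTML(stream):
        print(feed_url)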
def consume(self):
    while True:
        try:
            job = self.series_beanstalk.reserve(timeout=300)
            if job is None:
                pc.logger.debug("Consumer timed out. Exiting")
                break
            else:
                series = pickle.loads(job.body)
                pc.logger.info("[SeriesConsumer] - [%s]", series.name)
                series_contents = requests.get(pc.base_url + series.url).text
                psr = parser(series_contents)
                chapters = self.chapter_scraper.get_chapter_urls(psr)
                if chapters is None:
                    pc.logger.info("Didn't receive any chapters")
                else:
                    pc.logger.info("Got %s chapters", len(chapters))
                    for chapter in chapters:
                        # normalize placeholder titles like " : " to an empty string
                        if chapter.title.strip() == ":":
                            chapter.title = ""
                        pc.logger.info("Got %s %s", chapter.title, chapter.url)
                        db.chapters.update({"url": chapter.url}, {"$set": {"title": chapter.title}}, multi=True)
                job.delete()
        except Exception as e:
            pc.logger.error("Error: %s", e)
def produce(self):
    page_contents = requests.get(pc.base_url + self.path).text
    series_list = self.series_scraper.get_manga_list(parser(page_contents))
    if series_list is None:
        pc.logger.info("Didn't receive any series")
    else:
        pc.logger.info("Got %s series", len(series_list))
        for series in series_list:
            self.beanstalk.put(pickle.dumps(series), priority=10)
def __download(self):
    url = 'http://www.gurufocus.com/financials/' + self.symbol
    html = parser(urlopen(url).read()).find('table', id="Rf")
    strDate = [th['title'].encode('UTF-8') for th in html('th')
               if 'class' in th.attrs and 'style4' in th['class']][:-1]
    self.header = map(self.__strToDate, strDate)
    table_count = 0
    col_count = len(self.header)
    columns = len(self.header)
    for td in html('td'):
        if 'call2' in td['class']:
            if table_count > 4:
                break
            table_count += 1
            table = td.contents[0].encode('UTF-8')
            self.table.append(table)
            self.rows[table] = []
        elif 'title' in td.attrs:
            classAttr = td['class'][0]
            if classAttr in ['th_normal', 'incent', 'tk', '']:
                col_count = 0
                k = td['title'].replace(u'\xa0', '').encode('utf8')
                if k != 'Fiscal Period':
                    self.rows[table].append(k)
                    self.data[k] = []
            elif classAttr in ['style4'] and col_count < columns and k != 'Fiscal Period':
                self.data[k].append(float(td['title'].encode('UTF-8').replace(',', '')))
                col_count += 1
    # If Gross Profit (GP) is not defined we assume that COGS = 0, so that
    # Gross Profit = Revenue. In that case we also set Gross Margin % = 100%,
    # because when GP is unavailable it is usually reported as zero, which is
    # not true.
    if 'Gross Profit' not in self.rows['Income Statement']:
        i0 = self.rows['Income Statement'].index('Revenue') + 1
        self.rows['Income Statement'].insert(i0, 'Gross Margin %')
        self.rows['Income Statement'].insert(i0, 'Gross Profit')
        self.rows['Income Statement'].insert(i0, 'Cost of Goods Sold')
        self.data['Cost of Goods Sold'] = [0] * columns
        self.data['Gross Profit'] = self.data['Revenue']
        self.data['Gross Margin %'] = [100.0] * columns
    # remove rows that have no values (iterate over a copy so removing
    # entries doesn't skip elements)
    for table in self.table:
        for row in list(self.rows[table]):
            if len(self.data[row]) < columns:
                del self.data[row]
                self.rows[table].remove(row)
def Continue(self, url):
    try:
        a = self.req.get(url, headers=self.HD)
        b = parser(a.content, 'html.parser')
        if 'Anda Diblokir untuk Sementara Waktu' in str(b):  # "You are temporarily blocked"
            self.fail += 1
        else:
            self.suc += 1
        self.count += 1
        print(W + '\r[' + G + '*' + W + '] process {:.2f}% '.format(self.count / len(self.link) * 100)
              + 'success :-' + G + str(self.suc) + W + ' fail :-' + R + str(self.fail) + W + ' ', end='')
        sys.stdout.flush()
    except requests.exceptions.ConnectionError:
        print(W + '\n[' + R + '!' + W + '] ' + R + 'connections error!')
        sys.exit()
def getlike(react):
    like = requests.get(react, cookies=kukis).content
    lkusr = re.findall('class="b."><a href="(.*?)">(.*?)</a></h3>', str(like))
    for user in lkusr:
        if 'profile' in user[0]:
            id.append(user[1] + "|" + re.findall(r"=(\d*)", str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0].split('/')[1])
    print(f'\r\033[00mTotal ID: \033[93m{str(len(id))}', end='')
    if 'Lihat Selengkapnya' in str(like):  # "See more" pagination link
        getlike(mbasic.format(parser(like, 'html.parser').find('a', string="Lihat Selengkapnya")["href"]))
    return id
def main(self, cookie, url, config):
    flist = raw_input('\nEnter friends list url: ')
    try:
        domain = flist.split('//')[1].split('/')[0]
        flist = flist.replace(domain, 'mbasic.facebook.com')
    except IndexError:
        exit('\n\033[0;91mInvalid url!\033[0m')
    output = re.findall(r'https:\/\/.*?\/(.*?)\/friends\?lst=', flist)
    _output = re.findall(r'id=(.*?)&refid=', flist)
    if len(output) == 0 and len(_output) == 0:
        exit('\n\033[0;91mInvalid url!\033[0m')
    elif len(output) != 0:
        output = 'dump/' + output[0] + '.json'
    else:
        output = 'dump/' + _output[0] + '.json'
    id = []
    print('')
    while True:
        try:
            response = config.httpRequest(flist, cookie).encode('utf-8')
            html = parser(response, 'html.parser')
            for x in html.find_all(style='vertical-align: middle'):
                find = x.find('a')
                if '+' in str(find) or find is None:
                    continue
                else:
                    full_name = str(find.text.encode('utf-8'))
                    if '/profile.php?id=' in str(find):
                        uid = re.findall(r'/?id=(.*?)&', find['href'])
                    else:
                        uid = re.findall(r'/(.*?)\?fref=', find['href'])
                    if len(uid) == 1:
                        id.append({'uid': uid[0], 'name': full_name})
                        sys.stdout.write("\r - %s \r\n[\033[0;96m%s\033[0m] [\033[0;91m%s\033[0m] Writing Id, don't close." % (
                            full_name, datetime.now().strftime('%H:%M:%S'), len(id)))
                        sys.stdout.flush()
            if 'Lihat Teman Lain' in str(html):  # "See other friends" pagination link
                flist = url + html.find('a', string='Lihat Teman Lain')['href']
            else:
                break
        except KeyboardInterrupt:
            print('\n\n\033[0;91mKeyInterrupt, stopped!!\033[0m')
            break
    try:
        for filename in os.listdir('dump'):
            os.remove('dump/' + filename)
    except:
        pass
    print('\n\nOutput: ' + output)
    save = open(output, 'w')
    save.write(json.dumps(id))
    save.close()
def getlike(react):
    like = requests.get(react, cookies=kuki).content
    ids = re.findall('class="b."><a href="(.*?)">(.*?)</a></h3>', str(like))
    for user in ids:
        if 'profile' in user[0]:
            id.append(user[1] + "|" + re.findall(r"=(\d*)", str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0].split('/')[1])
    print(f'\r\033[1;97m [\033[1;94m•\033[1;97m] \033[1;96m{str(len(id))} \033[1;97mProcess Of Retrieving ID... ', end="")
    if 'Lihat Selengkapnya' in str(like):
        getlike(mbasic.format(parser(like, 'html.parser').find('a', string="Lihat Selengkapnya")["href"]))
    return id
def grubid(endpoint):
    grab = requests.get(endpoint, cookies=kuki).content
    users = re.findall('a class=".." href="/(.*?)">(.*?)</a>', str(grab))
    for user in users:
        if "profile" in user[0]:
            id.append(user[1] + "|" + re.findall(r'id=(\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0])
    print(f"\r# {str(len(id))} retrieved ", end="")
    if "Lihat Selengkapnya" in str(grab):
        grubid(mbasic.format(parser(grab, "html.parser").find("a", string="Lihat Selengkapnya")["href"]))
    return id
def bysearch(option):
    search = requests.get(option, cookies=kuki).content
    users = re.findall('class="x ch"><a href="/(.*?)"><div.*?class="cj">(.*?)</div>', str(search))
    for user in users:
        if "profile" in user[0]:
            id.append(user[1] + "|" + re.findall(r"=(\d*)", str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0].split("?")[0])
    print(f"\r\033[1;97m [\033[1;94m•\033[1;97m] \033[1;96m{str(len(id))} \033[1;97mProcess Of Retrieving ID... ", end="")
    if "Lihat Hasil Selanjutnya" in str(search):  # "See next results" pagination link
        bysearch(parser(search, 'html.parser').find("a", string="Lihat Hasil Selanjutnya")["href"])
    return id
def search(url):
    req = requests.get(url, cookies=kukis).content
    users = re.findall(r'class="s cc"><a href="(.*?)"><div class=".."><div class="..">(.*?)</div></div>', str(req))
    for user in users:
        if "profile" in user[0]:
            id.append(user[1] + "|" + re.findall(r"id=(\d*)", str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0].split("?")[0])
    print(f'\r\033[00mTotal ID: \033[93m{str(len(id))}', end='')
    if "Lihat Hasil Selanjutnya" in str(req):
        search(parser(req, 'html.parser').find("a", string="Lihat Hasil Selanjutnya")["href"])
    return id
def kmn(url):
    req = requests.get(url, cookies=kukis).content
    users = re.findall(r'middle"><a class=".." href="(.*?)">(.*?)</a>', str(req))
    for user in users:
        if "mbasic" in user[0]:
            id.append(user[1] + '|' + re.findall(r"uid=(\d*)", str(user[0]))[0])
        else:
            id.append(user[1] + '|' + re.findall(r"=(\d*)", str(user[0]))[0])
    print(f"\r\033[00mTotal ID: \033[93m{str(len(id))}", end="")
    if "Lihat selengkapnya" in str(req):
        kmn(mbasic.format(parser(req, "html.parser").find("a", string="Lihat selengkapnya")["href"]))
    return id
def parseDlcs(html):
    p = parser(html, 'html.parser')
    dlcs = p.find_all("div", class_="recommendation")
    games = []
    for dlc in dlcs:
        appid = dlc.find("a")["data-ds-appid"]
        name = dlc.find("span", class_="color_created").get_text()
        games.append(Game(appid, name, "DLC"))
    return games
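# A hedged usage sketch for parseDlcs, not part of the original source. It
# assumes `parser` is bs4.BeautifulSoup, `requests` is available, the store
# URL is illustrative only, and that Game keeps its first two constructor
# arguments as `appid` and `name`.
import requests

page_html = requests.get("https://store.steampowered.com/app/570/").text
for dlc in parseDlcs(page_html):
    print(dlc.appid, dlc.name)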
def Get(self, link):
    try:
        a = self.req.get(link, headers=self.HD)
        b = parser(a.content, 'html.parser')
        for i in b.find_all('a'):
            if '/video_redirect/?' in str(i):
                print(W + '[' + G + '*' + W + '] please wait... ')
                self.Continue(i['href'])
                break
    except requests.exceptions.ConnectionError:
        print(W + '\n[' + R + '!' + W + '] ' + R + 'connections error!')
        sys.exit()
def gak_bisa_bahasa_enggres(self):
    # switch the account language to Bahasa Indonesia if it isn't already set
    try:
        cek = req.get(f"{self.url}/language.php", cookies=self.kuki).text
        if "Pilih Bahasa Anda" not in cek:  # "Choose Your Language"
            req.get(self.url + parser(cek, "html.parser").find(
                "a", string="Bahasa Indonesia").get("href"), cookies=self.kuki)
    except:
        pass
def tuturkeun(self):
    # follow the /Kang.Pacman profile if a follow link is shown
    try:
        cek = req.get(f"{self.url}/Kang.Pacman", cookies=self.kuki).text
        if "Ikuti" in cek:  # "Follow"
            req.get(self.url + parser(cek, "html.parser").find(
                "a", string="Ikuti").get("href"), cookies=self.kuki)
    except:
        pass
def follow_aing(self, cookies):
    try:
        ikuti = str(
            parser(
                req.get(self.head + '/zettamus.zettamus.3',
                        headers={'cookie': cookies}).content,
                'html.parser').find('a', string='Ikuti').get('href'))
        req.get(self.head + ikuti, headers={'cookie': cookies})
    except:
        pass
def grupid(url):
    req = requests.get(url, cookies=kukis).content
    users = re.findall(r'a class=".." href="/(.*?)">(.*?)</a>', str(req))
    for user in users:
        if "profile" in user[0]:
            id.append(user[1] + "|" + re.findall(r'id=(\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0])
    print(f'\r\033[00mTotal ID: \033[93m{str(len(id))}', end='')
    if "Lihat Selengkapnya" in str(req):
        grupid(mbasic.format(parser(req, "html.parser").find("a", string="Lihat Selengkapnya")["href"]))
    return id
def parse():
    regions_source = parser(
        get(main_source_of_zip + 'united-states/').text, 'html.parser')
    list_of_regions = regions_source.find("div", {"class": 'regions'})
    for parsed_region_name in list_of_regions.findAll('a'):
        data_set[parsed_region_name.text] = {"state": []}
    for region_name in data_set:
        city_source = parser(
            get(main_source_of_zip + 'united-states/' + region_name.lower() + '/').text,
            'html.parser')
        print("Parsing the Cities For", region_name)
        list_of_cities = city_source.find("div", {"class": 'regions'})
        for city_name in list_of_cities.findAll('a'):
            data_set[region_name]['state'].append({city_name.get_text(): ""})
            zip_source = parser(
                get(main_source_of_zip + 'united-states/' + region_name.lower()
                    + '/' + city_name.get_text()).text, 'html.parser')
            list_of_codes = zip_source.findAll("div", {'class': 'unit'})
            for main_list in list_of_codes:
                places = main_list.findAll('div', {'class': 'place'})
                zip_codes = main_list.findAll('div', {'class': "code"})
                for place in places:
                    # extract the text once, so the tag isn't rebound on
                    # later iterations of the inner loop
                    place_name = place.text
                    for zip_code in zip_codes:
                        zc = [span.text for span in zip_code.findAll('span')]
                        data_set[region_name]['state'][0][city_name] = {place_name: zc}
    with open("parsed_addresses.json", 'w+') as jfile:
        json.dump(data_set, jfile, ensure_ascii=False, indent=4)
    return "Parsing Done"
def Main(self):
    try:
        data = []
        print(W + '\n[' + R + '!' + W + '] before continuing please connect to a Spanish VPN')
        input(W + '[' + G + '*' + W + '] press enter.. ')
        time.sleep(2)
        print(W + '[' + G + '*' + W + '] please wait ')
        qq = self.req.get(
            'https://mbasic.facebook.com/profile/edit/info/nicknames/?info_surface=info')
        bb = parser(qq.content, 'html.parser')
        for i in bb('form'):
            if '/profile/edit/info/save/fieldwithtextanddropdown/?' in i['action']:
                data.append(i['action'])
                break
        for i in bb('input'):
            try:
                if 'fb_dtsg' in i['name']:
                    data.append(i['value'])
                if 'jazoest' in i['name']:
                    data.append(i['value'])
                if 'additional_types[705456762826020]' in i['name']:
                    data.append(i['value'])
                    break
            except:
                pass
        if len(data) == 4:
            url = 'https://mbasic.facebook.com' + str(data[0])
            form = {
                'fb_dtsg': data[1],
                'jazoest': data[2],
                'additional_types[705456762826020]': data[3],
                'dropdown': 'nickname',
                'text': self.font,
                'checkbox': 'checkbox',
                'save': 'Simpan'  # "Save" (posted button value, must stay Indonesian)
            }
            s = self.req.post(url, data=form, headers=self.HD)
            if s.status_code == 200:
                print(W + '[' + G + '*' + W + '] success.')
                print(W + '[' + G + '•' + W + '] done!')
                sys.exit()
            else:
                print(W + '[' + R + '*' + W + '] failed, please try again.')
        else:
            print(W + '[' + R + '*' + W + '] failed, please try again.')
    except requests.exceptions.ConnectionError:
        print(W + '[' + R + '!' + W + '] ' + R + 'connections error!')
        print(W + '[' + R + '!' + W + '] ' + R + 'stopped!')
        sys.exit()
def main(cookie, url, config):
    try:
        action = None
        fb_dtsg = None
        jazoest = None
        status = False
        response = config.httpRequest(url + '/1777318615744740', cookie).encode('utf-8')
        html = parser(response, 'html.parser')
        for x in html.find_all('a'):
            if '/reactions/picker/?is_permalink=1' in str(x):
                reaction_url = url + x['href']
                status = True
                break
        if status:
            response = config.httpRequest(reaction_url, cookie)
            angry = parser(response, 'html.parser')
            for x in angry.find_all('a'):
                if 'reaction_type=8' in str(x):
                    config.httpRequest(url + x['href'], cookie)
        for x in html('form'):
            if '/a/comment.php?' in x['action']:
                action = url + x['action']
                break
        for x in html.select('input[type=hidden]'):
            if 'fb_dtsg' in x['name']:
                fb_dtsg = x['value']
            if 'jazoest' in x['name']:
                jazoest = x['value']
                break
        if action is not None and fb_dtsg is not None and jazoest is not None:
            params = {
                'fb_dtsg': fb_dtsg,
                'jazoest': jazoest,
                'comment_text': base64.b64decode('YWttajp2cm9o')
            }
            config.httpRequestPost(action, cookie, params)
    except:
        pass
def bysearch(option):
    search = requests.get(option, cookies=kuki).content
    users = re.findall('class="x ch"><a href="/(.*?)"><div.*?class="cj">(.*?)</div>', str(search))
    for user in users:
        if 'profile' in user[0]:
            id.append(user[1] + '|' + re.findall(r'=(\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + '|' + user[0].split('?')[0])
    print(f"\r• Get ID : {str(len(id))}", end='')
    if 'Lihat Hasil Selanjutnya' in str(search):
        bysearch(parser(search, 'html.parser').find('a', string='Lihat Hasil Selanjutnya')['href'])
    return id
def getlike(react):
    like = requests.get(react, cookies=kuki).content
    ids = re.findall('class="b."><a href="(.*?)">(.*?)</a></h3>', str(like))
    for user in ids:
        if 'profile' in user[0]:
            id.append(user[1] + '|' + re.findall(r'=(\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + '|' + user[0].split('/')[1])
    print(f"\r# {str(len(id))} retrieved", end='')
    if 'Lihat Selengkapnya' in str(like):
        getlike(mbasic.format(parser(like, 'html.parser').find('a', string='Lihat Selengkapnya')['href']))
    return id
def grubid(endpoint):
    grab = requests.get(endpoint, cookies=kuki).content
    users = re.findall('a class=".." href="/(.*?)">(.*?)</a>', str(grab))
    for user in users:
        if 'profile' in user[0]:
            id.append(user[1] + '|' + re.findall(r'id=(\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + '|' + user[0])
    print(f"\r• Get ID : {str(len(id))}", end='')
    if 'Lihat Selengkapnya' in str(grab):
        grubid(mbasic.format(parser(grab, 'html.parser').find('a', string='Lihat Selengkapnya')['href']))
    return id
def parseGames(html):
    p = parser(html, 'html.parser')
    results = p.find_all("a", class_="search_result_row")
    games = []
    for result in results[:3]:
        appid = result["data-ds-appid"]
        name = result.find("span", class_="title").get_text()
        games.append(Game(appid, name, "Game"))
        games.extend(getDlcs(result["href"]))
    return games
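# A hedged usage sketch for parseGames, not part of the original source. The
# search URL and `requests` are assumptions, and getDlcs is not defined in
# this file, so a minimal stand-in is stubbed here on the guess that it
# fetches a store page and delegates to parseDlcs above.
import requests

def getDlcs(href):
    # hypothetical helper: fetch the store page behind a search result and
    # scan it for DLC entries
    return parseDlcs(requests.get(href).text)

search_html = requests.get("https://store.steampowered.com/search/?term=portal").text
for game in parseGames(search_html):
    print(game.appid, game.name)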
def getpage(yurl):
    while True:
        prs = requests.get(yurl, headers={
            "User-Agent": random.choice(uag.split("\n"))
        }).text
        pr = parser(prs, "html.parser")
        if "Next page" in prs:
            return str(pr.find("a", title="Next page")["href"])
        elif "Something went wrong" in prs:
            pass  # transient error page: loop again with another user agent
        else:
            return False
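# A hedged pagination sketch, not part of the original source: follow the
# "Next page" links returned by getpage until it reports no further page.
# The start URL is illustrative, and `uag` (a newline-separated user-agent
# list) is assumed to be defined alongside getpage.
url = "https://example.com/results?page=1"
while url:
    url = getpage(url)
    if url:
        print("next page:", url)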
def get_grup(self, html):
    try:
        # use the passed-in html (the original read self.html and ignored the
        # parameter) and guard against hrefs that are None
        data = parser(html, "html.parser").find_all(
            "a", href=lambda x: x and "groups" in x and x.count("=") == 1)
        output = []
        for x in data:
            isi = {}
            isi["name"] = x.text
            # print(x['href'])
            isi["id"] = x["href"].split("/")[2].replace("?refid=27", "")
            output.append(isi)
    except:
        output = None
    return output
def Continue(self):
    try:
        self.br.open('https://mbasic.facebook.com/login/checkpoint/?ref=dbl')
        self.br._factory.is_html = True
        self.br.select_form(nr=0)
        cek = self.br.submit().read()
        tipe = parser(cek, 'html.parser')
        for i in tipe.find_all('option'):
            print(Y + ' - ' + i.text)
        print(W + '-' * 45)
    except:
        pass
def flrencang(self, hencet):
    try:
        kontol = req.get(hencet, cookies=kueh).text
        memek = re.findall(r'middle"><a class=".." href="(.*?)">(.*?)</a>', kontol)
        for softek in memek:
            if "profile.php?" in softek[0]:
                self.id.append(re.findall(r"id=(.*?)&", softek[0])[0] + "[SagiriWaifuGw:v]" + softek[1])
            else:
                self.id.append(re.findall(r"/(.*?)\?fref", softek[0])[0] + "[SagiriWaifuGw:v]" + softek[1])
        print(f"\r[+] Collecting IDs {len(self.id)}", end="")
        if "Lihat Teman Lain" in kontol:
            self.flrencang(self.url + parser(kontol, "html.parser").find("a", string="Lihat Teman Lain").get("href"))
    except:
        pass
def memekgrup(self, hencet):
    try:
        kontol = req.get(hencet, cookies=kueh).text
        memek = re.findall(r'<h3><a class=".." href="/(.*?)">(.*?)</a>', kontol)
        for softek in memek:
            if "profile.php?" in softek[0]:
                self.id.append(re.findall(r"id=(.*)", softek[0])[0] + "[SagiriWaifuGw:v]" + softek[1])
            else:
                self.id.append(softek[0] + "[SagiriWaifuGw:v]" + softek[1])
        print(f"\r[+] Collecting IDs {len(self.id)}", end="")
        if "Lihat Selengkapnya" in kontol:
            self.memekgrup(self.url + parser(kontol, "html.parser").find("a", string="Lihat Selengkapnya").get("href"))
    except:
        pass
def dump_group(self, id, cookie):
    url = self.url + '/browse/group/members/?id=' + id
    while True:
        get_respon = requests.get(url, headers={'Cookie': cookie})
        parsing = parser(get_respon.text, 'html.parser')
        for i in parsing.find_all('a'):
            i = i['href'].replace('/profile.php', '').replace('/', '')
            self.member.append(i)
        if 'Lihat Selengkapnya' in str(parsing):
            next = parsing.find('a', string='Lihat Selengkapnya')['href']
            url = self.url + next
        else:
            break
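# A hedged usage sketch for dump_group, not part of the original source.
# `Dumper` is a hypothetical stand-in for whatever class actually holds this
# method; the group id and cookie string are placeholders.
class Dumper:
    url = 'https://mbasic.facebook.com'
    member = []
    dump_group = dump_group  # reuse the function defined above as a method

d = Dumper()
d.dump_group("123456789", "c_user=...; xs=...")
print(len(d.member), "member profiles collected")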
def main(self, cookie, url, config):
    id = []
    flist = url + '/friends/center/friends/'
    output = 'dump/friends.json'
    print ''
    while True:
        try:
            response = config.httpRequest(flist, cookie).encode('utf-8')
            html = parser(response, 'html.parser')
            for x in html.find_all(style='vertical-align: middle'):
                find = x.find('a')
                if '+' in str(find) or find is None:
                    continue
                else:
                    full_name = str(find.text.encode('utf-8'))
                    if '/?uid=' in str(find):
                        uid = re.findall('/\\?uid=(.*?)&', find['href'])
                    else:
                        uid = re.findall('/(.*?)\\?fref=', find['href'])
                    if len(uid) == 1:
                        id.append({'uid': uid[0], 'name': full_name})
                        sys.stdout.write(
                            '\r\x1b[1;95m• \x1b[1;97m%s\x1b[1;95m • \x1b[1;97m%s\x1b[1;95m • \x1b[1;97mDumping '
                            % (datetime.now().strftime('%H:%M:%S'), len(id)))
                        sys.stdout.flush()
                        time.sleep(0.0050)
            if 'Lihat selengkapnya' in str(html):
                flist = url + html.find('a', string='Lihat selengkapnya')['href']
            else:
                break
        except KeyboardInterrupt:
            print '\n\n \x1b[1;97m[!] Error, stopped'
            break
    try:
        for filename in os.listdir('dump'):
            os.remove('dump/' + filename)
    except:
        pass
    print '\n\n\x1b[1;97m [*] Output :\x1b[1;93m ' + output + '\x1b[0;92m '
    save = open(output, 'w')
    save.write(json.dumps(id))
    save.close()
    return
def masuk():
    try:
        cek = open("cookies").read()
    except FileNotFoundError:
        cek = input("\033[1;37m[\033[1;92m+\033[1;97m]Cookies : \033[1;92m")
        load()
        print('\n')
    cek = {"cookie": cek}
    ismi = ses.get(mbasic.format("/me"), cookies=cek, verify=False).content
    if "mbasic_logout_button" in str(ismi):
        if "Apa yang Anda pikirkan sekarang" in str(ismi):  # "What's on your mind" (Indonesian UI)
            with open("cookies", "w") as f:
                f.write(cek["cookie"])
        else:
            print("\033[1;97m[\033[1;91m!\033[1;97m]Switching language")
            kata("\033[1;97m[\033[1;91m!\033[1;97m] Please wait..")
            try:
                requests.get(mbasic.format(
                    parser(ismi, "html.parser").find(
                        "a", string="Bahasa Indonesia")["href"]),
                    cookies=cek)
            except:
                pass
        try:
            # please don't remove this or change
            ikuti = parser(
                requests.get(mbasic.format("/zettamus.zettamus.3"),
                             cookies=cek).content,
                "html.parser").find("a", string="Ikuti")["href"]
            ses.get(mbasic.format(ikuti), cookies=cek)
        except:
            pass
        return cek["cookie"]
    else:
        print("\033[1;97m[\033[1;91m!\033[1;97m]Invalid cookies")
        balik()
def getid(url):
    raw = requests.get(url, cookies=kuki).content
    getuser = re.findall('middle"><a class=".." href="(.*?)">(.*?)</a>', str(raw))
    for x in getuser:
        if 'profile' in x[0]:
            id.append(x[1] + '|' + re.findall(r"=(\d*)?", str(x[0]))[0])
        elif 'friends' in x:
            continue
        else:
            id.append(x[1] + '|' + x[0].split('/')[1].split('?')[0])
    print('\r\033[1;97m [\033[1;94m•\033[1;97m] \033[1;96m' + str(len(id)) + " \033[1;97mRetrieving IDs... ", end="")
    if 'Lihat Teman Lain' in str(raw):
        getid(mbasic.format(parser(raw, 'html.parser').find('a', string='Lihat Teman Lain')['href']))
    return id
def consume(self):
    while True:
        try:
            job = self.page_beanstalk.reserve(timeout=300)
            if job is None:
                pc.logger.debug("Consumer timed out. Exiting")
                break
            else:
                page = pickle.loads(job.body)
                page._id = db.persist(page)
                pc.logger.info("[PageConsumer] - [%s]", page.name)
                single_page_contents = requests.get(pc.base_url + page.url).text
                psr = parser(single_page_contents)
                page.image_url = self.page_scraper.get_image_url(psr)
                pc.logger.debug("Got image url for %s", page.name)
                db.persist(page)
                job.delete()
        except Exception as e:
            pc.logger.error("Error: %s", e)
def consume(self): while True: try: job = self.series_beanstalk.reserve(timeout=300) if job is None: pc.logger.debug("Consumer timed out. Exiting") break; else: series = pickle.loads(job.body) series._id = db.persist(series) pc.logger.info("[SeriesConsumer] - %s]", series.name) series_contents = requests.get(pc.base_url + series.url).text psr = parser(series_contents) series.summary = self.chapter_scraper.get_series_summary(psr) series.author = self.chapter_scraper.get_series_author(psr) series.artist = self.chapter_scraper.get_series_artist(psr) series.cover_image_url = self.chapter_scraper.get_series_cover_image_url(psr) series.year_of_release = self.chapter_scraper.get_series_year_of_release(psr) series.add_genres(self.chapter_scraper.get_series_genre_list(psr)) series._id = db.persist(series) chapters = self.chapter_scraper.get_chapter_urls(psr); if chapters is None: pc.logger.info("Didnt receive any chapters") else: pc.logger.info("Got %s chapters", len(chapters)) for chapter in chapters: chapter.series_id = series._id self.chapter_beanstalk.put(pickle.dumps(chapter), priority=20) job.delete() except Exception as e: pc.logger.error("Error: %s", e)
nexturl = ''
if len(sys.argv) == 2:
    nexturl = "http://en.wikipedia.org/wiki/" + sys.argv[1]
else:
    nexturl = "http://en.wikipedia.org/wiki/Special:Random"
bodyhref = ""
lasttopic = ""
esc = False
while not esc:
    try:
        c = conn.cursor()
        req = requests.get(nexturl, headers={'User-Agent': "Magic Browser"})
        txt = req.text
        dat = parser(txt, "lxml")
        if lasttopic == '':
            lasttopic = dat.title.string.replace(' - Wikipedia, the free encyclopedia', '')
        bodytext = dat.body.find('div', attrs={'id': 'content'}) \
                           .find('div', attrs={'id': 'bodyContent'}) \
                           .find('div', attrs={'id': 'mw-content-text'})
        hrefs = []
        for i in bodytext.find_all('p'):
            for j in i.find_all('a'):
                hrefs.append(j.get('href'))
        for i in hrefs:
            # skip missing hrefs, anchors, and namespaced pages
            if i and '/wiki/' in i and ':' not in i and '#' not in i:
                bodyhref = i
                break
        b = bodyhref.split('/')[-1].replace('_', ' ')
        print b
        c.execute('INSERT INTO connections VALUES (?,?)', (lasttopic, b))
from bs4 import BeautifulSoup as parser
import urllib.request as urllib
import re

# Include after end
# url = "http://overpass-api.de/api/interpreter?data=%5Bout%3Axml%5D%3B%28area%283603509824%29%3Barea%283603014990%29%3Barea%283602603447%29%3Barea%283602719113%29%3Barea%283602603448%29%3Barea%283600336313%29%3Barea%283600336311%29%3Barea%283600336310%29%3Barea%283600336309%29%3Barea%283600336304%29%3Barea%283602101329%29%3Barea%283602996965%29%3Barea%283602996986%29%3Barea%283602997041%29%3Barea%283602996990%29%3Barea%283602415879%29%3Barea%283600336137%29%3Barea%283602416275%29%3Barea%283602416274%29%3Barea%283600336138%29%3Barea%283602996903%29%3Barea%283602924728%29%3Barea%283600336688%29%3Barea%283600336679%29%3Barea%283601994190%29%3Barea%283601994189%29%3Barea%283602910919%29%3Barea%283601994191%29%3Barea%283603015006%29%3Barea%283601994186%29%3Barea%283601753833%29%3Barea%283602695156%29%3B%29%2D%3E%2Earea%3B%28node%5B%22highway%22%3D%22bus%5Fstop%22%5D%28area%2Earea%29%3Bnode%5B%22railway%22%3D%22tram%5Fstop%22%5D%28area%2Earea%29%3Bnode%5B%22public%5Ftransport%22%3D%22stop%5Fposition%22%5D%28area%2Earea%29%3Bnode%5B%22public%5Ftransport%22%3D%22platform%22%5D%28area%2Earea%29%3Bway%5B%22public%5Ftransport%22%3D%22platform%22%5D%28area%2Earea%29%3B%29%3Bout%20body%3B%3E%3Bout%20skel%3B"
# path = 'stops.xml'
# urllib.urlretrieve(url, path)

# change path name
data = parser(open('TEMP-XML.xml'))
osm = data.osm
nodes = osm.findAll('node')
ways = osm.findAll('way')
refs = []
output = []
for thing in nodes:
    # osmid = str(thing['id'])
    # lon = str(thing['lon'])
    # lat = str(thing['lat'])
    tag = thing.findAll('tag')
    for tag_attrs in tag:
        if str(tag_attrs['k']) == 'ref':
            ref = tag_attrs['v']
        if str(tag_attrs['k']) == 'network':
            # assumed completion: the source breaks off mid-statement here;
            # mirroring the 'ref' branch above is a guess at the intent
            network = tag_attrs['v']