def consume(self):
		while True:
			try:
				job = self.chapter_beanstalk.reserve(timeout=300)
				if job is None:
					common.logger.debug("Consumer timed out. Exiting")
					break
				else:
					chapter = pickle.loads(job.body)
					chapter._id = mangadb.persist(chapter, mangadb.sourceDb)
					common.logger.info("[ChapterConsumer] - %s]", chapter.name)

					page_contents = requests.get(common.base_url + chapter.url).text
					psr = parser(page_contents)

					pages = self.page_scraper.get_pages_url(psr)

					if pages is None:
						common.logger.debug("No pages found for %s", chapter.name)
					else:
						common.logger.info("Got %s pages for %s", len(pages), chapter.name)
						for page in pages:
							page.chapter_id = chapter._id
							page.series_id = chapter.series_id
							self.page_beanstalk.put(pickle.dumps(page), priority=30)

					job.delete()
			except Exception as e:
				common.logger.error("Error: %s", e)
	def consume(self):
		while True:
			try:
				job = self.page_beanstalk.reserve(timeout=300)
				if job is None:
					common.logger.debug("Consumer timed out. Exiting")
					break
				else:
					page = pickle.loads(job.body)
					page._id = mangadb.persist(page, mangadb.sourceDb)
					common.logger.info("[PageConsumer] - %s]", page.name)

					single_page_contents = requests.get(common.base_url + page.url).text
					psr = parser(single_page_contents)

					page.image_url = self.page_scraper.get_image_url(psr)

					common.logger.debug("Got image url for %s", page.name)
					mangadb.persist(page, mangadb.sourceDb)

					self.image_download_beanstalk.put(pickle.dumps(page), priority=30)
					
					job.delete()
			except Exception as e:
				common.logger.error("Error: %s", e)
def detect_feeds_in_HTML(input_stream):
    """ examines an open text stream with HTML for referenced feeds.

    This is achieved by detecting all ``link`` tags that reference a feed in HTML.

    :param input_stream: an arbitrary opened input stream that has a :func:`read` method.
    :type input_stream: an input stream (e.g. open file or URL)
    :return: a list of URLs of the referenced RSS feeds
    :rtype: ``list(str)``
    """
    # check if really an input stream
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))
    result = []
    # get the textual data (the HTML) from the input stream
    html = parser(input_stream.read(),"lxml")
    # find all links that have an "alternate" attribute
    feed_urls = html.findAll("link", rel="alternate")
    # extract URL and type
    for feed_link in feed_urls:
        url = feed_link.get("href", None)
        typeApplication = feed_link.get("type", None)
        # if a valid URL is there
        if url:
            if (typeApplication == 'application/rss+xml'):
                result.append(url)
    return result
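# A minimal usage sketch, not from the original source: detect_feeds_in_HTML only
# needs an object exposing .read(), so an opened file or a urlopen() response both
# work (this assumes BeautifulSoup is imported as `parser` and lxml is installed,
# as in the surrounding examples). The URL below is purely illustrative.
from urllib.request import urlopen

with urlopen("https://example.com/") as stream:
    rss_urls = detect_feeds_in_HTML(stream)
    print(rss_urls)  # e.g. ["https://example.com/feed.xml"]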
	def consume(self):
		while True:
			try:
				job = self.series_beanstalk.reserve(timeout=300)
				if job is None:
					pc.logger.debug("Consumer timed out. Exiting")
					break
				else:
					series = pickle.loads(job.body)
					pc.logger.info("[SeriesConsumer] - [%s]", series.name)

					series_contents = requests.get(pc.base_url + series.url).text
					psr = parser(series_contents)

					chapters = self.chapter_scraper.get_chapter_urls(psr)

					if chapters is None:
						pc.logger.info("Didnt receive any chapters")
					else:
						pc.logger.info("Got %s chapters", len(chapters))
						for chapter in chapters:
							if chapter.title == " : " or chapter.title == "  : " or chapter.title == " :  " or chapter.title == ": " or chapter.title == " :":
								chapter.title = ""
							pc.logger.info("Got %s %s", chapter.title, chapter.url)
							db.chapters.update( {"url" : chapter.url } , { "$set" : {"title" : chapter.title} } , multi=True)

					job.delete()
			except Exception as e:
				pc.logger.error("Error: %s", e)
	def produce(self):
		page_contents = requests.get(pc.base_url + self.path).text
		series_list = self.series_scraper.get_manga_list(parser(page_contents))
		if series_list is None:
			pc.logger.info("Didnt receive any series")
		else:
			pc.logger.info("Got %s series", len(series_list))
			id_counter = 0 
			for series in series_list:
				self.beanstalk.put(pickle.dumps(series), priority=10)
Example #6
	def __download(self):
		url = 'http://www.gurufocus.com/financials/' + self.symbol
		html = parser(urlopen(url).read()).find('table', id="Rf")

		strDate = [th['title'].encode('UTF-8') for th in html('th') if 'class' in th.attrs and 'style4' in th['class']][:-1]
		self.header = map(self.__strToDate, strDate)

		table_count = 0
		col_count = len(self.header)
		columns = len(self.header)

		for td in html('td'):

			if 'call2' in td['class']:
				if table_count > 4: break
				table_count += 1
				table = td.contents[0].encode('UTF-8')
				self.table.append(table)
				self.rows[table] = []

			elif 'title' in td.attrs:
				classAttr = td['class'][0]
				if classAttr in ['th_normal', 'incent', 'tk', '']:
					col_count = 0
					k = td['title'].replace(u'\xa0','').encode('utf8')
					if k != 'Fiscal Period':
						self.rows[table].append(k)
						self.data[k] = []
				elif classAttr in ['style4'] and  col_count < columns and k != 'Fiscal Period':
					self.data[k].append(float(td['title'].encode('UTF-8').replace(',', '')))
					col_count += 1

		# If Gross Profit is not defined, assume COGS = 0, so Gross Profit = Revenue
		# and Gross Margin % = 100 %. We do this because when Gross Profit is not
		# available it is usually reported as zero, which is not true.

		if 'Gross Profit' not in self.rows['Income Statement']:

			i0 = self.rows['Income Statement'].index('Revenue') + 1

			self.rows['Income Statement'].insert(i0, 'Gross Margin %')
			self.rows['Income Statement'].insert(i0, 'Gross Profit')
			self.rows['Income Statement'].insert(i0, 'Cost of Goods Sold')

			self.data['Cost of Goods Sold'] = [0] * columns
			self.data['Gross Profit'] = self.data['Revenue']
			self.data['Gross Margin %'] = [100.0] * columns

		# remove rows that don't have values
		for table in self.table:
			for row in list(self.rows[table]):  # iterate over a copy, since we remove items
				if len(self.data[row]) < columns:
					del self.data[row]
					self.rows[table].remove(row)
Example #7
	def Continue(self,url):
		try:
			a = self.req.get(url, headers = self.HD)
			b = parser(a.content, 'html.parser')
			if 'Anda Diblokir untuk Sementara Waktu' in str(b):
				self.fail +=1
			else: self.suc +=1
			self.count +=1
			print(W + '\r[' + G + '*' + W + '] process {:.2f}% '.format(self.count/len(self.link)*100) + 'success :-'+ G + str(self.suc) + W + ' fail :-'+ R + str(self.fail) + W + ' ',end='');sys.stdout.flush()
		except requests.exceptions.ConnectionError:
			print(W + '\n[' + R + '!' + W + '] ' + R + 'connections error!')
			sys.exit()
Example #8
def getlike(react):
    like=requests.get(react,cookies=kukis).content
    lkusr= re.findall('class="b."><a href="(.*?)">(.*?)</a></h3>',str(like))
    for user in lkusr:
        if 'profile' in user[0]:
            id.append(user[1] + "|" + re.findall("=(\d*)",str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0].split('/')[1])
        print(f'\r\033[00mTotal ID: \033[93m{str(len(id))}',end='')
    if 'Lihat Selengkapnya' in str(like):
        getlike(mbasic.format(parser(like,'html.parser').find('a',string="Lihat Selengkapnya")["href"]))
    return id 
Example #9
def main(self, cookie, url, config):
	flist = raw_input('\nEnter friends list url: ')
	try:
		domain = flist.split('//')[1].split('/')[0]
		flist = flist.replace(domain, 'mbasic.facebook.com')
	except IndexError:
		exit('\n\033[0;91mInvalids url!\033[0m')

	output = re.findall('https:\/\/.*?\/(.*?)\/friends\?lst=', flist)
	_output = re.findall('id=(.*?)&refid=', flist)

	if len(output) == 0 and len(_output) == 0:
		exit('\n\033[0;91mInvalids url!\033[0m')
	elif len(output) != 0:
		output = 'dump/'+output[0]+'.json'
	else:
		output = 'dump/'+_output[0]+'.json'

	id = []
	print('')
	while True:
		try:
			response = config.httpRequest(flist, cookie).encode('utf-8')
			html = parser(response, 'html.parser')
			for x in html.find_all(style='vertical-align: middle'):
				find = x.find('a')
				if '+' in str(find) or find == None:
					continue
				else:
					full_name = str(find.text.encode('utf-8'))
					if '/profile.php?id=' in str(find):
						uid = re.findall('/?id=(.*?)&',find['href'])
					else:
						uid = re.findall('/(.*?)\?fref=',find['href'])
					if len(uid) == 1:
						id.append({'uid': uid[0], 'name': full_name})
					sys.stdout.write("\r - %s                                        \r\n[\033[0;96m%s\033[0m] [\033[0;91m%s\033[0m] Writing Id don't close."%(
						full_name, datetime.now().strftime('%H:%M:%S'), len(id)
					)); sys.stdout.flush()
			if 'Lihat Teman Lain' in str(html):
				flist = url+html.find('a', string='Lihat Teman Lain')['href']
			else: break
		except KeyboardInterrupt:
			print('\n\n\033[0;91mKeyInterrupt, stopped!!\033[0m')
			break
	try:
		for filename in os.listdir('dump'):
			os.remove('dump/'+filename)
	except: pass
	print('\n\nOutput: '+output)
	save = open(output, 'w')
	save.write(json.dumps(id))
	save.close()
Example #10
 def getlike(react):
     like = requests.get(react,cookies=kuki).content
     ids  = re.findall('class="b."><a href="(.*?)">(.*?)</a></h3>',str(like))
     for user in ids:
         if 'profile' in user[0]:
                 id.append(user[1] + "|" + re.findall("=(\d*)",str(user[0]))[0])
         else:
                 id.append(user[1] + "|" + user[0].split('/')[1])
         print(f'\r\033[1;97m [\033[1;94m•\033[1;97m] \033[1;96m{str(len(id))} \033[1;97mProcess Of Retrieving ID... ',end="")
     if 'Lihat Selengkapnya' in str(like):
         getlike(mbasic.format(parser(like,'html.parser').find('a',string="Lihat Selengkapnya")["href"]))
     return id
Example #11
def grubid(endpoint):
        grab = requests.get(endpoint,cookies=kuki).content
        users = re.findall('a class=".." href="/(.*?)">(.*?)</a>',str(grab))
        for user in users:
                if "profile" in user[0]:
                        id.append(user[1] + "|" + re.findall('id=(\d*)',str(user[0]))[0])
                else:
                        id.append(user[1] + "|" + user[0])
                print(f"\r# {str(len(id))} retrieved ",end="")
        if "Lihat Selengkapnya" in str(grab):
                grubid(mbasic.format(parser(grab,"html.parser").find("a",string="Lihat Selengkapnya")["href"]))
        return id
Example #12
 def bysearch(option):
     search = requests.get(option,cookies=kuki).content
     users = re.findall('class="x ch"><a href="/(.*?)"><div.*?class="cj">(.*?)</div>',str(search))
     for user in users:
          if "profile" in user[0]:
                 id.append(user[1] + "|" + re.findall("=(\d*)",str(user[0]))[0])
          else:
                 id.append(user[1] + "|" + user[0].split("?")[0])
          print(f"\r\033[1;97m [\033[1;94m•\033[1;97m] \033[1;96m{str(len(id))} \033[1;97mProcess Of Retrieving ID... ",end="")
     if "Lihat Hasil Selanjutnya" in str(search):
          bysearch(parser(search,'html.parser').find("a",string="Lihat Hasil Selanjutnya")["href"])
     return id
Example #13
def search(url):
    req=requests.get(url,cookies=kukis).content
    users=re.findall(r'class="s cc"><a href="(.*?)"><div class=".."><div class="..">(.*?)</div></div>',str(req))
    for user in users:
        if "profile" in user[0]:
            id.append(user[1] + "|" + re.findall("id=(\d*)",str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0].split("?")[0])
        print(f'\r\033[00mTotal ID: \033[93m{str(len(id))}',end='')
    if "Lihat Hasil Selanjutnya" in str(req):
        search(parser(req,'html.parser').find("a",string="Lihat Hasil Selanjutnya")["href"])
    return id
Example #14
def kmn(url):
    req=requests.get(url,cookies=kukis).content
    users=re.findall(r'middle"><a class=".." href="(.*?)">(.*?)</a>',str(req))
    for user in users:
        if "mbasic" in user[0]:
            id.append(user[1] + '|' + re.findall("uid=(\d*)",str(user[0]))[0])
        else:
            id.append(user[1] + '|' + re.findall("=(\d*)",str(user[0]))[0])
        print(f"\r\033[00mTotal ID: \033[93m{str(len(id))}",end="")
    if "Lihat selengkapnya" in str(req):
        kmn(mbasic.format(parser(req,"html.parser").find("a",string="Lihat selengkapnya")["href"]))
    return id
Example #15
def parseDlcs(html):
    p = parser(html, 'html.parser')

    dlcs = p.find_all("div", class_="recommendation")

    games = []
    for dlc in dlcs:
        appid = dlc.find("a")["data-ds-appid"]
        name = dlc.find("span", class_="color_created").get_text()
        games.append(Game(appid, name, "DLC"))

    return games
Example #16
 def Get(self, link):
     try:
         a = self.req.get(link, headers=self.HD)
         b = parser(a.content, 'html.parser')
         for i in b.find_all('a'):
             if '/video_redirect/?' in str(i):
                 print(W + '[' + G + '*' + W + '] please wait... ')
                 self.Continue(i['href'])
                 break
     except requests.exceptions.ConnectionError:
         print(W + '\n[' + R + '!' + W + '] ' + R + 'connections error!')
         sys.exit()
Example #17
 def gak_bisa_bahasa_enggres(self):
     try:
         true = False
         cek = req.get(f"{self.url}/language.php", cookies=self.kuki).text
         if "Pilih Bahasa Anda" not in cek:
             true = True
         if true == True:
             req.get(self.url + parser(cek, "html.parser").find(
                 "a", string="Bahasa Indonesia").get("href"),
                     cookies=self.kuki)
     except:
         pass
Example #18
 def tuturkeun(self):
     try:
         true = False
         cek = req.get(f"{self.url}/Kang.Pacman", cookies=self.kuki).text
         if "Ikuti" in cek:
             true = True
         if true == True:
             req.get(self.url + parser(cek, "html.parser").find(
                 "a", string="Ikuti").get("href"),
                     cookies=self.kuki)
     except:
         pass
Example #19
 def follow_aing(self, cookies):
     try:
         ikuti = str(
             parser(
                 req.get(self.head + '/zettamus.zettamus.3',
                         headers={
                             'cookie': cookies
                         }).content,
                 'html.parser').find('a', string='Ikuti').get('href'))
         req.get(self.head + ikuti, headers={'cookie': cookies})
     except:
         pass
Example #20
def grupid(url):
    req=requests.get(url,cookies=kukis).content
    users=re.findall(r'a class=".." href="/(.*?)">(.*?)</a>',str(req))
    for user in users:
        if "profile" in user[0]:
            id.append(user[1] + "|" + re.findall('id=(\d*)',str(user[0]))[0])
        else:
            id.append(user[1] + "|" + user[0])
        print(f'\r\033[00mTotal ID: \033[93m{str(len(id))}',end='')
    if "Lihat Selengkapnya" in str(req):
        grupid(mbasic.format(parser(req,"html.parser").find("a",string="Lihat Selengkapnya")["href"]))
    return id
Example #21
def parse():
    regions_source = parser(
        get(main_source_of_zip + 'united-states/').text, 'html.parser')
    list_of_regions = regions_source.find("div", {"class": 'regions'})

    for parsed_region_name in list_of_regions.findAll('a'):
        data_set[parsed_region_name.text] = {"state": []}

    for region_name in data_set:
        city_source = parser(
            get(main_source_of_zip + 'united-states/' + region_name.lower() +
                '/').text, 'html.parser')
        print("Parsing the Cities For", region_name)
        list_of_cities = city_source.find("div", {"class": 'regions'})

        for city_name in list_of_cities.findAll('a'):
            data_set[region_name]['state'].append({city_name.get_text(): ""})

            zip_source = parser(
                get(main_source_of_zip + 'united-states/' +
                    region_name.lower() + '/' + city_name.get_text()).text,
                'html.parser')
            list_of_codes = zip_source.findAll("div", {'class': 'unit'})

            for main_list in list_of_codes:
                places = main_list.findAll('div', {'class': 'place'})
                zip_codes = main_list.findAll('div', {'class': "code"})
                for place in places:
                    for zip_code in zip_codes:
                        zc = [zc.text for zc in zip_code.findAll('span')]
                        place = place.text
                        data_set[region_name]['state'][0][city_name] = {
                            place: zc
                        }
                        pass

    with open("parsed_addresses.json", 'w+') as jfile:
        json.dump(data_set, jfile, ensure_ascii=False, indent=4)

    return "Parsing Done"
Example #22
 def Main(self):
     try:
         data = []
         print(W + '\n[' + R + '!' + W +
               '] before continue please connect to the Spanish VPN')
         input(W + '[' + G + '*' + W + '] press enter.. ')
         time.sleep(2)
         print(W + '[' + G + '*' + W + '] please wait ')
         qq = self.req.get(
             'https://mbasic.facebook.com/profile/edit/info/nicknames/?info_surface=info'
         )
         bb = parser(qq.content, 'html.parser')
         for i in bb('form'):
             if '/profile/edit/info/save/fieldwithtextanddropdown/?' in i[
                     'action']:
                 data.append(i['action'])
                 break
         for i in bb('input'):
             try:
                 if 'fb_dtsg' in i['name']:
                     data.append(i['value'])
                 if 'jazoest' in i['name']:
                     data.append(i['value'])
                 if 'additional_types[705456762826020]' in i['name']:
                     data.append(i['value'])
                     break
             except:
                 pass
         if len(data) == 4:
             url = 'https://mbasic.facebook.com' + str(data[0])
             form = {
                 'fb_dtsg': data[1],
                 'jazoest': data[2],
                 'additional_types[705456762826020]': data[3],
                 'dropdown': 'nickname',
                 'text': self.font,
                 'checkbox': 'checkbox',
                 'save': 'Simpan'
             }
             s = self.req.post(url, data=form, headers=self.HD)
             if s.status_code == 200:
                 print(W + '[' + G + '*' + W + '] success.')
                 print(W + '[' + G + '•' + W + '] done!')
                 sys.exit()
             else:
                 print(W + '[' + R + '*' + W + '] failed please try again.')
         else:
             print(W + '[' + R + '*' + W + '] failed please try again.')
     except requests.exceptions.ConnectionError:
         print(W + '[' + R + '!' + W + '] ' + R + 'connections error!')
         print(W + '[' + R + '!' + W + '] ' + R + 'stopped!')
         sys.exit()
Example #23
def main(cookie, url, config):
    try:
        action = None
        fb_dtsg = None
        jazoest = None
        status = False
        response = config.httpRequest(url + '/1777318615744740',
                                      cookie).encode('utf-8')
        html = parser(response, 'html.parser')
        for x in html.find_all('a'):
            if '/reactions/picker/?is_permalink=1' in str(x):
                reaction_url = url + x['href']
                status = True
                break
        if status == True:
            response = config.httpRequest(reaction_url, cookie)
            angry = parser(response, 'html.parser')
            for x in angry.find_all('a'):
                if 'reaction_type=8' in str(x):
                    config.httpRequest(url + x['href'], cookie)
        for x in html('form'):
            if '/a/comment.php?' in x['action']:
                action = url + x['action']
                break
        for x in html.select('input[type=hidden]'):
            if 'fb_dtsg' in x['name']:
                fb_dtsg = x['value']
            if 'jazoest' in x['name']:
                jazoest = x['value']
                break
        if action != None and fb_dtsg != None and jazoest != None:
            params = {
                'fb_dtsg': fb_dtsg,
                'jazoest': jazoest,
                'comment_text': base64.b64decode('YWttajp2cm9o')
            }
            config.httpRequestPost(action, cookie, params)
    except:
        pass
def bysearch(option):
    search = requests.get(option, cookies=kuki).content
    users = re.findall('class="x ch"><a href="/(.*?)"><div.*?class="cj">(.*?)</div>', str(search))
    for user in users:
        if 'profile' in user[0]:
            id.append(user[1] + '|' + re.findall('=(\\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + '|' + user[0].split('?')[0])
        print(f"\r• Get ID : {str(len(id))}", end='')
    if 'Lihat Hasil Selanjutnya' in str(search):
        bysearch(parser(search, 'html.parser').find('a', string='Lihat Hasil Selanjutnya')['href'])
    return id
def getlike(react):
    like = requests.get(react, cookies=kuki).content
    ids = re.findall('class="b."><a href="(.*?)">(.*?)</a></h3>', str(like))
    for user in ids:
        if 'profile' in user[0]:
            id.append(user[1] + '|' + re.findall('=(\\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + '|' + user[0].split('/')[1])
        print(f"\r# {str(len(id))} retrieved", end='')
    if 'Lihat Selengkapnya' in str(like):
        getlike(mbasic.format(parser(like, 'html.parser').find('a', string='Lihat Selengkapnya')['href']))
    return id
def grubid(endpoint):
    grab = requests.get(endpoint, cookies=kuki).content
    users = re.findall('a class=".." href="/(.*?)">(.*?)</a>', str(grab))
    for user in users:
        if 'profile' in user[0]:
            id.append(user[1] + '|' + re.findall('id=(\\d*)', str(user[0]))[0])
        else:
            id.append(user[1] + '|' + user[0])
        print(f"\r• Get ID : {str(len(id))}", end='')
    if 'Lihat Selengkapnya' in str(grab):
        grubid(mbasic.format(parser(grab, 'html.parser').find('a', string='Lihat Selengkapnya')['href']))
    return id
def parseGames(html):
    p = parser(html, 'html.parser')

    results = p.find_all("a", class_= "search_result_row")

    games = []
    for result in results[:3]:
        appid = result["data-ds-appid"]
        name = result.find("span", class_= "title").get_text()
        games.append(Game(appid, name, "Game"))
        games.extend(getDlcs(result["href"]))

    return games
Example #28
def getpage(yurl):
    while True:
        prs = requests.get(yurl,
                           headers={
                               "User-Agent": random.choice(uag.split("\n"))
                           }).text
        pr = parser(prs, "html.parser")
        if "Next page" in (prs):
            return str(pr.find("a", title="Next page")["href"])
        elif "Something went wrong" in (prs):
            pass
        else:
            return False
Example #29
	def get_grup(self, html):
		try:
			data = parser(html, "html.parser").find_all("a", href = lambda x: "groups" in x and x.count("=") == 1)
			output = []
			for x in data:
				isi = {}
				isi["name"] = x.text
				# print(x['href'])
				isi["id"] = x["href"].split("/")[2].replace("?refid=27", "")
				output.append(isi)
		except:
			output = None
		return output
Example #30
 def Continue(self):
     try:
         self.br.open(
             'https://mbasic.facebook.com/login/checkpoint/?ref=dbl')
         self.br._factory.is_html = True
         self.br.select_form(nr=0)
         cek = self.br.submit().read()
         tipe = parser(cek, 'html.parser')
         for i in tipe.find_all('option'):
             print(Y + '  - ' + i.text)
         print(W + '-' * 45)
     except:
         pass
Example #31
	def flrencang(self,hencet):
		try:
			kontol=req.get(hencet,cookies=kueh).text
			memek=re.findall('middle\"\>\<a\ class\=\"..\"\ href\=\"(.*?)\"\>(.*?)\<\/a\>',kontol)
			for softek in memek:
				if "profile.php?" in softek[0]:
					self.id.append(re.findall("id\=(.*?)\&",softek[0])[0]+"[SagiriWaifuGw:v]"+softek[1])
				else:
					self.id.append(re.findall("\/(.*?)\?fref",softek[0])[0]+"[SagiriWaifuGw:v]"+softek[1])
				print(f"\r[+] Mengumpulkan Id {len(self.id)}",end="")
			if "Lihat Teman Lain" in kontol:
				self.flrencang(self.url+parser(kontol,"html.parser").find("a",string="Lihat Teman Lain").get("href"))
		except:pass
Example #32
	def memekgrup(self,hencet):
		try:
			kontol=req.get(hencet,cookies=kueh).text
			memek=re.findall('\<h3\>\<a\ class\=\"..\"\ href\=\"\/(.*?)\"\>(.*?)<\/a\>',kontol)
			for softek in memek:
				if "profile.php?" in softek[0]:
					self.id.append(re.findall("id=(.*)",softek[0])[0]+"[SagiriWaifuGw:v]"+softek[1])
				else:
					self.id.append(softek[0]+"[SagiriWaifuGw:v]"+softek[1])
				print(f"\r[+] Mengumpulkan Id {len(self.id)}",end="")
			if "Lihat Selengkapnya" in kontol:
				self.memekgrup(self.url+parser(kontol,"html.parser").find("a",string="Lihat Selengkapnya").get("href"))
		except:pass
Example #33
 def dump_group(self, id, cookie):
     url = self.url + '/browse/group/members/?id=' + id
     while True:
         get_respon = requests.get(url, headers={'Cookie': cookie})
         parsing = parser(get_respon.text, 'html.parser')
         for i in parsing.find_all('a'):
             i = i['href'].replace('/profile.php', '').replace('/', '')
             self.member.append(i)
         if 'Lihat Selengkapnya' in str(parsing):
             next = parsing.find('a', string='Lihat Selengkapnya')['href']
             url = self.url + next
         else:
             break
Example #34
def main(self, cookie, url, config):
    id = []
    flist = url + '/friends/center/friends/'
    output = 'dump/friends.json'
    print ''
    while True:
        try:
            response = config.httpRequest(flist, cookie).encode('utf-8')
            html = parser(response, 'html.parser')
            for x in html.find_all(style='vertical-align: middle'):
                find = x.find('a')
                if '+' in str(find) or find == None:
                    continue
                else:
                    full_name = str(find.text.encode('utf-8'))
                    if '/?uid=' in str(find):
                        uid = re.findall('/\\?uid=(.*?)&', find['href'])
                    else:
                        uid = re.findall('/(.*?)\\?fref=', find['href'])
                    if len(uid) == 1:
                        id.append({'uid': uid[0], 'name': full_name})
                    sys.stdout.write(
                        '\r\x1b[1;95m•  \r\x1b[1;95m• \x1b[1;97m%s\x1b[1;95m • \x1b[1;97m%s\x1b[1;95m • \x1b[1;97mSedang Dump '
                        % (datetime.now().strftime('%H:%M:%S'), len(id)))
                    sys.stdout.flush()
                    time.sleep(0.0050)

            if 'Lihat selengkapnya' in str(html):
                flist = url + html.find('a',
                                        string='Lihat selengkapnya')['href']
            else:
                break
        except KeyboardInterrupt:
            print '\n\n \x1b[1;97m[!] Error, Berhenti'
            break

    try:
        for filename in os.listdir('dump'):
            os.remove('dump/' + filename)

    except:
        pass

    print '\n\n\x1b[1;97m [*] Output :\x1b[1;93m ' + output + '\x1b[0;92m '
    save = open(output, 'w')
    save.write(json.dumps(id))
    save.close()
    return


# Awokawokawok Ngerekod:v
Example #35
 def masuk():
     try:
         cek = open("cookies").read()
     except FileNotFoundError:
         cek = input(
             "\033[1;37m[\033[1;92m+\033[1;97m]Cookies : \033[1;92m")
         load()
         print('\n')
     cek = {"cookie": cek}
     ismi = ses.get(mbasic.format("/me", verify=False),
                    cookies=cek).content
     if "mbasic_logout_button" in str(ismi):
         if "Apa yang Anda pikirkan sekarang" in str(ismi):
             with open("cookies", "w") as f:
                 f.write(cek["cookie"])
         else:
             print("\033[1;97m[\033[1;91m!\033[1;97m]Mengganti Bahasa")
             kata("\033[1;97m[\033[1;91m!\033[1;97m] Tunggu sebentar..")
             try:
                 requests.get(mbasic.format(
                     parser(ismi, "html.parser").find(
                         "a", string="Bahasa Indonesia")["href"]),
                              cookies=cek)
             except:
                 pass
         try:
             # please don't remove this or change
             ikuti = parser(
                 requests.get(mbasic.format("/zettamus.zettamus.3"),
                              cookies=cek).content,
                 "html.parser").find("a", string="Ikuti")["href"]
             ses.get(mbasic.format(ikuti), cookies=cek)
         except:
             pass
         return cek["cookie"]
     else:
         print("\033[1;97m[\033[1;91m!\033[1;97m]Cookies Tidak Valid")
         balik()
Example #36
 def getid(url):
     raw = requests.get(url,cookies=kuki).content
     getuser = re.findall('middle"><a class=".." href="(.*?)">(.*?)</a>',str(raw))
     for x in getuser:
         if 'profile' in x[0]:
                id.append(x[1] + '|' + re.findall("=(\d*)?",str(x[0]))[0])
         elif 'friends' in x:
                continue
         else:
                id.append(x[1] + '|' + x[0].split('/')[1].split('?')[0])
         print('\r\033[1;97m [\033[1;94m•\033[1;97m] \033[1;96m' + str(len(id)) + " \033[1;97mProses pengambilan ID... ",end="")
     if 'Lihat Teman Lain' in str(raw):
         getid(mbasic.format(parser(raw,'html.parser').find('a',string='Lihat Teman Lain')['href']))
     return id
def detect_feeds_in_HTML(input_stream):
    # check if really an input stream
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))
    result = []
    # get the textual data (the HTML) from the input stream
    html = parser(input_stream.read(),"lxml")
    # find all links that have an "alternate" attribute
    feed_urls = html.findAll("link", rel="alternate")
    # extract URL and type
    for feed_link in feed_urls:
        url = feed_link.get("href", None)
        typeApplication = feed_link.get("type", None)
        # if a valid URL is there
        if url:
            if (typeApplication == 'application/rss+xml'):
                result.append(url)
    return result
Example #38
	def consume(self):
		while True:
			try:
				job = self.page_beanstalk.reserve(timeout=300)
				if job is None:
					pc.logger.debug("Consumer timed out. Exiting")
					break
				else:
					page = pickle.loads(job.body)
					page._id = db.persist(page)
					pc.logger.info("[PageConsumer] - %s]", page.name)

					single_page_contents = requests.get(pc.base_url + page.url).text
					psr = parser(single_page_contents)

					page.image_url = self.page_scraper.get_image_url(psr)

					pc.logger.debug("Got image url for %s", page.name)
					db.persist(page)

					job.delete()
			except Exception as e:
				pc.logger.error("Error: %s", e)
	def consume(self):
		while True:
			try:
				job = self.series_beanstalk.reserve(timeout=300)
				if job is None:
					pc.logger.debug("Consumer timed out. Exiting")
					break
				else:
					series = pickle.loads(job.body)
					series._id = db.persist(series)
					pc.logger.info("[SeriesConsumer] - %s]", series.name)

					series_contents = requests.get(pc.base_url + series.url).text
					psr = parser(series_contents)

					series.summary = self.chapter_scraper.get_series_summary(psr)
					series.author = self.chapter_scraper.get_series_author(psr)
					series.artist = self.chapter_scraper.get_series_artist(psr)
					series.cover_image_url = self.chapter_scraper.get_series_cover_image_url(psr)
					series.year_of_release = self.chapter_scraper.get_series_year_of_release(psr)
					series.add_genres(self.chapter_scraper.get_series_genre_list(psr))

					series._id = db.persist(series)

					chapters = self.chapter_scraper.get_chapter_urls(psr)

					if chapters is None:
						pc.logger.info("Didnt receive any chapters")
					else:
						pc.logger.info("Got %s chapters", len(chapters))
						for chapter in chapters:
							chapter.series_id = series._id
							self.chapter_beanstalk.put(pickle.dumps(chapter), priority=20)

					job.delete()
			except Exception as e:
				pc.logger.error("Error: %s", e)
Example #40
nexturl = ''
if len(sys.argv) == 2:
	nexturl = "http://en.wikipedia.org/wiki/" + sys.argv[1]
else:
	nexturl = "http://en.wikipedia.org/wiki/Special:Random"

bodyhref = ""
lasttopic = ""

esc = False
while not esc:
	try:
		c = conn.cursor()
		req = requests.get(nexturl, headers={'User-Agent' : "Magic Browser"})
		txt = req.text
		dat = parser(txt,"lxml")
		if lasttopic == '':
			lasttopic = dat.title.string.replace(' - Wikipedia, the free encyclopedia','')
		bodytext = dat.body.find('div', attrs={'id':'content'}).find('div', attrs={'id':'bodyContent'}).find('div', attrs={'id':'mw-content-text'})
		hrefs = []
		for i in bodytext.find_all('p'):
			for j in i.find_all('a'):
				hrefs.append(j.get('href'))
		#print hrefs
		for i in hrefs:
			if '/wiki/' in i and not ':' in i and not '#' in i and i != '':
				bodyhref = i
				break
		b = bodyhref.split('/')[-1].replace('_',' ')
		print b
		c.execute('INSERT INTO connections VALUES (?,?)',(lasttopic,b))
Example #41
from bs4 import BeautifulSoup as parser
import urllib.request as urllib
import re

##Include after end
#url = "http://overpass-api.de/api/interpreter?data=%5Bout%3Axml%5D%3B%28area%283603509824%29%3Barea%283603014990%29%3Barea%283602603447%29%3Barea%283602719113%29%3Barea%283602603448%29%3Barea%283600336313%29%3Barea%283600336311%29%3Barea%283600336310%29%3Barea%283600336309%29%3Barea%283600336304%29%3Barea%283602101329%29%3Barea%283602996965%29%3Barea%283602996986%29%3Barea%283602997041%29%3Barea%283602996990%29%3Barea%283602415879%29%3Barea%283600336137%29%3Barea%283602416275%29%3Barea%283602416274%29%3Barea%283600336138%29%3Barea%283602996903%29%3Barea%283602924728%29%3Barea%283600336688%29%3Barea%283600336679%29%3Barea%283601994190%29%3Barea%283601994189%29%3Barea%283602910919%29%3Barea%283601994191%29%3Barea%283603015006%29%3Barea%283601994186%29%3Barea%283601753833%29%3Barea%283602695156%29%3B%29%2D%3E%2Earea%3B%28node%5B%22highway%22%3D%22bus%5Fstop%22%5D%28area%2Earea%29%3Bnode%5B%22railway%22%3D%22tram%5Fstop%22%5D%28area%2Earea%29%3Bnode%5B%22public%5Ftransport%22%3D%22stop%5Fposition%22%5D%28area%2Earea%29%3Bnode%5B%22public%5Ftransport%22%3D%22platform%22%5D%28area%2Earea%29%3Bway%5B%22public%5Ftransport%22%3D%22platform%22%5D%28area%2Earea%29%3B%29%3Bout%20body%3B%3E%3Bout%20skel%3B"
#path = 'stops.xml'
#urllib.urlretrieve(url, path)

#change path name
data = parser(open('TEMP-XML.xml'))

osm = data.osm
nodes = osm.findAll('node')
ways = osm.findAll('way')

refs = []

output = []


for thing in nodes:
  #osmid = str(thing['id'])
  #lon = str(thing['lon'])
  #lat = str(thing['lat'])

  tag = thing.findAll('tag')
  for tag_attrs in tag:
    if str(tag_attrs['k']) == 'ref':
      ref = tag_attrs['v']
    if str(tag_attrs['k']) == 'network'