def test_divine_interp_tag(self):
    # HD1 elements
    HD = bS('<HD1>Introduction\n</HD1>', 'lxml-xml').find('HD1')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'intro')
    HD = bS('<HD1>Appendix X - X-rays\n</HD1>', 'lxml-xml').find('HD1')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'appendix')
    HD = bS('<HD1>Appendices G & H - Cane\n</HD1>', 'lxml-xml').find('HD1')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'appendices')
    HD = bS('<HD1>Section 1002.4 - Known\n</HD1>', 'lxml-xml').find('HD1')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'section')
    HD = bS('<HD1>Inevitable Random HD1\n</HD1>', 'lxml-xml').find('HD1')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), '')
    # HD2 elements
    HD = bS('<HD2>Section 1002.4 - Known\n</HD2>', 'lxml-xml').find('HD2')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'section')
    HD = bS('<HD2>2(b) Application\n</HD2>', 'lxml-xml').find('HD2')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'graph_id')
    # HD3 elements
    HD = bS('<HD3>Section 1002.4 - Known\n</HD3>', 'lxml-xml').find('HD3')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'section')
    HD = bS('<HD3>2(b) Application\n</HD3>', 'lxml-xml').find('HD3')
    self.assertEqual(divine_interp_tag_use(HD, '1002'), 'graph_id')
    HD = bS('<HD3>(b) Application\n</HD3>', 'lxml-xml').find('HD3')
    self.assertEqual(
        divine_interp_tag_use(HD, '1030'), 'graph_id_inferred_section')

def test_parse_interp_graph_reference(self):
    valid_graph_element = bS("<HD3>Paragraph 2(c)(1)</HD3>", 'lxml-xml')
    self.assertEqual(
        parse_interp_graph_reference(valid_graph_element, '1002', '2'),
        "2-c-1-Interp")
    invalid_graph_element = bS("<HD3>Paragraph X(5)(a)</HD3>", 'lxml-xml')
    self.assertEqual(
        parse_interp_graph_reference(invalid_graph_element, '1002', '2'),
        "")
    valid_inferred_section_graph_element = bS(
        "<HD3>Paragraph (c)(1)</HD3>", 'lxml-xml')
    self.assertEqual(
        parse_interp_graph_reference(
            valid_inferred_section_graph_element, '1030', '2'),
        "2-c-1-Interp")

def test_parse_interp_graph_reference(self):
    valid_graph_element = bS("<HD3>Paragraph 2(c)(1)</HD3>", 'lxml-xml')
    self.assertEqual(ecfr_importer.parse_interp_graph_reference(
        valid_graph_element, '1002', '2'), "2-c-1-Interp")
    invalid_graph_element = bS("<HD3>Paragraph X(5)(a)</HD3>", 'lxml-xml')
    self.assertEqual(ecfr_importer.parse_interp_graph_reference(
        invalid_graph_element, '1002', '2'), "")
    valid_inferred_section_graph_element = bS(
        "<HD3>Paragraph (c)(1)</HD3>", 'lxml-xml')
    self.assertEqual(
        ecfr_importer.parse_interp_graph_reference(
            valid_inferred_section_graph_element, '1030', '2'),
        "2-c-1-Interp")

def index():
    if request.method == 'POST':
        try:
            searchstring = request.form['content'].replace(" ", "%20")
            flipkart_url = "https://www.flipkart.com/search?q=" + searchstring
            uclient = uReq(flipkart_url)
            flipkartpage = uclient.read()
            uclient.close()
            flipkart_html = bS(flipkartpage, "html.parser")
            bigboxes = flipkart_html.findAll("div", {"class": "bhgxx2 col-12-12"})
            del bigboxes[0:3]
            del bigboxes[-5:]
            alldata = {}
            for boxes in bigboxes:
                productname = boxes.find("div", {"class": "_3wU53n"})
                productLinks = "https://www.flipkart.com" + boxes.div.div.div.a['href']
                alldata[productname.text] = productLinks
            with open('url.txt', 'w') as file:
                # dumping the url dict in a txt file
                file.write(json.dumps(alldata))
            return render_template("links.html", alldata=alldata)
        except Exception as e:
            print('The Exception message is: ', e)
            return 'something is wrong'
        return render_template("links.html")
    else:
        return render_template("index.html")

def get_url_content(self, news_source):
    """
    Returns the content of the called RSS feed.

    We might need this part in order to verify the validity of the request:
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        headers = {'User-Agent': user_agent}
    But I am not sure how to incorporate this into the feedparser.

    :param news_source: nos, rtl or nu
    :return: Beautiful Soup text that contains all the content being returned.
    """
    import feedparser  # Useful to parse RSS feeds
    from helper.miscfunction import dict_find
    from bs4 import BeautifulSoup as bS

    url_label = self.get_label(news_source)
    url_list = self.get_url_feed(news_source)
    rss_text = []
    for i_label, i_url in zip(url_label, url_list):
        url_content = feedparser.parse(i_url)
        rss_html_text = list(dict_find('value', url_content))
        for i_html in rss_html_text:
            process_html = bS(i_html, 'lxml').text.replace('\n', ' ')
            rss_text.append((news_source + '-' + i_label, process_html))
    return rss_text

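# The docstring above notes uncertainty about how to pass a User-Agent header
# to feedparser. A minimal, hypothetical sketch (not part of the original
# class), assuming feedparser's documented `agent` keyword argument, which
# sets the User-Agent that feedparser sends when it fetches the URL itself:
import feedparser

user_agent = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) '
              'Gecko/2009021910 Firefox/3.0.7')
feed = feedparser.parse('https://example.com/rss.xml', agent=user_agent)
# feedparser also accepts a `request_headers` dict for additional headers.
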
def getresponse(self):
    self.response = requests.get(self.translatelink,
                                 headers={'User-Agent': 'Mozilla/5.0'})
    soup = bS(self.response.text, 'html.parser')
    b = []
    # This piece looks for the first 2 translations:
    # This next piece looks for remaining translations in <a> tag:
    a = soup.find_all('a', class_='translation')
    for i in a:
        if i.text:
            word = i.text.replace("\n", "")
            word = word.lstrip()
            if word != 'Translation':
                b.append(word)
    # And this piece searches for remaining translations in <div> tag:
    a = soup.find_all('div', class_='translation')
    for i in a:
        if i.text:
            word = i.text.replace("\n", "")
            word = word.lstrip()
            b.append(word)
    self.translations = b
    # This piece searches all the translations and examples:
    examp = soup.find_all("div", {"class": {"src ltr", "trg ltr"}})
    res = []
    for i in examp:
        a = i.text.strip()
        res.append(a)
    self.examples = res

def grabcharsfromfile(htmlfile: str, cur_file: int) -> List[Character]:
    charfile = open(htmlfile)
    char_bs = bS(charfile, "lxml")
    charfile.close()
    chartable = char_bs.table  # All characters exist in the same table
    characterblocks = chartable.findAll("p")
    numchars = len(characterblocks) // 4
    curspot = 0
    characters = []
    for x in range(0, numchars):
        mark = characterblocks[curspot].contents[1].string
        curspot += 1
        charauthorblock = characterblocks[curspot].string
        if len(charauthorblock) > 4:
            # we have an author
            author = charauthorblock[4::]
        else:
            author = ""
        curspot += 1
        charworkblock = characterblocks[curspot].string
        if len(charworkblock) > 3:
            work = charworkblock[3::]
        else:
            work = ""
        curspot += 1
        characterblocks[curspot].unwrap()
        filepathname = characterblocks[curspot].attrs['id']
        work_id = filepathname[:8]
        page_id = filepathname[18:26]
        coords = splitcoordinates(filepathname[18:])
        curspot += 1
        characters.append(Character(mark, author, work, work_id, page_id, coords))
    # Return the list itself (the annotation promises List[Character]);
    # a trailing comma here would wrap it in a tuple.
    return characters

def parse(self):
    for i in range(len(self.source)):
        soup = bS(self.source[i], features='lxml')
        content = soup.find_all("li", class_="")
        ref_ = [
            c.get('data-id') for c in content
            if c.get('data-id') is not None
        ]
        for j in range(len(ref_)):
            data = soup.find(attrs={'data-id': '{}'.format(ref_[j])})
            ref = ref_[j]
            topic_ = data.find("a", class_="lien-jv topic-title")
            topic = topic_.text.strip()
            t_link = os.environ.get('BASE_') + topic_.get('href')
            count = data.find("span", class_="topic-count").text.strip()
            time = data.find("span", class_="topic-date").text.strip()
            today = date.today()
            if hour.match(time):
                date_ = str(today) + 'T' + str(time)
            else:
                date_ = parse(time)
            topics = {'topic': topic,
                      'topic_link': t_link,
                      'ref': ref,
                      'keywords': self.search,
                      'post_count': int(count),
                      'indexed': 0,
                      'last_msg_date': date_,
                      'collection_date': datetime.now()}
            ES_.index(index=os.environ.get('INDEX_'), id=ref, body=topics)
            self.counter.append(1)
        self.refs.extend(ref_)
    CL_.close()
    self.update_index()

def get_price_from_flip_kart(link, class_name):
    page = urllib.request.urlopen(link)
    soup = bS(page, features="html.parser")
    value = soup.find('div', class_=class_name).string
    flip_kart_price = int(value[1::])
    return flip_kart_price

def fetch(this):
    articles = []
    while this.url != False:
        response = requests.get(this.url)
        data = bS(response.text, "html.parser")
        cards = data.select(".card")
        for card in cards:
            emoji = card.select_one(".emoji").text
            textCard = card.select_one(".card-text").text
            headerText = card.select("span")[1].text
            img = urljoin(this.url, card.select_one("img").attrs['src'])
            site = this.url
            newArticle = Article(headerText, emoji, textCard, img, site)
            articles.append(newArticle)
        # End for card in cards
        nextSiteExists = data.select_one(".navigation a")
        if nextSiteExists != None:
            nextPage = urljoin(this.url, nextSiteExists.attrs['href'])
            this.url = nextPage
        else:
            this.url = False
    return articles

async def get_election_offices():
    """Starting point of the scraper program.

    Scrapes BASE_URL for election office information and both dumps
    results to a .json file and returns the results as json.

    @return: list of scraped results as json.
    """
    # Get list of county names from registrar to populate form
    # Define coroutine functions (context managers)
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # ClientResponse.read() is a coroutine function so it must be awaited
            text = await s.read()
        soup = bS(text, "html5lib")
        info_list = soup.findAll("area")
        counties = [info['alt'] for info in info_list]
        county_urls = [info['href'] for info in info_list]

        # Use list of counties and IDs to get county info for each county
        tasks: List[Task] = []
        num_scraped = 0
        master_list = []
        for i in range(len(counties)):
            # Create task for a future asynchronous operation and store it in task list
            tasks.append(
                asyncio.create_task(
                    scrape_one_county(session, counties[i], county_urls[i])))

        # Run the coroutines and iterate over the yielded results as they complete
        # (out-of-order). Use asyncio.gather() with a couple code modifications to
        # preserve list order
        future: Future[Tuple[str, str, str, str, str]]
        for future in asyncio.as_completed(tasks):
            # Unpack awaited result of scrape_one_county()
            (
                address,
                county_website,
                phone_number,
                email_address,
                county_name,
            ) = await future
            schema = format_data_into_schema(
                address,
                county_website,
                phone_number,
                email_address,
                county_name,
            )
            master_list.append(schema)
            num_scraped += 1
            print(f"[New York] Scraped {county_name} county: "
                  f"#{num_scraped} of {len(counties)} .... "
                  f"[{round((num_scraped / len(counties)) * 100, 2)}%]")

    master_list = sorted(master_list, key=lambda county: county['countyName'])
    with open(os.path.join(ROOT_DIR, "scrapers", "new_york", "new_york.json"),
              "w") as f:
        json.dump(master_list, f)
    return master_list

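# The comment in get_election_offices() above mentions asyncio.gather() as the
# alternative to asyncio.as_completed() when results should keep task order.
# A minimal sketch of that variant, using a hypothetical fetch_county()
# coroutine instead of the real scrape_one_county():
import asyncio

async def fetch_county(name):
    await asyncio.sleep(0)  # stands in for the real network request
    return f"{name} office data"

async def gather_in_order(county_names):
    tasks = [asyncio.create_task(fetch_county(name)) for name in county_names]
    # gather() awaits every task and returns results in the same order as `tasks`
    return await asyncio.gather(*tasks)

# asyncio.run(gather_in_order(["Albany", "Kings", "Erie"]))
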
def test_parse_appendix_elements_with_interp_ref(self):
    from regulations3k.scripts.ecfr_importer import PAYLOAD
    PAYLOAD.interp_refs.update(
        {'1002-A': {'1': 'see(1002-A-1-Interp)'}})
    p_soup = bS(self.test_xml, 'lxml-xml')
    appendix = p_soup.find('DIV5').find('DIV9')
    parsed_appendix = parse_appendix_elements(appendix, '1002-A')
    self.assertIn("see(1002-A-1-Interp)", parsed_appendix)

def test_set_table(self):
    PAYLOAD.reset()
    p_soup = bS(self.test_xml, 'lxml-xml')
    appendix = p_soup.find('DIV5').find('DIV9')
    table = appendix.find('TABLE')
    table_id = 'table-A-0'
    ecfr_importer.set_table(table, table_id)
    self.assertIn(table_id, PAYLOAD.tables.keys())
    self.assertTrue(isinstance(PAYLOAD.tables[table_id], RegTable))

def list_sub_ids(start):
    url = 'https://movie.douban.com/top250'
    params = {"start": start}
    response = requests.get(url, params=params, headers=headers)
    assert response.status_code == 200
    soup = bS(response.text, 'html.parser')
    items = soup.find_all('div', 'item')
    return list(map(get_id_from_item, items))

def test_parse_appendix_elements_with_interp_ref(self):
    PAYLOAD.interp_refs.update({'A': {'1': 'see(A-1-Interp)'}})
    p_soup = bS(self.test_xml, 'lxml-xml')
    appendix = p_soup.find('DIV5').find('DIV9')
    parsed_appendix = ecfr_importer.parse_appendix_elements(appendix, 'A')
    self.assertIn("{1}", parsed_appendix)
    self.assertIn("(print or type):__", parsed_appendix)
    self.assertIn("<table>", parsed_appendix)
    self.assertIn("![image-A-1]", parsed_appendix)
    self.assertIn("{table-A-0}", PAYLOAD.tables.keys())

async def get_election_offices():
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as r:
            text = await r.read()
    soup = bS(text.decode("utf-8"), "html.parser")
    elems = soup.find_all("td")
    master_list = []
    for e in elems:
        text = [i.strip() for i in e.get_text('\n').split('\n') if i.strip()]
        if not text:
            continue
        county = text[0]
        clerk = text[1].split(":")[-1].strip()
        email = text[2] if county != "Daggett" else "*****@*****.**"
        street_number_name = text[3] if 'UT' in text[4] else f"{text[3]}, {text[4]}"
        city = text[-3].split(",")[0]
        zip_code = text[-3].split()[-1]
        phone = text[-2].split(":")[-1].strip()
        subschema = format_address_data(street_number_name, county)
        schema = {
            "countyName": county,
            "physicalAddress": {
                "city": city,
                "state": "Utah",
                "zipCode": zip_code,
                "locationName": subschema["locationName"],
            },
            "phone": phone,
            "email": email,
            "officeSupervisor": clerk,
            "website": URL,
        }
        if "poBox" in subschema:
            schema["physicalAddress"]["poBox"] = subschema["poBox"]
        if "aptNumber" in subschema:
            schema["physicalAddress"]["aptNumber"] = subschema["aptNumber"]
        if "streetNumberName" in subschema:
            schema["physicalAddress"]["streetNumberName"] = subschema[
                "streetNumberName"]
        master_list.append(schema)
    with open(os.path.join(ROOT_DIR, "scrapers", "utah", "utah.json"), "w") as f:
        json.dump(master_list, f)
    return master_list

def ecfr_to_regdown(part_number, file_path=None):
    """
    Extract a regulation Part from eCFR XML, and create regdown content.

    The default XML source is the latest regulation posting at www.gpo.gov,
    which gets updated every few days.

    If `file_path` is specified, a local XML file is parsed instead.

    DIV1 is a title (as in Title 12)
    DIV3 is a chapter (not used here)
    DIV5 is a part
    DIV6 is a subpart
    DIV8 is a section
    DIV9 is an appendix
    DIV9 element whose HEAD starts with 'Supplement I' is an interpretation

    To avoid mischief, we make sure the part number is on a whitelist.
    """
    PAYLOAD.reset()
    if part_number not in PART_ALLOWLIST:
        raise ValueError("Provided Part number is not a CFPB regulation.")
    starter = datetime.datetime.now()
    if file_path:
        try:
            with open(file_path, "r") as f:
                markup = f.read()
        except IOError:
            logger.info("Could not open local file {}".format(file_path))
            return
    else:
        ecfr_request = requests.get(LATEST_ECFR)
        if not ecfr_request.ok:
            logger.info(
                "ECFR request failed with code {} and reason {}".format(
                    ecfr_request.status_code, ecfr_request.reason
                )
            )
            return
        ecfr_request.encoding = "utf-8"
        markup = ecfr_request.text
    soup = bS(markup, "lxml-xml")
    parts = soup.find_all("DIV5")
    part_soup = [div for div in parts if div["N"] == part_number][0]
    PAYLOAD.get_effective_date(part_number)
    PAYLOAD.parse_part(part_soup, part_number)
    part = PAYLOAD.part
    PAYLOAD.parse_version(part_soup, part)
    # parse_subparts will create and associate sections and appendices
    parse_subparts(part_soup, part)
    msg = "Draft version of Part {} created.\n" "Parsing took {}".format(
        part_number, (datetime.datetime.now() - starter)
    )
    return msg

def test_parse_appendices_creation(self):
    PAYLOAD.reset()
    self.assertEqual(len(PAYLOAD.appendices), 0)
    test_part = Part.objects.first()
    test_subpart = Subpart.objects.first()
    PAYLOAD.subparts['appendix_subpart'] = test_subpart
    PAYLOAD.interp_refs.update({'A': {'1': 'see(A-1-Interp)'}})
    soup = bS(self.test_xml, 'lxml-xml')
    test_appendices = [soup.find('DIV5').find('DIV9')]
    ecfr_importer.parse_appendices(test_appendices, test_part)
    self.assertEqual(len(PAYLOAD.appendices), 1)

def ecfr_to_regdown(part_number, file_path=None):
    """
    Extract a regulation Part from eCFR XML, and create regdown content.

    The default XML source is the latest regulation posting at www.gpo.gov,
    which gets updated every few days.

    If `file_path` is specified, a local XML file is parsed instead.

    DIV1 is a title (as in Title 12)
    DIV3 is a chapter (not used here)
    DIV5 is a part
    DIV6 is a subpart
    DIV8 is a section
    DIV9 is an appendix
    DIV9 element whose HEAD starts with 'Supplement I' is an interpretation

    To avoid mischief, we make sure the part number is on a whitelist.
    """
    PAYLOAD.reset()
    if part_number not in PART_WHITELIST:
        raise ValueError('Provided Part number is not a CFPB regulation.')
    starter = datetime.datetime.now()
    if file_path:
        try:
            with open(file_path, 'r') as f:
                markup = f.read()
        except IOError:
            logger.info("Could not open local file {}".format(file_path))
            return
    else:
        ecfr_request = requests.get(LATEST_ECFR)
        if not ecfr_request.ok:
            logger.info(
                "ECFR request failed with code {} and reason {}".format(
                    ecfr_request.status_code, ecfr_request.reason))
            return
        ecfr_request.encoding = 'utf-8'
        markup = ecfr_request.text
    soup = bS(markup, "lxml-xml")
    parts = soup.find_all('DIV5')
    part_soup = [div for div in parts if div['N'] == part_number][0]
    PAYLOAD.get_effective_date(part_number)
    PAYLOAD.parse_part(part_soup, part_number)
    part = PAYLOAD.part
    PAYLOAD.parse_version(part_soup, part)
    # parse_subparts will create and associate sections and appendices
    parse_subparts(part_soup, part)
    msg = (
        "Draft version of Part {} created.\n"
        "Parsing took {}".format(
            part_number, (datetime.datetime.now() - starter))
    )
    return msg

def test_appendix_graph_parsing(self):
    ls = IdLevelState()
    p_soup = bS(self.test_xml, 'lxml-xml')
    graphs = p_soup.find_all('DIV5')[1].find_all('DIV9')[1].find_all('P')
    parsed_graph2 = ls.parse_appendix_graph(graphs[2], '1002-A')
    self.assertIn("(2) To the extent not included in item 1 above:",
                  parsed_graph2)
    parsed_graph3 = ls.parse_appendix_graph(graphs[3], '1002-A')
    self.assertIn("(i) National banks", parsed_graph3)
    ecfr_importer.parse_appendix_paragraphs(graphs, 'appendix', '1002-A')
    self.assertIn('\n1(a)', p_soup.text)

def test_appendix_id_type_sniffer(self):
    ls = IdLevelState()
    p_soup = bS(self.test_xml, 'lxml-xml')
    appendices = p_soup.find_all('DIV5')[1].find_all('DIV9')
    appendix_0_graphs = appendices[0].find_all('P')
    appendix_0_type = ls.sniff_appendix_id_type(appendix_0_graphs)
    self.assertEqual('appendix', appendix_0_type)
    appendix_1_graphs = appendices[1].find_all('P')
    appendix_1_type = ls.sniff_appendix_id_type(appendix_1_graphs)
    self.assertEqual('section', appendix_1_type)
    appendix_2_graphs = appendices[2].find_all('P')
    appendix_2_type = ls.sniff_appendix_id_type(appendix_2_graphs)
    self.assertIs(appendix_2_type, None)

async def request_data_for_one_county(session: ClientSession, county_data):
    session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False))
    async with session.post(NEW_URL, data=county_data) as req:
        text = await req.read()
    _soup = bS(text.decode("utf-8"), "html.parser")
    office_data = _soup.find(id="pnlClerk").find(class_="card-body").text
    example = {"\t": None, "\n": " ", "\r": None}
    table = office_data.maketrans(example)
    cleaned = office_data.translate(table)
    res = format_data_into_schema(county_data["CountyName"], cleaned)
    return res

def review(product):
    try:
        file = open("url.txt", "r")
        contents = file.read()
        dictionary = ast.literal_eval(contents)
        productlink = dictionary[product]
        prodRes = requests.get(productlink)
        prodRes.encoding = 'utf-8'
        prod_html = bS(prodRes.text, "html.parser")
        print(prod_html)
        commentboxes = prod_html.find_all('div', {'class': "_3nrCtb"})
        reviews = []
        for commentbox in commentboxes:
            try:
                # name.encode(encoding='utf-8')
                name = commentbox.div.div.find_all(
                    'p', {'class': '_3LYOAd _3sxSiS'})[0].text
            except:
                name = 'No Name'
            try:
                # rating.encode(encoding='utf-8')
                rating = commentbox.div.div.div.div.text
            except:
                rating = 'No Rating'
            try:
                # commentHead.encode(encoding='utf-8')
                commentHead = commentbox.div.div.div.p.text
            except:
                commentHead = 'No Comment Heading'
            try:
                comtag = commentbox.div.div.find_all('div', {'class': ''})
                # custComment.encode(encoding='utf-8')
                custComment = comtag[0].div.text
            except Exception as e:
                print("Exception while creating dictionary: ", e)
            mydict = {
                "Product": product,
                "Name": name,
                "Rating": rating,
                "CommentHead": commentHead,
                "Comment": custComment
            }
            reviews.append(mydict)
        return render_template('results.html',
                               reviews=reviews[0:(len(reviews) - 1)])
    except Exception as e:
        print('The Exception message is: ', e)
        return 'something is wrong'
    return render_template('results.html')

def imdbScraping():
    celebrityNameList = []
    celebrityDetails = {}
    counter = 0
    BASE_URL = "http://m.imdb.com/feature/bornondate"
    # Using the Selenium tool to extract the content from the IMDB page,
    # as this page is dynamic in nature
    driver = webdriver.Chrome()
    driver.get(BASE_URL)
    html = driver.page_source
    # Creating a soup object from the html generated with the webdriver
    soup = bS(html, "html5lib")
    content = soup.find('section', 'posters list')
    bornToday = content.findChild('h1').text
    # Looping through all the 'a' tags that contain the details required
    # about the celebrities we are looking for
    for a in content.findAll('a', 'poster', limit=10):
        celebrityDetails[counter] = {}  # Dictionary that holds the details of each celebrity
        '''
        for 0 <= counter < 10, create the details of a celebrity we are interested in:
        celebrityDetails{counter: {"celebrityName": "name",
                                   "celebrityImage": "image",
                                   "celebrityProfession": "profession",
                                   "celebrityBestWork": "bestWork",
                                   "celebritySentimentAnalysis": "sentimentAnalysis p/n/nt",
                                   }
                        }
        '''
        # Extracting all the required details
        celebrityName = a.find('span', 'title').text
        celebrityNameList.append(celebrityName)
        celebrityDetails[counter]["celebrityName"] = celebrityName
        celebrityDetails[counter]["celebrityImage"] = a.img['src']
        Profession, bestWork = a.find('div', 'detail').text.split(",", 1)
        celebrityDetails[counter]["celebrityProfession"] = Profession
        celebrityDetails[counter]["celebrityBestWork"] = bestWork
        counter += 1
        # print counter
    # Returning the celebrity name list and the celebrity details
    return celebrityNameList, celebrityDetails

async def scrape_one_county(session, county_id, county_name):
    data = {"idTown": county_id, "SubmitCounty": "Submit", "contactType": "R"}
    async with session.post(INFO_URL, data=data) as s:
        text = await s.read()
    soup = bS(text, "html5lib")
    table = soup.find("table", {"id": "Table1"})
    rows = table.find_all("tr")

    # Get county registrar name
    registrar_name = ""
    if "County Chief Registrar" in rows[0].getText():
        registrar_name = get_county_registrar(rows[0].getText())

    # Get mailing and physical addresses
    phys_address, mail_address = "", ""
    if ("Physical Address:" in rows[0].getText()
            and "SAME AS ABOVE" not in rows[0].getText()):
        phys_info_str = str(rows[0])
        phys_address = format_address_html(phys_info_str)
    mail_info_str = str(rows[1])
    mail_address = format_address_html(mail_info_str)

    # Get phone number
    phone_number = ""
    if "Telephone: " in rows[2].getText():
        contact_info_str = rows[2].getText()
        phone_number = get_phone_number(contact_info_str)

    # Get Email
    email_address = ""
    email = soup.find("span", class_="__cf_email__")
    if email is not None:
        hex_email = email["data-cfemail"]
        # function to decode hexadecimal email strings
        # lifted this off some stackoverflow post lol
        email_address = electionsaver.decode_email(hex_email)

    return (
        registrar_name,
        phys_address,
        mail_address,
        phone_number,
        email_address,
        county_name,
    )

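# electionsaver.decode_email() is called above but not shown in this collection.
# As an assumption about what such a helper typically does, here is a minimal
# sketch of the common Cloudflare "data-cfemail" scheme (the real helper may
# differ): the first hex byte is an XOR key applied to every following byte.
def decode_cfemail(hex_email):
    key = int(hex_email[:2], 16)  # first byte is the XOR key
    decoded = bytes(
        int(hex_email[i:i + 2], 16) ^ key  # XOR each subsequent byte with the key
        for i in range(2, len(hex_email), 2)
    )
    return decoded.decode("utf-8")

# decode_cfemail(span["data-cfemail"]) would yield the plain-text address.
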
def getAllHeadlines(category, outputFile):
    """
    Reads all the headlines and saves them in the filename specified.
    :return: none
    """
    url = ["https://marathi.abplive.com/news/"]  # using this website to scrape
    url[0] += category
    allHeadlines = []
    counter = 2  # this website tracks pages starting from 2
    for link in url:
        while link:
            if counter > DATA_LEN:
                break
            htmlDoc = ''
            print(f"Getting............ {link}")
            req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
            with urlopen(req) as response:
                for line in response:
                    line = line.decode('utf-8')
                    htmlDoc = htmlDoc + line.replace('\n', '')
            soup = bS(htmlDoc, 'html.parser')
            headlineDiv = soup.find_all(
                'div', {'class': 'uk-width-3-5 fz20 p-10 newsList_ht'})
            for headLine in headlineDiv:
                article = headLine.text
                article = re.sub(r"\([^)]*\)", r'', article)
                article = re.sub(r"\[[^\]]*\]", r'', article)
                article = re.sub(r"<[^>]*>", r'', article)
                article = re.sub(r"^https?://.*[\r\n]*", r'', article)
                article = re.sub(r'^http?://.*[\r\n]*', r'', article)
                article = article.replace(u'\ufeff', '')
                article = article.replace(u'\xa0', u'')
                article = article.replace('  ', ' ')
                article = article.replace(' , ', ', ')
                article = article.replace('-', '')
                article += "\n"
                allHeadlines.append(article)
            link = url[0] + "/page-" + str(counter)
            counter += 1
    print(f"Total headlines collected :: {len(allHeadlines)}")
    with open(outputFile, "w") as file:
        for headline in allHeadlines:
            file.write(str(headline))
    print("Done......................")

async def get_election_offices():
    # page is dynamic--use selenium to execute the javascript before extracting data
    driver = WTVWebDriver("Missouri").get_webdriver()
    driver.get(URL)
    time.sleep(1)
    soup = bS(driver.page_source, "html.parser")
    elems = soup.find_all(class_="group")
    master_list = []
    for e in elems:
        text = [i.strip() for i in e.get_text("\n").split("\n") if i.strip()]
        county = text[0].split(",")[0].split(" County")[0].split(" Board")[0]
        street_number_name = text[1]
        city = text[2].split(",")[0]
        zip_code = text[2].split()[-1]
        phone = text[3]
        website = URL if len(text) == 6 else text[-1]
        subschema = format_address_data(street_number_name, county)
        schema = {
            "countyName": county,
            "physicalAddress": {
                "city": city,
                "state": "Missouri",
                "zipCode": zip_code,
                "locationName": subschema["locationName"],
            },
            "phone": phone,
            "website": website,
        }
        if "poBox" in subschema:
            schema["physicalAddress"]["poBox"] = subschema["poBox"]
        if "aptNumber" in subschema:
            schema["physicalAddress"]["aptNumber"] = subschema["aptNumber"]
        if "streetNumberName" in subschema:
            schema["physicalAddress"]["streetNumberName"] = subschema[
                "streetNumberName"]
        master_list.append(schema)
    with open(os.path.join(ROOT_DIR, "scrapers", "missouri", "missouri.json"),
              "w") as f:
        json.dump(master_list, f)
    return master_list

def main():
    url = input("Enter the URL - ")
    # url = "https://www.keepinspiring.me/quotes-about-happiness/"
    headerS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"}
    page = requests.get(url, headers=headerS)
    soup = bS(page.content, 'html.parser')
    # print(soup.prettify())
    quotes = []
    for i in soup.find_all(class_="author-quotes"):
        quotes.append(i.get_text())
    f = open("happyQuotes.txt", "w", encoding="utf-8")
    for quote in quotes:
        f.write(quote + "\n")
    f.close()

def test_table_no_head_rows(self):
    test_table = ('<DIV>\n'
                  '<TABLE class="gpotbl_table">\n'
                  '<TR>\n'
                  '<TD>\n'
                  'Cell content\n'
                  '</TD>\n'
                  '</TR>'
                  '</TABLE>'
                  '</DIV>')
    table_soup = bS(test_table, 'lxml-xml').find('TABLE')
    table_label = '{table-test-label}'
    regtable = RegTable(table_label)
    msg = regtable.parse_xml_table(table_soup)
    self.assertEqual(msg, "Table is set for {}!".format(table_label))
    self.assertNotIn('<thead>', regtable.table())

async def get_election_offices():
    """Starting point of the scraper program.

    Scrapes BASE_URL for election office information and both dumps
    results to a .json file and returns the results as json.

    @return: list of scraped results as json.
    """
    # Define coroutine functions (context managers)
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # ClientResponse.read() is a coroutine function so it must be awaited
            text = await s.read()
        soup = bS(text.decode("utf-8"), "html.parser")
        test_county_data = get_county_codes_and_names(soup)
        county_data = sorted(test_county_data, key=lambda k: k["countyName"])
        num_scraped = 0
        master_list = []

        # Create list that will store asyncio tasks
        tasks: List[Task] = []
        for county in county_data:
            code = county["countyCode"]
            name = county["countyName"]
            # Create task for a future asynchronous operation and store it in task list
            tasks.append(asyncio.create_task(scrape_one_county(session, code, name)))

        # Run the coroutines and iterate over the yielded results as they complete
        # (out-of-order). Use asyncio.gather() with a couple code modifications to
        # preserve list order
        future: Future[Tuple[str, str, str, str]]
        for future in asyncio.as_completed(tasks):
            # Unpack awaited result of scrape_one_county()
            cleaned_string, protected_email, _, county_name = await future
            schema = format_data_into_schema(
                cleaned_string, protected_email, county_name
            )
            master_list.append(schema)
            num_scraped += 1
            print(
                f"[Florida] Scraped {county_name} county: "
                f"#{num_scraped} of {len(county_data)} .... "
                f"[{round((num_scraped / len(county_data)) * 100, 2)}%]"
            )
    with open(os.path.join(ROOT_DIR, "scrapers", "florida", "florida.json"), "w") as f:
        json.dump(master_list, f)
    return master_list

async def scrape_one_county(session, county_code, county_name):
    url = BASE_URL + "countyInfo.asp?county=" + county_code
    # s = scraper.get(url)
    async with session.get(url) as s:
        text = await s.read()
    soup = bS(text.decode("utf-8"), "html.parser")
    # relevant info is in a random <p> with no classes
    county_info = soup.find("p", attrs={"class": None}).text
    hex_email = soup.find("span", class_="__cf_email__")["data-cfemail"]
    # clean up \t \r \n tags from string
    example = {"\t": None, "\n": " ", "\r": None}
    table = county_info.maketrans(example)
    cleaned = county_info.translate(table)
    return cleaned, hex_email, county_code, county_name

async def scrape_one_county(session, county_name):
    county_url = BASE_URL + county_name.lower()
    async with session.get(county_url) as s:
        text = await s.read()
    soup = bS(text, "html5lib")
    p_tags = soup.findAll('p')

    address = ''
    county_website = county_url
    phone_number = ''
    email_address = ''
    director_name = ''

    # Basically need to make a state machine and parse line by line, initially
    # scraping address components, then phone number, director, email, etc.
    # This website sucks.
    # Variable to determine whether we are still scraping the address.
    scraping_address = True
    for line in p_tags[3:]:
        if (phone_number == '' and '(' in line.text and ')' in line.text
                and "Post" not in line.text and "John" not in line.text
                and "Room" not in line.text):
            raw_number = line.text
            phone_number = raw_number.replace('Phone', '').replace(
                'Office', '').replace(':', '').strip()
            # No longer on an address line, so set to false.
            scraping_address = False
        if director_name == '' and 'Director' in line.text:
            end_index = line.text.index('Director')
            director_name = line.text[:end_index].replace('-', '').strip()
        if email_address == '' and '@' in line.text:
            email_address = line.text.strip()
        if 'Board of Voter Registration' in line.text:
            county_website = line.find('a')['href']
        if scraping_address:
            address = address + ' ' + line.text
    return (address, county_website, phone_number, email_address,
            director_name, county_name)

def test_interp_graph_parsing(self):
    soup = bS(self.interp_xml, 'lxml-xml')
    part_soup = soup.find('DIV5')
    part = parse_part(part_soup, '1002')
    version = parse_version(part_soup, part)
    interp_subpart = Subpart(
        title="Supplement I to Part {}".format(part.part_number),
        label="Official Interpretations",
        version=version)
    interp_subpart.save()
    interp = [div for div in part_soup.find_all('DIV9')
              if div.find('HEAD').text.startswith('Supplement I')][0]
    parse_interps(interp, part, interp_subpart)
    self.assertEqual(
        Subpart.objects.filter(title__contains='Supplement I').count(),
        1,
    )

def test_interp_inferred_section_graph_parsing(self):
    PAYLOAD.reset()
    self.assertEqual(PAYLOAD.interp_refs, {})
    soup = bS(self.interp_xml, 'lxml-xml')
    parts = soup.find_all('DIV5')
    part_soup = [div for div in parts if div['N'] == '1030'][0]
    PAYLOAD.parse_part(part_soup, '1030')
    part = PAYLOAD.part
    PAYLOAD.parse_version(part_soup, part)
    version = PAYLOAD.version
    interp_subpart = Subpart(
        title="Supplement I to Part {}".format(part.part_number),
        label="Official Interpretations",
        version=version)
    interp_subpart.save()
    interp = [div for div in part_soup.find_all('DIV9')
              if div.find('HEAD').text.startswith('Supplement I')][0]
    ecfr_importer.parse_interps(interp, part, interp_subpart)
    self.assertEqual(PAYLOAD.interp_refs['1']['c'], 'see(1-c-Interp)')

def test_interp_graph_parsing(self):
    soup = bS(self.interp_xml, 'lxml-xml')
    part_soup = soup.find('DIV5')
    PAYLOAD.parse_part(part_soup, '1002')
    part = PAYLOAD.part
    PAYLOAD.parse_version(part_soup, part)
    version = PAYLOAD.version
    interp_subpart = Subpart(
        title="Supplement I to Part {}".format(part.part_number),
        label="Official Interpretations",
        version=version)
    interp_subpart.save()
    interp = [div for div in part_soup.find_all('DIV9')
              if div.find('HEAD').text.startswith('Supplement I')][0]
    ecfr_importer.parse_interps(interp, part, interp_subpart)
    self.assertEqual(
        Subpart.objects.filter(title__contains='Supplement I').count(),
        1,
    )

def test_parse_appendix_elements(self):
    p_soup = bS(self.test_xml, 'lxml-xml')
    appendix = p_soup.find('DIV5').find('DIV9')
    parsed_appendix = ecfr_importer.parse_appendix_elements(appendix, 'A')
    self.assertIn("**1.", parsed_appendix)

def test_parse_interp_graph_no_id(self):
    section_graph_element_no_id = bS(
        "<P>This is a bare interp paragraph with no ID.</P>", 'lxml-xml')
    parsed_graph = ecfr_importer.parse_interp_graph(
        section_graph_element_no_id)
    self.assertTrue(parsed_graph.startswith('This is a bare interp'))

def test_multi_id_paragraph_parsing(self):
    soup = bS(self.test_xml, 'lxml-xml')
    graph_soup = soup.find_all('P')
    parsed_graphs = ecfr_importer.parse_section_paragraphs(graph_soup, '1')
    self.assertIn('**(a) Delivery of account disclosures**', parsed_graphs)