def _identify_exact_matches(self, classroom_id, first_name, last_name,
                            middle_initial):
    """Search for exact matches to identify students.

    "Exact" means match on first name, last name, middle initial, and
    classroom.
    """
    stripped_first_name = util.clean_string(first_name)
    stripped_last_name = util.clean_string(last_name)
    if middle_initial is not None:
        middle_initial = util.clean_string(middle_initial).upper()

    # <issue #208>
    # <remove later>
    # For the time being, it also means either regular or stripped versions
    # of names. In the future, we will only process stripped names.
    normal_q = self._base_identify_query(classroom_id)
    normal_q.filter('first_name =', first_name)
    normal_q.filter('last_name =', last_name)
    normal_q.filter('middle_initial =', middle_initial)
    # </remove later>
    # </issue #208>

    # Query based on stripped names because we expect students to type
    # their name differently from session to session. Stripping attempts
    # to make their name uniform and still unique. See util.clean_string().
    stripped_q = self._base_identify_query(classroom_id)
    stripped_q.filter('stripped_first_name =', stripped_first_name)
    stripped_q.filter('stripped_last_name =', stripped_last_name)
    stripped_q.filter('middle_initial =', middle_initial)

    # <issue #208>
    # <remove later>
    combined_results = normal_q.fetch(5) + stripped_q.fetch(5)
    unique_results = list(set(combined_results))
    return unique_results
def scrape_urls(self, response):
    # 1. sort through data and extract urls
    # 2. put urls together
    # 3. Loop to each url, returning @parse
    base_url = "https://www.walmart.com"
    self.raw = response.body_as_unicode()
    #print("raw: " + self.raw)
    remove = ['{', '}', 'Link', ' ']
    self.cleaned = self.raw
    for char in remove:
        self.cleaned = self.cleaned.replace(char, '')
    self.comma_split = self.cleaned.split('","')
    #print ("cleaned - " + cleaned)
    #print ("comma_split - " )
    #print (*comma_split)
    self.colon_split = [entry.split('":"') for entry in self.comma_split]
    #inspect_response(response, self)
    self.colon_split[0].remove('"sections')
    #print ("colon_split - ")
    #print (*colon_split)
    self.urls = [entry[-1] for entry in self.colon_split]
    #print("urls - ")
    #print(self.urls)

    section = "unset"
    subsection = "unset"
    self.section_dict = {}
    chars_to_remove = ["\'", "&"]
    for entry in self.colon_split:
        # each entry will have a subheading (normally at 0 unless it has a heading entry)
        section = clean_string(entry[0], chars_to_remove)
        url_end = clean_string(entry[-1], "\"")
        # if its a section header it will contain 3 entries
        # and all subsequent entries will have the same heading
        if len(entry) > 2:
            section = clean_string(entry[0], chars_to_remove)
            subsection = clean_string(entry[1], chars_to_remove)
        url = base_url + url_end
        category = lookup_category("", section, subsection)
        store_url(self.conn, url, self.store_id, category, section, subsection)
        #self.section_dict[url] = (self.section, self.subsection)
        #print(section, subsection, url)

    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("No more urls to parse, finishing")
    else:
        # Queue the next stored url for parsing.
        yield SplashRequest(next_url, self.parse,
                            endpoint='render.html',
                            args={
                                'wait': 10,
                                'section': section,
                                'subsection': subsection
                            })
def get_name_and_electronic_votes():
    name_votes = {}
    electronic_votes = {}
    s3 = soup.find('div', {'class': 'Section3'})
    if s3:
        tags = s3.find_all(
            text=re.compile(r'Vote\s*nominatif\s*-\s*Naamstemming:'))
        tags += s3.find_all(
            text=re.compile(r'Naamstemming\s*-\s*Vote\s*nominatif:'))

        for i, tag in enumerate(tags):
            vote_number = extract_vote_number_from_tag(tag, i)
            vote_header = go_to_p(tag)
            cancelled, current_node = is_vote_cancelled(vote_header)
            if cancelled:
                continue

            yes, current_node = extract_name_list_from_under_table(
                current_node.find_next_sibling())
            no, current_node = extract_name_list_from_under_table(
                current_node.find_next_sibling())

            abstention = []
            # Handles the case where the abstention box is missing (no abstentions)
            if ('onthoudingen' in current_node.get_text().lower()
                    or 'abstentions' in current_node.get_text().lower()):
                next_vote = go_to_p(tags[i + 1]).find_previous_sibling() \
                    if i + 1 < len(tags) \
                    else vote_header.parent.find_all('p')[-1]
                current_node = next_vote
                abstention = clean_string(current_node.get_text())
                current_node = current_node.find_previous_sibling()
                # TODO: merge with function
                while not (current_node.name == "table"
                           or 'naamstemming' in current_node.get_text().lower()):
                    if current_node.get_text():
                        abstention = clean_string(
                            current_node.get_text()) + ',' + abstention
                    current_node = current_node.find_previous_sibling()
                abstention = clean_list(abstention.split(','))

            name_votes[vote_number] = (yes, no, abstention)

        tags = s3.find_all(text=re.compile(
            r'Comptage\s*électronique\s*–\s*Elektronische telling:'))
        for i, tag in enumerate(tags):
            vote_number = extract_vote_number_from_tag(tag, i)
            vote_header = go_to_p(tag)
            cancelled, current_node = is_vote_cancelled(vote_header)
            if cancelled:
                continue
            electronic_votes[vote_number] = current_node

    return name_votes, electronic_votes
def extract_name_list_from_under_table(current_node):
    name_list = clean_string(current_node.get_text())
    while not (current_node.name == "table"
               or 'naamstemming' in current_node.get_text().lower()):
        if current_node.get_text():
            name_list += ',' + clean_string(current_node.get_text())
        current_node = current_node.find_next_sibling()
    name_list = clean_list(name_list.split(','))
    return name_list, current_node
def extract_title_by_vote(table: NavigableString, language: Language):
    class_name = Meeting.language_mapping[language][1]
    next_line = table.find_previous_sibling("p", {"class": class_name})
    while not re.match(r"([0-9]+) (.)*", clean_string(next_line.text)):
        next_line = next_line.find_previous_sibling(
            "p", {"class": class_name})
    match = re.match(r"([0-9]+) (.*)", clean_string(next_line.text))
    return int(match.group(1))
def check_top_achat(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)
        nb_resultats = tree.xpath(
            '//*[@id="content"]/nav[1]/ul/li[4]/text()')[0]
        nb = util.make_num(nb_resultats)
        results = []
        liste_prix_ = tree.xpath(
            "//section[@class = 'produits list']//div[@itemprop= 'price']/text()"
        )
        liste_titres = tree.xpath(
            "//section[@class = 'produits list']//div[@class = 'libelle']/a/h3/text()"
        )
        liste_dispos = tree.xpath(
            "//section[@class = 'produits list']//section[last()]/@class")
        for i in range(0, int(nb)):
            prix_ = liste_prix_[i][0:-4]
            prix = util.make_num(prix_)
            if (int(prix) >= 850):
                continue
            titre = liste_titres[i]
            geforce_ad = " + 1 an d'abonnement GeForce Now offert ! ".lower()
            call_of_ad = "+ Call of Duty: Black Ops Cold War offert ! ".lower()
            if ('water' in titre.lower() or 'hydro' in titre.lower()):
                continue
            elif (geforce_ad in titre.lower()):
                titre = titre[0:len(titre) - len(geforce_ad)]
            elif (call_of_ad in titre.lower()):
                titre = titre[0:len(titre) - len(call_of_ad)]
            raw_dispo = liste_dispos[i]
            dispo = ""
            if (raw_dispo == 'en-rupture'):
                dispo = 'Rupture'
            elif (raw_dispo == 'dispo-sous-7-jours'):
                dispo = 'sous 7 jours'
            elif (raw_dispo == 'dispo-entre-7-15-jours'):
                dispo = 'entre 7-15 jours'
            elif (raw_dispo == 'dispo-plus-15-jours'):
                dispo = '+ de 15 jours'
            else:
                dispo = raw_dispo
            results.append(
                ('topachat.com ' + util.clean_string(titre), dispo,
                 util.clean_string(prix)))
        out_results += results
    return out_results
def check_pc_componentes(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)
        titres = tree.xpath(
            f"//div[@class = 'c-product-card__content']/header/h3/a/text()")
        prixs = tree.xpath(
            f"//div[@class = 'c-product-card__content']/div[2]/div/span/text()")
        dispos = tree.xpath(
            f"//div[@class = 'c-product-card__content']/div[3]/text()")
        results = []
        for titre, prix, dispo in zip(titres, prixs, dispos):
            if (',' in prix):
                prix = util.make_num(prix[0:-4])
            else:
                prix = util.make_num(prix)
            if (int(prix) >= 850):
                continue
            if 'rtx' not in titre.lower():
                continue
            avoid_bool = False
            avoid_words = [
                'reacondicionado', 'recondicionado', 'water', 'hydro', 'ekwb',
                'intel', 'ryzen', '2080', '2070', 'i7', 'i5', 'Vector'
            ]
            for a in avoid_words:
                if a in util.clean_string(titre.lower()):
                    avoid_bool = True
                    break
            if avoid_bool:
                continue
            if (util.clean_string(dispo).lower() == "sin fecha de entrada"):
                dispo = "Rupture"
            else:
                dispo = "Check dispo"
            results.append(
                ('pccomponentes.com ' + util.clean_string(titre), dispo,
                 util.clean_string(prix)))
        out_results += results
    return out_results
def ldlc_targeted(url):
    tree = util.get_tree(url)
    name = tree.xpath("/html/body/div[3]/div[2]/div[1]/h1/text()")[0]
    dispo = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[4]/div[1]/div[2]/div/span/text()"
    )[0]
    prix_ = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[1]/div/text()"
    )[0][0:-1]
    prix = util.make_num(prix_)
    return (util.clean_string(name), util.clean_string(dispo),
            util.clean_string(prix))
def counter_processing(dataframe, is_zip_root, sanitize_dev=False):
    word_set_diff = set()  # Used for storing words/phrases to later be similarity checked
    d = dataframe
    # Dict to aggregate counts
    nested_dict = defaultdict(Counter)
    cross_ref_fix = 0  # Option 2 related only, cross-ref counter

    # Primary loop to locate, and increment. Cleanse functions are used here.
    for row in range(len(d)):
        # Yank
        zip = d.loc[row, 'incident_zip']
        complaint = d.loc[row, 'complaint_type']
        borough = d.loc[row, 'borough']
        # Clean
        zip = verify_clean_zip(zip)
        complaint = clean_string(complaint)
        borough = clean_string(borough)
        # Dev - String Similarity sets
        word_set_diff.add(borough)
        word_set_diff.add(complaint)

        print(borough, " - ", complaint, " - ", zip)  # Raw print as rows iterate

        if (is_zip_root):
            # Option 1 at menu (Zip is parent/root)
            nested_dict[zip][complaint] += 1
        elif ('unspecified' not in borough):
            # Option 2 always - with unspecified check
            nested_dict[borough][complaint] += 1
        elif ("unspecified" in borough and zip is not None):
            # Option 2 but bad borough string
            print(borough, zip)
            if attempt_borough_from_zip(zip):
                borough = attempt_borough_from_zip(zip)  # Attempting cross reference to find borough
                if borough:
                    cross_ref_fix += 1
                    nested_dict[clean_string(borough)][complaint] += 1  # Success on cross reference!
            else:
                print("No Cross Reference Found")

    print("\n" * 5, " -------- \n")
    pprint(dict(nested_dict))  # Print out final structure
    if (not is_zip_root):
        print("FIXED CROSS REFERENCED:", cross_ref_fix)

    # Kicks off fuzzy-wuzzy checking (Option 5)
    if sanitize_dev:
        print("\n\n-- FUZZY CHECKING --")
        handle_similarity_debug(word_set_diff)
def test_clean_string(self):
    """Test that clean_string() returns only lowercase a-z and digits 0-9, of type str."""
    strings_to_clean = [
        u'Nicholas',
        u'Nicolás',
        u'N1colas',
        u'N#$colas',
        u'Nichol"a"s',
        u'Nich\olas',
        '12345',  # Some schools want to use ids rather than last names
        'Nicholas',
        'N1colas',
        'N#$colas',
        'Nichol"a"s',
        "Nich\olas",
        # This guy *shouldn't* fail, but it won't return what we want it to
        # (This isn't a problem right now, because the front end is serving
        # us unicode objects, not strs.):
        'Nicolás',
    ]
    for index, test_string in enumerate(strings_to_clean):
        # Nothing but lowercase alphabetic characters and digits,
        # beginning to end.
        pattern = r'^[a-z0-9]+$'
        cleaned_string = util.clean_string(test_string)
        # re.match() will return None if the pattern doesn't match.
        self.assertIsNotNone(re.match(pattern, cleaned_string),
                             'string index: {}'.format(index))
        # output must always be a string (not unicode)
        self.assertIsInstance(cleaned_string, str)
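# The test above pins down the contract of util.clean_string(): lowercase the input,
# strip accents and punctuation, and return a plain str containing only [a-z0-9].
# A minimal sketch consistent with that contract follows; it is an assumption for
# illustration, not the project's actual implementation.
import re
import unicodedata

def clean_string_sketch(s):
    """Hypothetical helper: lowercase, strip accents, keep only a-z and 0-9."""
    if isinstance(s, bytes):
        s = s.decode('utf-8', 'ignore')
    # Decompose accented characters, then drop anything outside ASCII.
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    return re.sub(r'[^a-z0-9]', '', s.lower())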
def _identify_partial_matches(self, classroom_id, last_name):
    """Search for partial matches to identify students.

    "Partial" means we don't use first name.
    """
    stripped_last_name = util.clean_string(last_name)

    # <issue #208>
    # <remove later>
    # For the time being, it also means either regular or stripped versions
    # of names. In the future, we will only process stripped names.
    normal_q = self._base_identify_query(classroom_id)
    normal_q.filter('last_name =', last_name)
    # </remove later>
    # </issue #208>

    # Query based on stripped names because we expect students to type
    # their name differently from session to session. Stripping attempts
    # to make their name uniform and still unique. See util.clean_string().
    stripped_q = self._base_identify_query(classroom_id)
    stripped_q.filter('stripped_last_name =', stripped_last_name)

    # <issue #208>
    # <remove later>
    combined_results = normal_q.fetch(5) + stripped_q.fetch(5)
    unique_results = list(set(combined_results))
    return unique_results
def _identify_partial_matches(self, cohort_id, last_name):
    """Search for partial matches to identify students.

    Pulls data from a special set of memcache keys, which are updated by
    cron, and provide the names of all students in the school. All the names
    are examined to see if the typed name is contained in or contained by the
    existing name ("containment matching"), which are considered partial
    matches. Then the matches are ordered by their similarity (Levenshtein
    distance) to the typed name.
    """
    stripped_last_name = util.clean_string(last_name)
    match_data, from_memcache = self.internal_api.get_roster(cohort_id)

    # White list necessary properties (no sense in releasing status codes
    # like 'Parent Refusal' to the public).
    def clean_properties(d):
        white_list = [
            'first_name', 'last_name', 'classroom_name', 'id',
            'stripped_last_name'
        ]
        return {k: v for k, v in d.items() if k in white_list}

    # Containment matching.
    matches = [
        clean_properties(u) for u in match_data
        if u['stripped_last_name'] in stripped_last_name
        or stripped_last_name in u['stripped_last_name']
    ]

    # Order by edit (Levenshtein) distance from the submitted name.
    sort_func = lambda n: util.levenshtein_distance(
        n['stripped_last_name'], stripped_last_name)
    return sorted(matches, key=sort_func)
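# _identify_partial_matches() orders candidates with util.levenshtein_distance().
# The sketch below shows the standard dynamic-programming edit distance such a helper
# presumably computes; the project's own implementation may differ.
def levenshtein_distance_sketch(a, b):
    """Minimum number of single-character edits (insert, delete, substitute)."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            # deletion, insertion, substitution
            curr.append(min(prev[j] + 1,
                            curr[j - 1] + 1,
                            prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]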
def check_ldlc(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)
        nb_resultats = tree.xpath(
            '/html/body/div[3]/div/div[3]/div[1]/div/div[2]/div[1]/div[1]/text()'
        )[0]
        nb = util.make_num(nb_resultats)
        # 48 is the maximum of items in a page
        if int(nb) > 48:
            nb = 48
        results = []
        for i in range(1, int(nb) + 1):
            prix_ = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[4]/div[1]/div/text()"
            )[0]
            prix = util.make_num(prix_)
            if (int(prix) >= 850):
                continue
            titre = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[1]/div[1]/h3/a/text()"
            )[0]
            if ('water' in titre.lower() or 'hydro' in titre.lower()):
                continue
            dispo = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/text()"
            )[0]
            dispo_p2 = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/em/text()"
            )
            if len(dispo_p2) >= 1:
                dispo = dispo + ' ' + dispo_p2[0]
            results.append(('LDLC.com ' + util.clean_string(titre),
                            util.clean_string(dispo),
                            util.clean_string(prix)))
        out_results += results
    return out_results
def from_table(meeting_topic, vote_number: int, vote_rows: NavigableString):
    """Generate a new Vote from a parsed table.

    Args:
        meeting_topic (MeetingTopic): The meeting topic
        vote_number (int): Number of the vote in this meeting (e.g. 1)
        vote_rows (NavigableString): Vote rows as obtained by BeautifulSoup

    Returns:
        Vote:
    """
    yes = int(clean_string(vote_rows[1].find_all(
        'td')[1].find('p').get_text()))
    no = int(clean_string(vote_rows[2].find_all(
        'td')[1].find('p').get_text()))
    abstention = int(clean_string(
        vote_rows[3].find_all('td')[1].find('p').get_text()))
    return GenericVote(meeting_topic, vote_number, yes, no, abstention)
def parse_topics(language):
    classes = Meeting.language_mapping[language]
    titles = soup.find_all('p', {'class': classes[1]})
    current_title = ""
    while titles:
        item = titles.pop()
        if not clean_string(item.text):
            continue
        while not re.match("([0-9]+) (.*)", clean_string(item.text)):
            current_title = clean_string(item.text) + '\n' + current_title
            item = titles.pop()
        m = re.match("([0-9]+) (.*)", clean_string(item.text))
        current_title = m.group(2) + '\n' + current_title
        section = item.find_previous_sibling("p", {"class": classes[0]})
        item = int(m.group(1))
        if item not in self.topics:
            self.topics[item] = MeetingTopic(
                self.parliamentary_session, self, item)
        self.topics[item].set_title(language, current_title.rstrip())
        self.topics[item].set_section(
            language,
            clean_string(section.text) if section else
            ("Algemeen" if language == Language.NL else "Generale"))
        self.topics[item].complete_type()
        if language == Language.NL:
            title = normalize_str(current_title.rstrip().lower()).decode()
            for member in self.parliamentary_session.get_members():
                if member.normalized_name() in title:
                    member.post_activity(
                        TopicActivity(member, self, self.topics[item]))
        current_title = ""
def _identify_exact_matches(self, classroom_id, first_name, last_name):
    """Search for exact matches to identify students.

    "Exact" means match on first name, last name, and classroom.
    """
    stripped_first_name = util.clean_string(first_name)
    stripped_last_name = util.clean_string(last_name)

    logging.info(
        "Querying for exact match on is_test: False, user_type: student, "
        "classroom: {}, stripped_first_name: {}, stripped_last_name: {}".
        format(classroom_id, stripped_first_name, stripped_last_name))

    # Query based on stripped names because we expect students to type
    # their name differently from session to session. Stripping attempts
    # to make their name uniform and still unique. See util.clean_string().
    stripped_q = self._base_identify_query(classroom_id)
    stripped_q.filter('stripped_first_name =', stripped_first_name)
    stripped_q.filter('stripped_last_name =', stripped_last_name)

    return stripped_q.fetch(5)
def from_table(meeting_topic, vote_number: int, vote_rows: NavigableString):
    """Generate a new Vote from a parsed table.

    Args:
        meeting_topic (MeetingTopic): The meeting topic
        vote_number (int): Number of the vote in this meeting (e.g. 1)
        vote_rows (NavigableString): Vote rows as obtained by BeautifulSoup

    Returns:
        Vote:
    """
    yes_fr = int(clean_string(
        vote_rows[2].find_all('td')[1].find('p').get_text()))
    no_fr = int(clean_string(
        vote_rows[3].find_all('td')[1].find('p').get_text()))
    abstention_fr = int(clean_string(
        vote_rows[4].find_all('td')[1].find('p').get_text()))
    yes_nl = int(clean_string(
        vote_rows[2].find_all('td')[3].find('p').get_text()))
    no_nl = int(clean_string(
        vote_rows[3].find_all('td')[3].find('p').get_text()))
    abstention_nl = int(clean_string(
        vote_rows[4].find_all('td')[3].find('p').get_text()))
    return LanguageGroupVote(
        meeting_topic, vote_number,
        GenericVote(meeting_topic, vote_number, yes_nl, no_nl, abstention_nl),
        GenericVote(meeting_topic, vote_number, yes_fr, no_fr, abstention_fr))
def parse(self, response):
    page_1_str = self.page_str + "1"
    this_url = trim_url(response.url, page_1_str)
    print(f"inside parse for {this_url}")
    self.scrape_urls(response)

    # Only scrape pages that have the page_str in the url.
    if this_url.find(self.page_str) != -1:
        print(f"scraping for {this_url}")
        items = response.css('product-item-v2')
        print(f"length of items - {len(items)}")
        metadata = get_url_metadata(self.cursor, this_url)
        section = metadata[1]
        subsection = metadata[2]
        for item in items:
            name = item.css('.product-title ::text').get()
            price_strings = item.css('.product-price ::text').getall()
            price = clean_string(price_strings[-1], ['$'])
            ppu = item.css('.product-price-qty ::text').get()
            unit = self.collect_units(name)
            #inspect_response(response,self)
            if unit == "OZ" or unit == "LB":
                ounces = self.collect_ounces(name)
            else:
                ounces = 0

            print(f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": this_url,
                "section": section,
                "subsection": subsection
            }

    # Basically the website redirects us to the url and page_1_str, which isn't added
    # to our database, so we trim that off so we can get the url in our database.
    finish_url(self.conn, self.store_id, this_url)
    print("finishing url - " + this_url)

    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("Next url is none therefore we must be finished ! ")
        return
    else:
        next_request = create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#openFulfillmentModalButton')))
        print(f"got next_url - {next_url}")
        yield next_request
def electronic_vote_from_table(meeting_topic, vote_number: int,
                               vote_start_node: NavigableString):
    """Generate a new electronic (advisory or generic) vote from a parsed table.

    Args:
        meeting_topic (MeetingTopic): The meeting topic
        vote_number (int): Number of the vote in this meeting (e.g. 1)
        vote_start_node (NavigableString): Vote start node as obtained by BeautifulSoup

    Returns:
        Vote:
    """
    yes = int(clean_string(vote_start_node.find_all(
        'td')[1].find('p').get_text()))

    vote_end_node = vote_start_node.find_next_sibling().find_next_sibling()
    if not vote_end_node or vote_end_node.name != 'table':
        return ElectronicAdvisoryVote(meeting_topic, vote_number, yes)

    no = int(clean_string(vote_end_node.find_all(
        'td')[1].find('p').get_text()))
    return ElectronicGenericVote(meeting_topic, vote_number, yes, no)
def update_value(self, name, ts, val):
    "insert value into appropriate table"
    if val is None:
        return
    if ts is None or ts < self.MIN_TIME:
        ts = time.time()
    self.pvinfo[name]['last_ts'] = ts
    self.pvinfo[name]['last_value'] = val

    info = self.pvinfo[name]
    try:
        self.db.execute(self.sql_insert % (info['data_table'], info['id'],
                                           ts, clean_string(val)))
    except TypeError:
        self.write("cannot update %s\n" % name)
def from_table(meeting_topic, vote_number: int, vote_rows: NavigableString):
    """Generate a new Vote from a parsed table.

    Args:
        meeting_topic (MeetingTopic): The meeting topic
        vote_number (int): Number of the vote in this meeting (e.g. 1)
        vote_rows (NavigableString): Vote rows as obtained by BeautifulSoup

    Returns:
        Vote:
    """
    yes_str = clean_string(vote_rows[1].find_all(
        'td')[1].find('p').get_text())
    if not yes_str:
        # Sometimes, tables are empty... example:
        # https://www.dekamer.be/doc/PCRI/html/55/ip100x.html
        return None
    yes = int(yes_str)
    no = int(clean_string(vote_rows[2].find_all(
        'td')[1].find('p').get_text()))
    abstention = int(clean_string(
        vote_rows[3].find_all('td')[1].find('p').get_text()))
    return GenericVote(meeting_topic, vote_number, yes, no, abstention)
def check_materiel(url_list, web_driver):
    output_results = []
    for url in url_list:
        web_driver.get(url)
        nb_resultats = web_driver.find_element_by_xpath(
            '//*[@id="tabProducts"]').text
        nb = util.make_num(nb_resultats)
        if int(nb) > 48:
            nb = 48
        results = []
        for i in range(1, int(nb) + 1):
            prix_ = web_driver.find_element_by_xpath(
                f"//*[@data-position = '{i}']/div[4]/div[1]/span").text[0:-2]
            prix = util.make_num(prix_)
            if (int(prix) >= 850):
                continue
            titre = web_driver.find_element_by_xpath(
                f"//*[@data-position = '{i}']/div[2]/a/h2").text
            if ('water' in titre.lower() or 'hydro' in titre.lower()):
                continue
            dispo = web_driver.find_element_by_xpath(
                f"//*[@data-position = '{i}']/div[3]/div/span[2]").text
            if dispo == 'RUPTURE':
                dispo = "Rupture"
            results.append(('Materiel.net ' + util.clean_string(titre),
                            util.clean_string(dispo),
                            util.clean_string(prix)))
        output_results += results
    return output_results
def get_quantity(self):
    quantity_selector = (
        "body > app-root > div > hts-layout > span > hts-shop-by-category > div > "
        "section > div > div.product-category-list.col-lg-7.col-md-9.column7 > "
        "div.smart-filter.clearfix > h2 > span")
    ret = 0
    try:
        quantity = self.driver.find_element_by_css_selector(
            quantity_selector).text
        quantity = clean_string(quantity, ['(', ')'])
        # Guard against an empty count before converting to int.
        if not quantity:
            ret = 0
        else:
            ret = int(quantity)
    except NoSuchElementException:
        ret = 0
    print(f"in get_quantity - found quantity of {ret}")
    return ret
def parse(self, response):
    self.driver = response.request.meta['driver']
    close_modal(self)
    change_store_location(self)

    url = response.url
    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]

    #check if it has a next button,
    items = response.css('.cell-content-wrapper')
    for item in items:
        name = item.css('.cell-title-text ::text').get()
        name = clean_string(name, ['\"'])
        price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
        price = convert_dollars(price)

        quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()

        unit = item.css('.cell-product-size ::text').get()
        ounces = convert_to_ounces(unit)

        ppu = item.css('[data-test="per-unit-price"] ::text').get()
        ppu = convert_ppu(ppu)

        self.logger.info(
            f"name - {name}, price - {price}, quantity - {quantity}, ounces - {ounces}, ppu - {ppu}, url - {url}, section - {section}, subsection - {subsection} "
        )
        #inspect_response(response,self)
        yield {
            "name": name,
            "price": price,
            "ounces": ounces,
            "unit": unit,
            "price-per-unit": ppu,
            "url": url,
            "section": section,
            "subsection": subsection
        }

    finish_url(self.conn, self.store_id, url)
    request = self.get_next_request()
    yield request
def collect_ounces(self, string):
    split = string.split(' - ')
    ounces = 0
    if len(split) == 1:
        print(f"No -'s found in {string} - not updating ounces")
    elif len(split) == 2:
        weight = split[1]
        ounces = convert_to_ounces(weight)
    elif len(split) == 3:
        quantity = split[1]
        weight = convert_to_ounces(split[2])
        quantity = clean_string(quantity, ["Count"])
        if quantity.isdigit():
            quantity = int(quantity)
        else:
            quantity = 1
        ounces = weight * quantity
    else:
        print(f"Collect_ounces too many '-'s in string {string}")
    return ounces
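# collect_ounces() expects titles of the form "<name> - <weight>" or
# "<name> - <N> Count - <weight>". Assuming convert_to_ounces("12 oz") returns 12 and
# clean_string("3 Count", ["Count"]) returns "3", a hypothetical trace would be:
#   collect_ounces("Brand Soup - 12 oz")            -> 12
#   collect_ounces("Brand Soup - 3 Count - 12 oz")  -> 3 * 12 = 36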
def get_market():
    #util.clean_market(0)
    url = 'http://pregao-online.bmfbovespa.com.br/Cotacoes.aspx'
    soup = BeautifulSoup(urlfetch.fetch(url, deadline=50).content, 'lxml')
    rate = util.get_exchange()
    #dt = get_datetime()
    dt = datetime.datetime.now(tz.tzstr('EBST3EBDT'))
    market = Market(ref=0,
                    date=dt.date(),
                    time=dt.time(),
                    exchange_rate=rate)
    market.put()
    table = soup('table',
                 attrs={'id': 'ctl00_DefaultContent_GrdCarteiraIndice'})[0]
    for tr in table('tr')[1:]:
        tds = tr('td')
        code = str(tds[0].string)
        name = util.clean_string(tds[1].string)
        value = util.get_float(tds[2].string)
        diff = util.get_float(tds[3].text.strip())
        stock = Stock(name=util.get_or_create_name(0, code, name),
                      value=value,
                      diff=diff,
                      market=market.key())
        stock.put()
def get_anken(self):
    fp = urllib2.urlopen(self.url)
    html = fp.read()
    fp.close()
    html = unicode(html, 'euc_jp', 'ignore')
    self.html = util.clean_string(html)
    # print(self.html)

    self.anken = dao_anken.ClassAnken()
    self.anken.nyusatsu_system = 1
    self.anken.nyusatsu_type = 1
    self.anken.anken_url = self.url
    self.anken.keishu_cd = self.keishu_cd
    self.anken.public_flag = self.public_flag
    self.anken.anken_no = self.get_anken_no()
    self.anken.anken_name = self.get_anken_name()
    self.anken.keishu_name = self.get_keishu_name()
    self.anken.company_area = self.get_company_area()
    self.anken.anken_open_date = self.get_anken_open_date()
    self.anken.anken_close_date = self.get_anken_close_date()
    self.anken.tender_date = self.get_tender_date()
    self.anken.tender_place = self.get_tender_place()
    self.anken.limit_date = self.get_limit_date()
    self.anken.gyoumu_kbn_1 = self.get_gyoumu_kbn_1()
    self.anken.gyoumu_kbn_2 = self.get_gyoumu_kbn_2()
    self.anken.kasitu_name = self.get_kasitu_name()
    self.anken.tanto_name = self.get_tanto_name()
    self.anken.notes = self.get_notes()
    self.anken.result_open_date = self.get_result_open_date()
    self.anken.result_close_date = self.get_result_close_date()
    self.anken.raku_name = self.get_raku_name()
    self.anken.price = self.get_price()
    self.anken.attached_file_1 = self.get_attached_file_1()
    self.anken.attached_file_2 = self.get_attached_file_2()
    self.anken.attached_file_3 = self.get_attached_file_3()
def check_nvidia(url, web_driver):
    web_driver.get(url)
    num = int(
        util.make_num(
            web_driver.find_element_by_xpath(
                '/html/body/app-root/product/div[1]/div[1]/div[2]/div/suggested-product/div/div'
            ).text))
    results = []

    name = web_driver.find_element_by_xpath(
        '//featured-product/div/div/div[2]/div[2]/h2').text
    dispo = web_driver.find_element_by_xpath(
        '//featured-product/div/div/div[2]/div[3]/div[1]/div[2]/a').text
    prix = util.make_num(
        web_driver.find_element_by_xpath(
            '//featured-product/div/div/div[2]/div[3]/div[1]/div[1]/div/span[1]'
        ).text)
    if dispo == "RUPTURE DE STOCK":
        dispo = "Rupture"
    results.append(
        ("FE " + util.clean_string(name), util.clean_string(dispo),
         util.clean_string(prix)))

    if num == None:
        num = 2
    for i in range(1, num):
        name = web_driver.find_element_by_xpath(
            f'//*[@id="resultsDiv"]/div/div[{i}]/div[2]/h2').text
        dispo = web_driver.find_element_by_xpath(
            f'//*[@id="resultsDiv"]/div/div[{i}]/div[3]/div[2]/div[2]/a').text
        prix = util.make_num(
            web_driver.find_element_by_xpath(
                f'//*[@id="resultsDiv"]/div/div[{i}]/div[3]/div[2]/div[1]/div/span[1]'
            ).text)
        if dispo == "RUPTURE DE STOCK":
            dispo = "Rupture"
        results.append(("FE " + util.clean_string(name),
                        util.clean_string(dispo), util.clean_string(prix)))
    return results
def get_sql(self):
    return util.clean_string(self.sql)
def clean_string(self, s):
    return clean_string(s)

def string_literal(self, s):
    return string_literal(s)
def load_data(path, file_ext=['txt'], valid_split=None, vocab_file_name=None,
              max_vocab_size=None, max_len_w=None, output_path=None,
              subset_pct=100):
    """
    Given a path where data are saved, look for the ones with the right extensions.
    If a split factor is given, it will split all the files into training and valid
    set. Then build vocabulary from the training and validation sets.

    Arguments:
        path: which directory to look for all the documents
        file_ext: what extension of the files to look for
        valid_split: to split the data into train/valid set. If None, no split
        vocab_file_name: optional file name. If None, the script will decide a name
                         given path and split
        max_vocab_size: maximum number of words to use in vocabulary (by most frequent)
        max_len_w: maximum length of sentences in words
        output_path: path used to save preprocessed data and results
        subset_pct: subset of dataset to load into H5 file (percentage)
    Returns:
        The function saves 2 files:
        h5 file with preprocessed data
        vocabulary file with: vocab, reverse_vocab, word_count
    """
    file_names = get_file_list(path, file_ext)

    file_str = get_file_str(path, len(file_names), labelled=False,
                            valid_split=valid_split, subset_pct=subset_pct)

    # create output dir if needed
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    # file name to store the vocabulary
    if vocab_file_name is None:
        vocab_file_name = file_str + '.vocab'
    vocab_file_name = os.path.join(output_path, vocab_file_name)

    # If max sizes arent set, assume no limit
    if not max_len_w:
        max_len_w = sys.maxsize
    if not max_vocab_size:
        max_vocab_size = sys.maxsize

    # file name to store the pre-processed train/valid dataset
    h5_file_name = os.path.join(output_path, file_str + '.h5')

    if os.path.exists(h5_file_name) and os.path.exists(vocab_file_name):
        neon_logger.display(
            "dataset files {} and vocabulary file {} already exist. "
            "will use cached data. ".format(h5_file_name, vocab_file_name))
        return h5_file_name, vocab_file_name

    # split into training/valid set
    if valid_split is not None:
        if 'json' in file_ext:
            # Split based on number of files
            train_split = int(np.ceil(len(file_names) * (1 - valid_split)))
            train_files = file_names[:train_split]
            valid_files = file_names[train_split:]

            train_sent = load_json_sent(train_files, subset_pct)
            valid_sent = load_json_sent(valid_files, subset_pct)
            all_sent = train_sent + valid_sent
        elif 'txt' in file_ext:
            # Split based on number of lines (since only 2 files)
            all_sent = load_txt_sent(file_names, subset_pct)
            train_split = int(np.ceil(len(all_sent) * (1 - valid_split)))

            train_sent = all_sent[:train_split]
            valid_sent = all_sent[train_split:]
        else:
            neon_logger.display(
                "Unsure how to load file_ext {}, please use 'json' or 'txt'.".
                format(file_ext))
    else:
        train_files = file_names
        if 'json' in file_ext:
            train_sent = load_json_sent(train_files, subset_pct)
        elif 'txt' in file_ext:
            train_sent = load_txt_sent(train_files, subset_pct)
        else:
            neon_logger.display(
                "Unsure how to load file_ext {}, please use 'json' or 'txt'.".
                format(file_ext))
        all_sent = train_sent

    if os.path.exists(vocab_file_name):
        neon_logger.display(
            "open existing vocab file: {}".format(vocab_file_name))
        vocab, rev_vocab, word_count = load_obj(vocab_file_name)
    else:
        neon_logger.display("Building vocab file")

        # build vocab
        word_count = defaultdict(int)
        for sent in all_sent:
            sent_words = tokenize(sent)

            if len(sent_words) > max_len_w or len(sent_words) == 0:
                continue

            for word in sent_words:
                word_count[word] += 1

        # sort the word_count, re-assign ids by its frequency.
        # Useful for downstream tasks. Only done for train vocab
        vocab_sorted = sorted(word_count.items(),
                              key=lambda kv: kv[1], reverse=True)

        vocab = OrderedDict()

        # get word count as array in same ordering as vocab (but with maximum length)
        word_count_ = np.zeros((len(word_count), ), dtype=np.int64)
        for i, t in enumerate(list(zip(*vocab_sorted))[0][:max_vocab_size]):
            word_count_[i] = word_count[t]
            vocab[t] = i
        word_count = word_count_

        # generate the reverse vocab
        rev_vocab = dict((wrd_id, wrd) for wrd, wrd_id in vocab.items())

        neon_logger.display("vocabulary from {} is saved into {}".format(
            path, vocab_file_name))
        save_obj((vocab, rev_vocab, word_count), vocab_file_name)

    vocab_size = len(vocab)
    neon_logger.display(
        "\nVocab size from the dataset is: {}".format(vocab_size))

    neon_logger.display(
        "\nProcessing and saving training data into {}".format(h5_file_name))

    # now process and save the train/valid data
    h5f = h5py.File(h5_file_name, 'w', libver='latest')
    shape, maxshape = (len(train_sent), ), (None)
    dt = np.dtype([('text', h5py.special_dtype(vlen=str)),
                   ('num_words', np.uint16)])
    report_text_train = h5f.create_dataset('report_train', shape=shape,
                                           maxshape=maxshape, dtype=dt,
                                           compression='gzip')
    report_train = h5f.create_dataset('train', shape=shape, maxshape=maxshape,
                                      dtype=h5py.special_dtype(vlen=np.int32),
                                      compression='gzip')

    # map text to integers
    wdata = np.zeros((1, ), dtype=dt)
    ntrain = 0
    for sent in train_sent:
        text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

        # enforce maximum sentence length
        if len(text_int) > max_len_w or len(text_int) == 0:
            continue

        report_train[ntrain] = text_int

        wdata['text'] = clean_string(sent)
        wdata['num_words'] = len(text_int)
        report_text_train[ntrain] = wdata
        ntrain += 1

    report_train.attrs['nsample'] = ntrain
    report_train.attrs['vocab_size'] = vocab_size
    report_text_train.attrs['nsample'] = ntrain
    report_text_train.attrs['vocab_size'] = vocab_size

    if valid_split:
        neon_logger.display(
            "\nProcessing and saving validation data into {}".format(
                h5_file_name))
        shape = (len(valid_sent), )
        report_text_valid = h5f.create_dataset('report_valid', shape=shape,
                                               maxshape=maxshape, dtype=dt,
                                               compression='gzip')
        report_valid = h5f.create_dataset(
            'valid', shape=shape, maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32), compression='gzip')

        nvalid = 0
        for sent in valid_sent:
            text_int = [
                -1 if t not in vocab else vocab[t] for t in tokenize(sent)
            ]

            # enforce maximum sentence length
            if len(text_int) > max_len_w or len(text_int) == 0:
                continue

            report_valid[nvalid] = text_int
            wdata['text'] = clean_string(sent)
            wdata['num_words'] = len(text_int)
            report_text_valid[nvalid] = wdata
            nvalid += 1

        report_valid.attrs['nsample'] = nvalid
        report_valid.attrs['vocab_size'] = vocab_size
        report_text_valid.attrs['nsample'] = nvalid
        report_text_valid.attrs['vocab_size'] = vocab_size

    h5f.close()

    return h5_file_name, vocab_file_name
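# A minimal usage sketch of load_data(); the directory names and parameter values
# below are illustrative assumptions, not paths from the original project.
if __name__ == '__main__':
    h5_file, vocab_file = load_data(
        './corpus',                 # directory scanned for input documents
        file_ext=['txt'],
        valid_split=0.1,            # hold out 10% of sentences for validation
        max_vocab_size=20000,       # keep the 20k most frequent tokens
        max_len_w=30,               # drop sentences longer than 30 words
        output_path='./preprocessed')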
words = 50
# for choosing out of n best word specify n
n_bestwords = 100
# number of original (!) ingredient samples for random test generation
m = 50

t = pickle.load(open('tokenizer.pickle', 'rb'))
reverse_word_map = dict(map(reversed, t.word_index.items()))
print(t.word_index)
model = load_model('model.hdf5')
max_list_predictors = model._layers[0].batch_input_shape[1]

string = 'Mehl, Eier, Milch Die Eier mit der Milch verrühren und'
print(
    clean_string(
        generate_equal(string, model, t, reverse_word_map, words,
                       max_list_predictors)))

for i in range(2, 15):
    string = generate_testcases('recipes.json', m, i)
    print('Test ingredients:', string)
    print('choose equal plus threshold:')
    print(
        clean_string(
            generate_equal(string, model, t, reverse_word_map, words,
                           max_list_predictors)))
    print('choose from ' + str(n_bestwords) + ' best:')
    print(
        clean_string(
            generate_choose_from_n_best(n_bestwords, string, model, t,
                                        reverse_word_map, words,
                                        # final argument assumed to mirror the
                                        # generate_equal() calls above
                                        max_list_predictors)))
def parse(self, response):
    url = response.url
    finish_url(self.conn, self.store_id, url)
    items = response.css('.cell-content-wrapper')

    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]

    #check if it has a next button,
    next_page = response.css('.pagination-next:not(.disabled)').get()
    if next_page is not None:
        #inspect_response(response,self)
        page_string = "?page="
        page_str_len = len(page_string)
        i = url.find(page_string)
        #if yes, check url if it has a page part on it
        if i == -1:
            #if no, add ?page=2 to it
            next_url = url + page_string + "2"
        else:
            #if yes, extract page and add 1
            page_number = i + page_str_len
            current_page = int(url[page_number:])
            next_page = current_page + 1
            next_url = url[:page_number] + str(next_page)
        #then add to self.urls
        store_url(self.conn, next_url, self.store_id,
                  lookup_category("", section, subsection), section,
                  subsection)

    for item in items:
        name = item.css('.cell-title-text ::text').get()
        name = clean_string(name, ['\"'])
        price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
        price = convert_dollars(price)

        quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()

        unit = item.css('.cell-product-size ::text').get()
        ounces = convert_to_ounces(unit)

        ppu = item.css('[data-test="per-unit-price"] ::text').get()
        ppu = convert_ppu(ppu)

        print(
            f"name - {name}, price - {price}, quantity - {quantity}, ounces - {ounces}, ppu - {ppu}, url - {url}, section - {section}, subsection - {subsection} "
        )
        #inspect_response(response,self)
        yield {
            "name": name,
            "price": price,
            "ounces": ounces,
            "unit": unit,
            "price-per-unit": ppu,
            "url": url,
            "section": section,
            "subsection": subsection
        }

    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("No more URLs to parse. Finishing")
        return
    request = self.create_parse_request(
        next_url, self.parse,
        EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))

    if next_url is not None:
        try:
            yield request
        except:
            print(f"Parse - Errored out processing request for - {next_url} ")
            next_url = get_next_url(self.cursor, 2)
            print(f"Parse - Now handling {next_url}")
            request = self.create_parse_request(
                next_url, self.parse,
                EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
            yield SeleniumRequest(
                url=next_url,
                callback=self.parse,
                wait_time=50,
                wait_until=EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '.button.full.cart.add')))
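# The pagination branch above rewrites the URL by hand: append "?page=2" when no page
# marker is present, otherwise increment the existing page number. The same intent as
# a small standalone helper (the name is hypothetical, for illustration only):
def next_page_url(url, page_string="?page="):
    i = url.find(page_string)
    if i == -1:
        return url + page_string + "2"
    start = i + len(page_string)
    return url[:start] + str(int(url[start:]) + 1)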
def drop_pv(self, name):
    self.db.execute("update pv set active='no' where name=%s" %
                    clean_string(name))
def _get_downloadable_from_url(video_url, resolution):
    global RESOLUTION
    global ERROR_MSG
    global HEADER

    # Resolve the m3u8 address
    start_time = time.time()
    while True:
        try:
            r = requests.get(video_url, headers=HEADER)
            break
        except requests.ConnectionError:
            if time.time() > start_time + CONNECTION_TIMEOUT:
                raise Exception(
                    "Unable to get video_url %s \nafter %s seconds of ConnectionErrors"
                    % (video_url, CONNECTION_TIMEOUT))
            else:
                time.sleep(1)

    video_html = etree.HTML(r.text)
    title = video_html.xpath('//span[@class="title1"]')[0].text  # e.g. 《命运的X号》剧场公演
    info = video_html.xpath(
        '//span[@class="title2"]')[0].text  # e.g. TeamX 剧场公演 2018.01.04

    # Build the file name
    fname = title
    if not fname.startswith("《"):
        fname = "《" + fname + "》"

    date_string = util.crush_time(info)[:10]  # if no date found, use info[:10] part
    fname = date_string + ' ' + fname  # e.g. 《48狼人杀》 20180202

    if "星梦Mini" in fname:
        fname = fname + ' ' + re.sub(
            '本期成员:', '',
            re.search(r'.*' + date_string[:4], info).group(0)[:-4])
    if "48狼人杀" in fname or "公演" in fname:
        fname = fname + ' ' + re.search(r'.*' + date_string[:4],
                                        info).group(0)[:-4]
    fname = util.clean_string(fname, 'filename')

    chao_url = video_html.xpath('//input[@id="chao_url"]/@value')[0]
    gao_url = video_html.xpath('//input[@id="gao_url"]/@value')[0]
    liuchang_url = video_html.xpath('//input[@id="liuchang_url"]/@value')[0]

    # Default: the ultra-HD (chaoqing) source
    RESOLUTION = resolution
    if RESOLUTION == 'chaoqing':
        if chao_url == "" or requests.get(
                chao_url, timeout=CONNECTION_TIMEOUT).text == "\n":
            print("未找到超清源,降低视频清晰度")
            RESOLUTION = "gaoqing"
            m3u8_url = gao_url
        else:
            m3u8_url = chao_url
    if RESOLUTION == 'gaoqing':
        if not gao_url or requests.get(gao_url,
                                       timeout=CONNECTION_TIMEOUT,
                                       headers=HEADER).text == "\n":
            print("未找到高清源,降低视频清晰度")
            RESOLUTION = "liuchang"
            m3u8_url = liuchang_url
        else:
            m3u8_url = gao_url
    if RESOLUTION == 'liuchang':
        if not liuchang_url or requests.get(liuchang_url,
                                            timeout=CONNECTION_TIMEOUT,
                                            headers=HEADER).text == "\n":
            print("未找到流畅源,skip current operation: %s" % title)
            return {}  # return empty object
        else:
            m3u8_url = liuchang_url

    # Parse the available video m3u8 on this page
    ts_list = _get_ts_from_m3u8(m3u8_url)

    print("已解析: %s" % fname)
    return {
        'title': title,
        'info': info,
        'fname': fname,
        'm3u8_url': m3u8_url,
        'site_url': video_url,
        'ts_list': ts_list
    }
# "Awards", "Notes") output = csv.DictWriter(f, headers) output.writeheader() events = soup.find("section", "section activities-block refined-search-container").find("div", {"id": "lpf-tabs2-a"}).find_all("article") for event in events: event_name = get_text_if_exists(event, "h5", {"class": "title"}) event_date = event.find("span", {"itemprop": "startDate"})["content"].split("T")[0] lat, long = [float(x) for x in event["data-geo-point"].split(',')] detail_url = 'http://www.active.com' + event.find("a", "ie-article-link" )["href"] types = get_text_if_exists(event, "h6", {"class":"secondary-text desc-info pull-left"}) detail_soup = BeautifulSoup(urlopen(detail_url).read()) event_day = get_text_if_exists(detail_soup.find("div", "visible-desktop"), "h5").split(",")[0] address_name = get_text_if_exists(detail_soup, "span", {"itemprop": "name"}) address = clean_string(get_text_if_exists(detail_soup, "span", {"itemprop": "address"}), utf8=True) notes = clean_string(get_text_if_exists(detail_soup, "div", {"itemprop": "description"}), utf8=True) prices = [] has_prices = detail_soup.find("div", "price-grid") if has_prices: name_prices = has_prices.find_all("div", "row price-row") for name_price in name_prices: event_type = get_text_if_exists(name_price, "h5", {"itemprop": "name"}) price = get_text_if_exists(name_price, "h5", {"itemprop": "Price"}) prices.append((event_type, price)) event_dict = {"Date": None, "Day": event_day, "Event Name": event_name, "url": detail_url, "Types": types, "Location": address,
def parse(self, response):
    url = response.url
    self.logger.info(f"Inside parse for {url}")

    GROCERY_SELECTOR = '[data-automation-id="productTile"]'
    SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
    GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR

    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]

    for grocery in response.css(GROCERIES_SELECTOR):
        NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
        name = grocery.css(NAME_SELECTOR).extract_first()

        # parse the ounces off of the name
        decimal_regex = "([\d]+[.]?[\d]*|[.\d]+)"
        ounces = re.findall(decimal_regex + "\s*o(?:z|unces?)", name,
                            re.IGNORECASE)
        pounds = re.findall(decimal_regex + "\s*(?:pound|lb)s?", name,
                            re.IGNORECASE)
        count = re.findall("([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", name,
                           re.IGNORECASE)
        self.ounce = ounces
        self.pounds = pounds
        self.count = count

        # Check if the arrays returned from re.findall are empty
        if ounces:
            ounces = parse_float(ounces[0])
        else:
            ounces = 0
        if pounds:
            pounds = parse_float(pounds[0])
        else:
            pounds = 0
        if count:
            count = parse_float(count[0])
        else:
            count = 0

        if pounds != 0:
            ounces = 16 * pounds
        elif count != 0:
            ounces *= count
        # inspect_response(response,self)

        SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
        PRICE_SELECTOR = '[data-automation-id="price"] ::text'
        PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'

        name = grocery.css(NAME_SELECTOR).extract_first()
        name = clean_string(name, "\"")
        ounces = ounces
        pounds = pounds
        count = count
        price = str(
            handle_none(
                grocery.css(SALEPRICE_SELECTOR).extract_first())).replace(
                    '$', '')
        ppu = convert_ppu(grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())

        yield {
            'name': name,
            'ounces': ounces,
            'pounds': pounds,
            'count': count,
            'price': price,
            'price-per-unit': ppu,
            'section': section,
            'subsection': subsection,
            'url': url,
        }

    finish_url(self.conn, self.store_id, url)

    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            filter="aisle=")
    print(f"next_url - {next_url}")
    if next_url is None:
        print("No more urls - finishing")
    else:
        request = create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '[aria-current="page"]')),
            meta_url=next_url)
        yield request
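# The three re.findall() patterns above pull a size out of the product name. A small
# self-contained demonstration on a made-up product title:
import re

decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
sample_name = "Great Value Chicken Breast - 3 Count - 2.5 lb"  # hypothetical title
print(re.findall(decimal_regex + r"\s*o(?:z|unces?)", sample_name, re.IGNORECASE))    # []
print(re.findall(decimal_regex + r"\s*(?:pound|lb)s?", sample_name, re.IGNORECASE))   # ['2.5']
print(re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", sample_name, re.IGNORECASE))  # ['3']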
def set_value(self, pv=None, **kws):
    v = [clean_string(i) for i in [pv.value, pv.char_value, time.time()]]
    v.append(pv.pvname)
    qval = "update cache set value=%s,cvalue=%s,ts=%s where pvname='%s'" % tuple(v)
    self.db.execute(qval)