import re

def preprocess_tokens(tokens):
    '''Remove extra whitespace, non-letter characters, stray quotes, and empty strings.'''
    # Collapse newlines and other whitespace
    remove_new_lines = [re.sub(r'\s+', '', token) for token in tokens]
    # Remove non-letter characters
    non_letters = [re.sub('[^a-zA-Z]', '', remove_new_line) for remove_new_line in remove_new_lines]
    # Remove distracting single quotes
    remove_quotes = [re.sub(r"\'", '', non_letter) for non_letter in non_letters]
    # Remove empty strings from the list
    final = list(filter(None, remove_quotes))
    return final
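# Minimal usage sketch (the sample tokens below are hypothetical): whitespace,
# digits, and quote characters are stripped, and tokens left empty are dropped.
sample_tokens = ["Hello\n", "world!", "''", "42"]
print(preprocess_tokens(sample_tokens))  # -> ['Hello', 'world']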
def history(self) -> Union[dict, None]:
    "Gets the package tracking history."
    if isinstance(self._history, bs4types.Tag):
        history = self._history.contents.copy()
        string = []
        self._history = {'events': []}
        for x in history:
            if isinstance(x, bs4types.Tag):
                if x.name == 'hr':
                    # An <hr> marks the end of one event; join the collected text
                    event = re.sub(r'[\t\n\r]*', '', ' '.join(string))
                    event = event.strip().split(' ')
                    self._history['events'].append({
                        'date': event[0],
                        'status': event[1],
                        'location': event[2] if len(event) >= 3 else None
                    })
                    string.clear()
                else:
                    string.append(x.get_text().replace('\xa0', ' '))
    return self._history
def product_info(self):
    "Gets the postal product name from the first list item."
    if isinstance(self._product_info, bs4types.Tag):
        html = self._product_info.find('li').get_text()
        self._product_info = {
            'postal_product': re.sub(r'[\t\n\r]*', '', html).split(':')[1]
        }
    return self._product_info
def status_last_updated(self) -> Union[str, None]:
    "Gets the date of the most recent status update"
    if isinstance(self._status_last_updated, bs4types.Tag):
        element = self._status_last_updated.find('p')
        child = list(element.children)[0]
        text = ' '.join(re.sub(r'[\t\r\n]*', '', child).split()).strip()
        self._status_last_updated = text if text else None
    return self._status_last_updated
def get_zones_dict(df):
    """Scrape timeanddate.com to map each timezone code found in the DataFrame
    to an 'Etc/GMT...' offset string.

    `columns` (the list of timezone-code column names) and `rq` (requests) are
    expected to be defined at module level."""
    selected = df[columns]
    # Stack all timezone-code columns into a single Series (legacy Series.append)
    temp_df = df["gate_arrival_actual_timezone_code"]
    for i in range(1, len(columns)):
        temp_df = temp_df.append(selected[columns[i]], ignore_index=True)
    zones = [z.lower() for z in temp_df.unique() if "GMT" not in z]
    zones_dict = {}
    for zone in zones:
        url = 'https://www.timeanddate.com/time/zones/' + zone
        page = rq.get(url)
        content = page.content
        soup = BeautifulSoup(content, 'html.parser')
        scraped_zone = soup.find_all("ul", {"class": "clear"})
        if len(scraped_zone) > 0:
            # Pull the "UTC +N" / "UTC -N" offset out of the scraped list
            p = re.compile(r'UTC [+-][0-9]{1,2}\b')
            search = p.search(scraped_zone[0].text)
            group = search.group(0)
            result = re.sub(r'[\s]', '', group)
            zones_dict[zone] = result.replace("UTC", "Etc/GMT")
    return zones_dict
def search_ozbargain():
    # We need to reiterate that suburb is global.. for some reason
    global suburb
    # Find 2 or 3 letters in the title surrounded by square brackets
    # regex_search = "(?=\[[A-Z]{2,3}\])"
    # The link to the deals page
    url = requests.get("https://www.ozbargain.com.au/deals").text
    # Open the deals page with BeautifulSoup
    soup = BeautifulSoup(url, "html.parser")
    # Find all deal posts with a title class
    response = soup.findAll('h2', class_="title")
    # Create a list to store all of the titles in case there is more than 1 current deal
    title = []
    for title_text in response:
        # If the words "7-Eleven" and "Fuel" are both in the title, add it to our list
        # Adding "Fuel" should get rid of other deals posted
        if "7-Eleven" in title_text.text and "Fuel" in title_text.text:
            title.append(title_text.text)
    # Set suburb to None in case we don't find one later
    suburb = None
    # Search for the store location in the title of each deal
    for i in title:
        # Split after 7-Eleven to grab the store name; pass IGNORECASE via the
        # flags keyword so it isn't treated as the maxsplit argument
        title_search = re.split("@ 7-Eleven", i, flags=re.IGNORECASE)
        try:
            # Keep only '#', '@', digits, letters and spaces in the suburb text
            suburb = re.sub("[^#@0-9A-Za-z ]+", "", title_search[1]).strip()
        except IndexError:
            # The title had nothing after "@ 7-Eleven"; leave suburb unchanged
            pass
    return suburb
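# Minimal sketch of the suburb-extraction step in isolation; the deal title
# below is made up. IGNORECASE is passed via the flags keyword so it is not
# mistaken for re.split's maxsplit argument.
import re

sample_title = "[VIC] Free Coffee @ 7-Eleven Richmond"
parts = re.split(r"@ 7-Eleven", sample_title, flags=re.IGNORECASE)
print(re.sub(r"[^#@0-9A-Za-z ]+", "", parts[1]).strip())  # -> Richmond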
# q1, `no` (a value-parsing helper) and `cursor` are defined earlier in the script (not shown here)
q2 = newx.findAll("span", {"class": "m-metadata__content"}, recursive=False)
for i in range(0, len(q1)):
    if 'learn' in q1[i].get_text().lower() and 'free' in q2[i].get_text().lower():
        free = 1
    if 'duration' in q1[i].get_text().lower():
        duration = no(q2[i].get_text().lower())
    if 'upgrade' in q1[i].get_text().lower().strip():
        cost = no(q2[i].get_text().lower())
    if 'weekly study' in q1[i].get_text().lower():
        weekly_study = q2[i].get_text().lower().strip()
    print(re.sub(r'\n+', "", q1[i].get_text()), ":", re.sub(r'\n+', "", q2[i].get_text()))

# print("\n\n", course_name, course_by, course_description, link, tag, free, weekly_study, cost, duration, "\n\n")

try:
    start_date = dt = parser.parse(
        newx.find("time", {"itemprop": "startDate"}).get_text())
except Exception:
    start_date = 0

try:
    # Prepare SQL query to INSERT a record into the database.
    sql = ("INSERT INTO course(course_name, institute, description, link, tag, free, "
           "weekly_study, cost, duration, website, search, start_date) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    # Execute the SQL command
    cursor.execute(
def get_views(soup):
    "Return the numeric view count from the 'watch-view-count' element as an int."
    raw_string = soup.find("div", {"class": "watch-view-count"}).text
    return int(re.sub(r"[^0-9]", "", raw_string))
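# Minimal usage sketch with an inline HTML snippet (hypothetical markup that
# mirrors the class name get_views expects).
from bs4 import BeautifulSoup

sample_html = '<div class="watch-view-count">1,234,567 views</div>'
print(get_views(BeautifulSoup(sample_html, 'html.parser')))  # -> 1234567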
def get_openTime(date_string):
    ##### day
    final_time_list = []
    string_lower = date_string.lower()

    if "7 day" in string_lower or "everyday" in string_lower or "7day" in string_lower:
        # Open every day of the week
        for i in range(1, 8):
            final_time_list.append({"weekday": str(i), "start_time": "", "end_time": ""})
        string_lower = string_lower.replace("7 day", "")
        string_lower = string_lower.replace("7day", "")
        string_lower = string_lower.replace("everyday", "")
    elif "-" in string_lower or "to" in string_lower:
        # A range of weekdays, e.g. "mon to fri"
        toggle = 0
        start = ""
        end = ""
        weekday_list = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]
        dic_weekday = {"mon": 1, "tue": 2, "wed": 3, "thu": 4, "fri": 5, "sat": 6, "sun": 7}
        for a in re.sub("[^a-z0-9.:]", " ", string_lower).split(" "):
            for i in weekday_list:
                if i in a:
                    if toggle == 0:
                        start = dic_weekday[i]
                        toggle = 1
                        string_lower = string_lower.replace(i, "")
                    else:
                        end = dic_weekday[i]
                        string_lower = string_lower.replace(i, "")
        if start < end:
            for i in range(start, end + 1):
                final_time_list.append({"weekday": i, "start_time": "", "end_time": ""})
        elif end < start:
            # Range wraps past Sunday, e.g. "sat to wed"
            for i in range(start, 8):
                final_time_list.append({"weekday": i, "start_time": "", "end_time": ""})
            for i in range(1, end + 1):
                final_time_list.append({"weekday": i, "start_time": "", "end_time": ""})
    else:
        # Individual day names listed
        if "mon" in re.sub("[^a-z0-9]", "", string_lower):
            final_time_list.append({"weekday": 1, "start_time": "", "end_time": ""})
        if "tue" in re.sub("[^a-z0-9]", "", string_lower):
            final_time_list.append({"weekday": 2, "start_time": "", "end_time": ""})
        if "wed" in re.sub("[^a-z0-9]", "", string_lower):
            final_time_list.append({"weekday": 3, "start_time": "", "end_time": ""})
        if "thu" in re.sub("[^a-z0-9]", "", string_lower):
            final_time_list.append({"weekday": 4, "start_time": "", "end_time": ""})
        if "fri" in re.sub("[^a-z0-9]", "", string_lower):
            final_time_list.append({"weekday": 5, "start_time": "", "end_time": ""})
        if "sat" in re.sub("[^a-z0-9]", "", string_lower):
            final_time_list.append({"weekday": 6, "start_time": "", "end_time": ""})
        if "sun" in re.sub("[^a-z0-9]", "", string_lower):
            final_time_list.append({"weekday": 7, "start_time": "", "end_time": ""})

    ###### time
    date_string = string_lower.replace(":00", "-").replace(".00", "-")
    time_list = [s for s in re.sub("[^0-9:.]", "-", date_string).split("-") if s != ""]
    start_time = 0
    end_time = 0
    toggle = 0
    for i in time_list:
        try:
            i = int(i)
        except ValueError:
            continue
        if i > 7 and toggle == 0:
            start_time = i
            toggle = 1
        if i < 4 and toggle == 0:
            # Small leading number is assumed to be a pm start, e.g. "2 - 10"
            start_time = i + 12
            toggle = 1
        elif i < 12 and toggle != 0 and i < start_time:
            end_time = i + 12
        elif i <= 12 and toggle != 0:
            end_time = i
    if start_time > end_time:
        start_time, end_time = end_time, start_time
    if start_time < 12:
        start_time = str(start_time) + "am"
    else:
        if start_time == 12:
            start_time = str(start_time) + "pm"
        else:
            start_time = str(start_time - 12) + "pm"
    if end_time < 12:
        end_time = str(end_time) + "am"
    else:
        if end_time == 12:
            end_time = str(end_time) + "pm"
        else:
            end_time = str(end_time - 12) + "pm"

    return_time = []
    for i in final_time_list:
        i["start_time"] = start_time
        i["end_time"] = end_time
        return_time.append(i)
    return return_time
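# Minimal usage sketch with a made-up opening-hours string; one entry is
# produced per weekday in the parsed range.
print(get_openTime("Mon to Fri 9:00 - 5:00"))
# -> [{'weekday': 1, 'start_time': '9am', 'end_time': '5pm'}, ... through weekday 5]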