コード例 #1
0
def preprocess_tokens(tokens):
    """Reduce each token to its ASCII letters and drop tokens left empty.

    Whitespace (including newlines), digits, punctuation and quote
    characters are all non-letters, so a single substitution covers every
    clean-up step.

    Args:
        tokens: Iterable of strings.

    Returns:
        A list of the cleaned tokens, in their original order, without the
        ones that became empty strings.
    """
    # Raw string: avoids the invalid-escape warning a plain '\s' triggers.
    non_letters = re.compile(r'[^a-zA-Z]+')
    cleaned = (non_letters.sub('', token) for token in tokens)
    return [token for token in cleaned if token]
コード例 #2
0
    def history(self) -> Tuple[dict, None]:
        """Return the package tracking history, parsing it on first access.

        While ``self._history`` is still a ``bs4types.Tag`` its children are
        walked once: text of non-``hr`` tags is accumulated, and every ``hr``
        tag closes one event.  The accumulated text is split on double
        spaces into ``date``, ``status`` and an optional ``location``.  The
        parsed ``{'events': [...]}`` dict replaces the tag on the instance,
        so later calls return it directly.  Any other stored value is
        returned unchanged.

        Text collected after the last ``hr`` is not turned into an event.
        """
        # NOTE(review): the annotation reads as a 2-tuple; an optional dict
        # is presumably what was meant - confirm before changing it.
        if not isinstance(self._history, bs4types.Tag):
            return self._history

        nodes = self._history.contents.copy()
        self._history = {'events': []}
        pending = []
        for node in nodes:
            if not isinstance(node, bs4types.Tag):
                continue
            if node.name != 'hr':
                # Non-breaking spaces become plain spaces so the
                # double-space split below can see them.
                pending.append(node.get_text().replace('\xa0', ' '))
                continue

            flattened = re.sub(r'[\t\n\r]*', '', ' '.join(pending))
            fields = flattened.strip().split('  ')
            location = fields[2] if len(fields) >= 3 else None
            self._history['events'].append({
                'date': fields[0],
                'status': fields[1],
                'location': location,
            })
            pending.clear()
        return self._history
コード例 #3
0
    def product_info(self):
        """Return the product information, parsing it on first access.

        While ``self._product_info`` is still a ``bs4types.Tag``, the text of
        its first ``li`` element is stripped of tabs and line breaks, and the
        part after the first ``:`` is stored under ``'postal_product'``.  The
        resulting dict replaces the tag on the instance.  Any other stored
        value is returned unchanged.
        """
        if not isinstance(self._product_info, bs4types.Tag):
            return self._product_info

        item_text = self._product_info.find('li').get_text()
        flattened = re.sub(r'[\t\n\r]*', '', item_text)
        self._product_info = {'postal_product': flattened.split(':')[1]}
        return self._product_info
コード例 #4
0
    def status_last_updated(self) -> Tuple[str, None]:
        """Return the date of the most recent status update.

        While ``self._status_last_updated`` is still a ``bs4types.Tag``, the
        first child of its first ``p`` element is cleaned: tabs and line
        breaks are removed, then runs of whitespace are collapsed to single
        spaces.  The cleaned text (or ``None`` when nothing is left) replaces
        the tag on the instance.  Any other stored value is returned
        unchanged.
        """
        if not isinstance(self._status_last_updated, bs4types.Tag):
            return self._status_last_updated

        paragraph = self._status_last_updated.find('p')
        first_child = list(paragraph.children)[0]
        without_breaks = re.sub(r'[\t\r\n]*', '', first_child)
        collapsed = ' '.join(without_breaks.split()).strip()
        self._status_last_updated = collapsed or None
        return self._status_last_updated
コード例 #5
0
 def get_zones_dict(df):
     """Map time-zone abbreviations found in ``df`` to ``Etc/GMT±N`` names.

     The abbreviations are gathered from the
     ``gate_arrival_actual_timezone_code`` column plus every column named in
     the module-level ``columns`` sequence after its first entry.  Values
     containing ``"GMT"`` are skipped.  Each remaining abbreviation is
     lower-cased and looked up on timeanddate.com, and the first
     ``UTC +N`` / ``UTC -N`` text found in the page's ``ul.clear`` element is
     rewritten as ``Etc/GMT+N`` / ``Etc/GMT-N``.

     Args:
         df: DataFrame holding the time-zone code columns.

     Returns:
         dict mapping lower-cased abbreviation to ``Etc/GMT±N`` string.
         Abbreviations whose page has no matching element or no UTC offset
         text are left out.
     """
     # Local import keeps the block self-contained whatever alias the file
     # uses for pandas.
     import pandas as pd

     selected = df[columns]
     # Series.append was removed in pandas 2.0; concat is the replacement
     # and also works on older versions.
     series_parts = [df["gate_arrival_actual_timezone_code"]]
     series_parts.extend(selected[name] for name in columns[1:])
     all_codes = pd.concat(series_parts, ignore_index=True)
     zones = [z.lower() for z in all_codes.unique() if "GMT" not in z]

     # Compiled once instead of once per scraped page.
     offset_pattern = re.compile(r'UTC [+-][0-9]{1,2}\b')
     zones_dict = {}
     for zone in zones:
         url = 'https://www.timeanddate.com/time/zones/' + zone
         page = rq.get(url)
         soup = BeautifulSoup(page.content, 'html.parser')
         scraped_zone = soup.find_all("ul", {"class": "clear"})
         if not scraped_zone:
             continue
         match = offset_pattern.search(scraped_zone[0].text)
         if match is None:
             # Page exists but shows no "UTC ±N" text; skip it rather than
             # crash on ``None.group``.
             continue
         offset = re.sub(r'\s', '', match.group(0))
         # NOTE(review): tz database "Etc/GMT+N" names use the reversed sign
         # (Etc/GMT+5 is UTC-5).  The sign is copied unchanged here, as
         # before - confirm this is what consumers expect.
         zones_dict[zone] = offset.replace("UTC", "Etc/GMT")
     return zones_dict
コード例 #6
0
def search_ozbargain():
    """Scrape OzBargain for a 7-Eleven fuel deal and return its store suburb.

    Downloads the deals page, keeps the ``h2.title`` headings whose text
    contains both ``"7-Eleven"`` and ``"Fuel"``, and takes the text that
    follows ``"@ 7-Eleven"`` (matched case-insensitively) as the suburb,
    with punctuation removed.  When several deals match, the last one wins.
    Matching titles without an ``"@ 7-Eleven"`` part are ignored.

    Side effects:
        Rebinds the module-level ``suburb`` to the returned value.

    Returns:
        The suburb string, or ``None`` when no suitable deal title is found.
    """
    global suburb

    page_html = requests.get("https://www.ozbargain.com.au/deals").text
    soup = BeautifulSoup(page_html, "html.parser")

    # Both words must be tested explicitly: `"7-Eleven" and "Fuel" in text`
    # only ever checked for "Fuel".
    fuel_titles = [
        heading.text
        for heading in soup.findAll('h2', class_="title")
        if "7-Eleven" in heading.text and "Fuel" in heading.text
    ]

    # Stays None when no title yields a store name.
    suburb = None
    for deal_title in fuel_titles:
        # flags= must be passed by keyword; the third positional argument of
        # re.split is maxsplit, so the flag was silently ignored before.
        parts = re.split("@ 7-Eleven", deal_title, flags=re.IGNORECASE)
        if len(parts) < 2:
            # No store marker in this title.
            continue
        suburb = re.sub("[^#@0-9A-Za-z ]+", "", parts[1]).strip()

    return suburb
コード例 #7
0
            q2 = newx.findAll("span", {"class": "m-metadata__content"},
                              recursion=False)
            for i in range(0, len(q1)):
                if ('learn' in (q1[i].get_text()).lower()
                        and 'free' in q2[i].get_text().lower()):
                    free = 1
                if 'duration' in ((q1[i].get_text()).lower()):
                    duration = no(q2[i].get_text().lower())

                if 'upgrade' in (q1[i].get_text()).lower().strip():
                    cost = no(q2[i].get_text().lower())

                if 'weekly study' in ((q1[i].get_text()).lower()):
                    weekly_study = q2[i].get_text().lower().strip()

                print(re.sub('\n+', "", q1[i].get_text()), ":",
                      re.sub('\n+', "", q2[i].get_text()))
            #print("\n\n",course_name,course_by,course_description,link,tag,free,weekly_study,cost,duration,"\n\n")
            try:
                start_date = dt = parser.parse(
                    newx.find("time", {
                        "itemprop": "startDate"
                    }).get_text())
            except:
                start_date = 0

        try:
            # Prepare SQL query to INSERT a record into the database.
            sql = "INSERT INTO course(course_name,institute, description, link,tag,free,weekly_study,cost,duration,website,search,start_date) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            # Execute the SQL command
            cursor.execute(
コード例 #8
0
def get_views(soup):
    """Return the view count shown in the page as an integer.

    The text of the first ``div`` with class ``watch-view-count`` is
    stripped of every character that is not an ASCII digit, and the
    remaining digits are converted with ``int``.
    """
    counter = soup.find("div", {"class": "watch-view-count"})
    digits_only = re.sub(r"[^0-9]", "", counter.text)
    return int(digits_only)
コード例 #9
0
# Three-letter weekday prefixes mapped to ISO-style day numbers (Mon=1).
_WEEKDAY_NUMBERS = {
    "mon": 1,
    "tue": 2,
    "wed": 3,
    "thu": 4,
    "fri": 5,
    "sat": 6,
    "sun": 7,
}


def _format_hour(hour):
    """Render a 24-hour integer as a 12-hour label such as '9am' or '5pm'."""
    if hour < 12:
        return str(hour) + "am"
    if hour == 12:
        return "12pm"
    return str(hour - 12) + "pm"


def get_openTime(date_string):
    """Parse a free-text opening-hours string into per-weekday entries.

    Weekdays are detected in one of three ways:

    * "7 day", "7day" or "everyday" selects all seven days (their
      ``weekday`` values are the strings "1".."7");
    * otherwise, if the text contains "-" or "to", the first weekday name
      found starts a range and the last one found ends it (wrapping past
      Sunday when needed); a single weekday name selects just that day;
    * otherwise every weekday name present is selected individually.

    In the last two cases ``weekday`` is an int (Mon=1 .. Sun=7).

    Opening and closing hours are then guessed from the remaining digits
    and applied to every selected day.

    Args:
        date_string: Free-text description, e.g. ``"Mon-Fri 9-5"``.

    Returns:
        A list of ``{"weekday", "start_time", "end_time"}`` dicts; empty
        when no weekday could be identified.
    """
    string_lower = date_string.lower()
    final_time_list = []

    if "7 day" in string_lower or "everyday" in string_lower or "7day" in string_lower:
        final_time_list = [
            {"weekday": str(day), "start_time": "", "end_time": ""}
            for day in range(1, 8)
        ]
        # Strip the markers so their digits are not read as hours below.
        for marker in ("7 day", "7day", "everyday"):
            string_lower = string_lower.replace(marker, "")

    elif "-" in string_lower or "to" in string_lower:
        start = None
        end = None
        words = re.sub("[^a-z0-9.:]", " ", string_lower).split(" ")
        for word in words:
            for name, number in _WEEKDAY_NUMBERS.items():
                if name not in word:
                    continue
                if start is None:
                    start = number
                else:
                    # Later matches overwrite: the last weekday ends the range.
                    end = number
                string_lower = string_lower.replace(name, "")

        if start is not None:
            if end is None:
                # Only one weekday named (e.g. "mon 9-5", where the dash
                # belongs to the hours): treat it as a one-day range instead
                # of comparing an int with an empty placeholder.
                end = start
            if start <= end:
                day_numbers = list(range(start, end + 1))
            else:
                # Range wraps over the weekend, e.g. Fri-Mon.
                day_numbers = list(range(start, 8)) + list(range(1, end + 1))
            final_time_list = [
                {"weekday": day, "start_time": "", "end_time": ""}
                for day in day_numbers
            ]

    else:
        compact = re.sub("[^a-z0-9]", "", string_lower)
        final_time_list = [
            {"weekday": number, "start_time": "", "end_time": ""}
            for name, number in _WEEKDAY_NUMBERS.items()
            if name in compact
        ]

    # ---- hours ----
    # ":00" / ".00" are turned into separators so "9:00" is read as 9.
    date_string = string_lower.replace(":00", "-").replace(".00", "-")
    time_list = [s for s in re.sub("[^0-9:.]", "-", date_string).split("-") if s != ""]
    start_time = 0
    end_time = 0
    have_start = False

    for piece in time_list:
        try:
            hour = int(piece)
        except ValueError:
            # Pieces such as "9:30" or "." are not whole hours; ignore them.
            continue
        # Opening hour: the first number above 7 is taken as-is.
        if hour > 7 and not have_start:
            start_time = hour
            have_start = True
        # The chain below hangs off this second `if`, so the iteration that
        # just set the start via the branch above also falls through and
        # seeds end_time with the same hour when it is <= 12.
        if hour < 4 and not have_start:
            # A first number below 4 is taken as an afternoon opening.
            start_time = hour + 12
            have_start = True
        elif hour < 12 and have_start and hour < start_time:
            # Smaller than the opening hour: assume pm.
            end_time = hour + 12
        elif hour <= 12 and have_start:
            end_time = hour

    if start_time > end_time:
        start_time, end_time = end_time, start_time

    start_label = _format_hour(start_time)
    end_label = _format_hour(end_time)
    for entry in final_time_list:
        entry["start_time"] = start_label
        entry["end_time"] = end_label

    return final_time_list