def parse_date(self, test_string):
    """Parse *test_string* into a (start, end) date tuple.

    Strategies, in order:
      1. The instance's configured format via ``parse_date_string`` —
         a single date is returned as (date, date).
      2. ``daterangeparser`` for recognised date-range strings.
      3. A sliding-window scan that looks for a formatted date at the
         beginning and at the end of the string (covers ranges in
         formats daterangeparser does not know).

    Raises:
        ValueError: if no strategy can extract a date.
    """
    test_string = DataUtils.remove_excess_spaces(test_string)

    # First, try to parse the whole string according to the specified format.
    parsed_date = self.parse_date_string(test_string)
    if parsed_date is not None:
        return parsed_date, parsed_date

    try:
        # If that fails, try to parse the string as a date range.
        return daterangeparser.parse(test_string)
    except pyparsing.ParseException:
        # It may be a range in a format daterangeparser doesn't recognize.
        # Grow a prefix window and shrink a suffix window until each side
        # parses as a date on its own.
        test_start = len(test_string) - 1
        test_end = 0
        start = None
        end = None
        while test_end < len(test_string):
            if start is None:
                start = self.parse_date_string(test_string[0:test_end])
            if end is None:
                end = self.parse_date_string(
                    test_string[test_start:len(test_string)])
            if start is not None and end is not None:
                break
            test_start -= 1
            test_end += 1
        if start is None or end is None:
            raise ValueError('Could not parse date string: ' + test_string)
        return start, end
def extract_events_spacy(line, nlp):
    """Extract (start_date, date_text, description) events from *line*.

    Runs the spaCy pipeline over a cleaned copy of the line, then for
    every DATE entity walks up the dependency tree to the sentence root
    and assembles a short description from the root's salient
    dependents.  Entities whose text cannot be parsed as a date are
    skipped (logged at debug level).
    """
    line = p.sub('', line)
    # Strip characters that confuse the date parser — one C-level pass
    # instead of six chained .replace() calls (same deletions).
    line = line.translate(str.maketrans('', '', ",.\n\\/'"))
    events = []
    doc = nlp(line)
    for ent in filter(lambda e: e.label_ == 'DATE', doc.ents):
        try:
            (start, end) = parse(ent.text)
        except Exception as e:
            logging.debug(e)
            # Could not parse the date, hence ignore this entity.
            continue
        # Climb to the syntactic root of the sentence containing the date.
        current = ent.root
        while current.dep_ != 'ROOT':
            current = current.head
        desc = ' '.join(filter(None, [
            dep_subtree(current, 'nsubj'),
            dep_subtree(current, 'nsubjpass'),
            dep_subtree(current, 'auxpass'),
            dep_subtree(current, 'amod'),
            dep_subtree(current, 'det'),
            current.text,
            dep_subtree(current, 'acl'),
            dep_subtree(current, 'dobj'),
            dep_subtree(current, 'attr'),
            dep_subtree(current, 'advmod'),
        ]))
        # append() instead of quadratic `events = events + [...]`.
        events.append((start, ent.text, desc))
    return events
def process_item(self, item, spider):
    """Normalise the item's date range into 'start'/'end' fields.

    Parses item['date'] into a (start, end) pair; 'start' is always
    written as a formatted timestamp, while 'end' becomes the literal
    string "null" for single-date (open-ended) items.
    """
    start, end = parse(item['date'])
    item['start'] = start.strftime('%Y-%m-%d %H:%M:%S')
    # Identity check against None instead of `not end == None`.
    if end is not None:
        item['end'] = end.strftime('%Y-%m-%d %H:%M:%S')
    else:
        item['end'] = "null"
    return item
def findDate(eventelem):
    """Return an ISO date string found in the parenthesised text that
    follows *eventelem* (a parsed-HTML node), or '' when no parseable
    date is present.
    """
    # Default value when nothing matches or parsing fails.
    datestring = ''
    if eventelem.nextSibling:
        sibling_text = str(eventelem.nextSibling)
        # Match parenthesis content at the beginning (evaluate the regex
        # once instead of twice as the original did).
        paren_match = re.match(r'\s?\(([^)]+)\)?', sibling_text)
        if paren_match:
            paren = paren_match.group(1)
            # Match a date-looking substring: letters, optional dot, then
            # text that starts and ends with a digit.
            date_match = re.search(r'([a-zA-Z]+\.?\s?(\d).*(\d))', paren)
            if date_match:
                date = date_match.group(1)
                # Normalise en-dash to a plain hyphen for the parser.
                date = re.sub(u"\u2013", "-", date)
                try:
                    # Parse once and reuse the result (original parsed twice).
                    parsed = daterangeparser.parse(date)
                    if parsed:
                        datestring = str(parsed[0].date())
                except Exception:
                    # daterangeparser raises pyparsing errors on odd input;
                    # keep the empty default (narrowed from a bare except).
                    print("--Date Parse Exception--")
    return datestring
def validate(self, value):
    """Validate *value* as a date or date-range string.

    Tries a cascade of regexes (fully numeric d-m-y styles, two-part
    numeric dates, "Month DD, YYYY", "DD Month YYYY", bare YYYY).
    A single match is parsed with dateutil; multi-part matches are
    handed to the range parser (`parse`).  On success stores
    {"start": ..., "end": ...} in ``self.parsed_date`` (and the raw
    value in ``self.value_parsed``) and returns True; returns False on
    any failure.

    NOTE(review): Python-2-only code (`basestring`, `print` statement).
    """
    # Only strings can be validated.
    if (isinstance(value, basestring) is False):
        return False
    if re.match(r'[\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+', value):
        # Fully numeric date(s), e.g. 2018-01-02 or 02.01.2018.
        range_match = re.findall(
            r'([\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+)', value)
        if len(range_match) == 1:
            # Single date: start == end.
            self.parsed_date = {
                "start": range_match[0],
                "end": range_match[0]
            }
        else:
            # Two dates found: treat as an explicit range.
            self.parsed_date = {
                "start": range_match[0],
                "end": range_match[1]
            }
        return True
    elif re.match(
            r'(?:((?<![\d])[\d]{1,2})(?:-|_|\.|\/)([\d]+)|([\d]+)(?:-|_|\.|\/)([\d]{1,2})(?![\d]))',
            value):
        # Two-part numeric date (day-month or month-year).
        range_match = re.findall(r'([\d]+(?:-|_|\.|\/)[\d]+)', value)
    elif re.match(r'[a-zA-Z\.]+ [\d,]+ [\d]+', value):
        # "Month DD, YYYY" style.
        range_match = re.findall(r'([a-zA-Z\.]+ [\d,]+ [\d]+)', value)
    elif re.match(r'[\d]+ [a-zA-Z\.]+ [\d]+', value):
        # "DD Month YYYY" style.
        range_match = re.findall(r'([\d]+ [a-zA-Z\.]+ [\d]+)', value)
    elif re.match(r'[\d]{4}', value):
        range_match = [
            value, value
        ]  # Force pyparsing to handle any YYYY or YYYY-YYYY values as dateutil mangles them
    else:
        return False
    d = None
    if len(range_match) == 1:
        # Exactly one date found: parse it and use it for both ends.
        try:
            dx = dateutil.parser.parse(value)
        except ValueError:
            return False
        self.parsed_date = {"start": str(dx), "end": str(dx)}
    else:
        # Possible range: hand the raw value to the range parser.
        try:
            d = parse(value, allow_implicit=True)
        except ParseException as e:
            return False
        if d is None:
            #return ["Invalid date"]
            return False
        self.parsed_date = {"start": str(d[0]), "end": str(d[1])}
    self.value_parsed = value
    print self.parsed_date
    return True
def subpage(iurl):
    """Scrape an event subpage and return a 7-tuple:
    (sdate, edate, country, city, url, fburl, emurl).

    Fetches *iurl* with up to 6 attempts on connection errors (returning
    seven empty strings after that), then walks the <p> tags inside the
    page's "content_area" div, dispatching on each tag's <span> label.

    NOTE(review): Python-2-only code (`print` statements).  `country`,
    `city` and `url` are only bound when their labels appear, so the
    final return can raise NameError on pages missing those rows —
    confirm against real pages before relying on it.  `fburl`/`emurl`
    are always returned empty.
    """
    print "Opening ", iurl
    retry = 0
    # Retry loop: break on success, give up after 5 failed retries.
    while True:
        try:
            r = requests.get(iurl)
            break
        except requests.ConnectionError:
            print "FAILED", retry
            retry = retry + 1
            if retry > 5:
                return "", "", "", "", "", "", ""
            continue
    stew = BeautifulSoup(r.text, "lxml")
    td = stew.find("div", {"class": "content_area"})
    title = td.h1.contents
    tdr = td.find_all("p")
    print "number of p is ", len(tdr)
    fburl = ""
    emurl = ""
    sdate = ""
    edate = ""
    for t in tdr:
        # Only labelled paragraphs (those with a <span>) carry data.
        if not t.span:
            continue
        label = "".join(t.span.contents)
        print "Check label ", label
        if label == "More info:":
            url = t.a["href"]
        if label == "Location:":
            where = t.find_all("a")
            # Second link is the country, first is the city.
            country = "".join(where[1].contents)
            city = "".join(where[0].contents)
        if label == "Dates:":
            # Remove the label so get_text() yields only the date range.
            t.span.extract()
            when = t.get_text()
            sdate, edate = daterangeparser.parse(when)
            sdate = sdate.strftime("%Y-%m-%d")
            edate = edate.strftime("%Y-%m-%d")
            print "Time is ", when
            print "Start is ", sdate
            print "End is ", edate
    return sdate, edate, country, city, url, fburl, emurl
def parse_date(self, test_string):
    """Parse *test_string* into a (start, end) date tuple.

    Strategies, in order:
      1. The configured format via ``parse_date_string`` — a single
         date is returned as (date, date).
      2. Fuzzy parsing with ``timefhuman``, falling back to
         ``daterangeparser`` (the two libraries cover different cases).
      3. A sliding-window scan for a formatted date at the beginning
         and at the end of the string.

    Raises:
        ValueError: if no strategy can extract a date.
    """
    test_string = DataUtils.remove_excess_spaces(test_string)

    # First, try to parse the whole string according to the specified format.
    parsed_date = self.parse_date_string(test_string)
    if parsed_date is not None:
        return parsed_date, parsed_date

    # If that fails, try fuzzy matching (needed for weird formats or ranges).
    try:
        fuzzy_parsed = timefhuman(test_string)
    except Exception:
        # timefhuman's failure modes vary, so fall back broadly
        # (narrowed from a bare except so Ctrl-C still works).
        fuzzy_parsed = daterangeparser.parse(test_string)
    if len(fuzzy_parsed) == 1:
        return fuzzy_parsed, fuzzy_parsed
    elif len(fuzzy_parsed) == 2:
        return fuzzy_parsed

    # Otherwise it may be a range in a format neither library recognizes:
    # grow a prefix window and shrink a suffix window until each side
    # parses as a date on its own.
    test_start = len(test_string) - 1
    test_end = 0
    start = None
    end = None
    while test_end < len(test_string):
        if start is None:
            start = self.parse_date_string(test_string[0:test_end])
        if end is None:
            end = self.parse_date_string(
                test_string[test_start:len(test_string)])
        if start is not None and end is not None:
            break
        test_start -= 1
        test_end += 1
    if start is None or end is None:
        raise ValueError('Could not parse date string: ' + test_string)
    return start, end
def validate(self, value):
    """Validate *value* as a date or date-range string.

    Tries a cascade of regexes (fully numeric d-m-y styles, two-part
    numeric dates, "Month DD, YYYY", "DD Month YYYY", bare YYYY).
    A single match is parsed with dateutil; multi-part matches are
    handed to the range parser (`parse`).  On success stores
    {"start": ..., "end": ...} in ``self.parsed_date`` (and the raw
    value in ``self.value_parsed``) and returns True; returns False on
    any failure.

    NOTE(review): Python-2-only code (`basestring`, `print` statement).
    """
    # Only strings can be validated.
    if(isinstance(value, basestring) is False):
        return False
    if re.match(r'[\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+', value):
        # Fully numeric date(s), e.g. 2018-01-02 or 02.01.2018.
        range_match = re.findall(r'([\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+)', value)
        if len(range_match) == 1:
            # Single date: start == end.
            self.parsed_date = {"start": range_match[0], "end": range_match[0]}
        else:
            # Two dates found: treat as an explicit range.
            self.parsed_date = {"start": range_match[0], "end": range_match[1]}
        return True
    elif re.match(r'(?:((?<![\d])[\d]{1,2})(?:-|_|\.|\/)([\d]+)|([\d]+)(?:-|_|\.|\/)([\d]{1,2})(?![\d]))', value):
        # Two-part numeric date (day-month or month-year).
        range_match = re.findall(r'([\d]+(?:-|_|\.|\/)[\d]+)', value)
    elif re.match(r'[a-zA-Z\.]+ [\d,]+ [\d]+', value):
        # "Month DD, YYYY" style.
        range_match = re.findall(r'([a-zA-Z\.]+ [\d,]+ [\d]+)', value)
    elif re.match(r'[\d]+ [a-zA-Z\.]+ [\d]+', value):
        # "DD Month YYYY" style.
        range_match = re.findall(r'([\d]+ [a-zA-Z\.]+ [\d]+)', value)
    elif re.match(r'[\d]{4}', value):
        range_match = [value, value]  # Force pyparsing to handle any YYYY or YYYY-YYYY values as dateutil mangles them
    else:
        return False
    d = None
    if len(range_match) == 1:
        # Exactly one date found: parse it and use it for both ends.
        try:
            dx = dateutil.parser.parse(value)
        except ValueError:
            return False
        self.parsed_date = {"start": str(dx), "end": str(dx)}
    else:
        # Possible range: hand the raw value to the range parser.
        try:
            d = parse(value, allow_implicit=True)
        except ParseException as e:
            return False
        if d is None:
            #return ["Invalid date"]
            return False
        self.parsed_date = {"start": str(d[0]), "end": str(d[1])}
    self.value_parsed = value
    print self.parsed_date
    return True
def get_calendar_table(url, fixfun=None):
    """Scrape calendar info from *url*.

    Returns a dict mapping semester name -> {row label -> parsed date
    range}.  *fixfun*, when given, is applied to every parsed range so
    callers can repair bad dates; it defaults to the identity.

    Raises:
        requests.HTTPError: if the page cannot be fetched.
    """
    if fixfun is None:
        # Plain def instead of an assigned lambda (PEP 8 E731).
        def fixfun(rg):
            return rg
    res = requests.get(url)
    res.raise_for_status()
    calendar_page = bs4.BeautifulSoup(res.text, "lxml")
    head, body = extract_table_parts(calendar_page)

    # Parse the table into a dict of dicts, one per semester column;
    # the first column holds row labels, not a semester.
    colnames = [col.getText() for col in head.find_all("th")]
    colcnt = len(colnames)
    semesters = {s: {} for s in colnames[1:]}
    for row in body.find_all("tr"):
        for i, cell in enumerate(row.find_all("td")):
            if i == 0:
                # Row label, ASCII-normalised via unidecode.
                key = unidecode(cell.getText())
            elif i < colcnt:
                # Strip comments the site sometimes embeds in <strong>.
                stupid = cell.find("strong")
                if stupid:
                    stupid.extract()
                dates = unidecode(cell.getText()).strip(whitespace)
                if dates != "":
                    daterange = parse(dates)
                    semesters[colnames[i]][key] = fixfun(daterange)
    return semesters
# NOTE(review): fragment of a larger scraping script — `soup`, `a`, `b`,
# `dt`, `parse`, `start` and `end` are defined earlier, outside this
# chunk.  Indentation below is reconstructed from a whitespace-mangled
# source; confirm against the full file.
try:
    # Third child of the expanded financial column — presumably the
    # value paired with label `a` / text `b`; verify against the page.
    c = soup.find_all("div", {"class": "financial_data"})[0].find_all(
        "div", {"class": "row"})[0].find_all(
            "div", {"class": "col_2 expand"})[0].find_all()[2].get_text()
except:
    pass
if a == 'Time':
    Time = b + ' ' + c
    print(b, c)
    try:
        # Re-format each side of a "YYYY-MM-DD - YYYY-MM-DD" range as
        # "DD Mon YYYY" so the range parser can handle it.
        test = ' - '.join([
            dt.strptime(i, '%Y-%m-%d').strftime('%d %b %Y')
            for i in c.split(' - ')
        ])
        s, e = parse(test)
        start = s.strftime('%Y-%m-%d')
        end = e.strftime('%Y-%m-%d')
    except:
        pass
print(start, end)
#financials
financials = soup.find_all("div", {"class": "financial_data"})[0].find_all(
    "div", {"class": "data_row"})
#for row in financials:
#    row.find_all("div",{"class":"col_2"})[0].prettify()
#    row.find_all("div",{"class":"col_2"})[1].prettify()
def scrape_link_details(driver, link):
    """Opens a link to a listing and scrapes all of the pertinent details.

    Returns 1) the number of sales made by the shop, 2) the number of this
    item currently in people's baskets, 3) the description of the item,
    4) the average number of days between today and when the item arrives,
    5) the cost of delivery, 6) whether returns are accepted, 7) the country
    where the item is dispatched from, and 8) how many images the listing has.

    Each field is scraped inside its own try/except so a missing element
    falls back to a default rather than aborting the whole listing.
    NOTE(review): the except clause only catches
    requests.exceptions.RequestException, but the body uses selenium —
    selenium errors would propagate; confirm this is intended.
    """
    # Up to 3 attempts; the for/else raises if all of them fail.
    for i in range(3):
        try:
            # Brief random pause to look less bot-like.
            random_sleep_link = random.uniform(5, 7)
            time.sleep(random_sleep_link)
            windows_before = driver.current_window_handle
            # Open the listing in a new window and switch to it.
            driver.execute_script("window.open('" + link + "');")
            print('opened window')
            windows_after = driver.window_handles
            new_window = [x for x in windows_after if x != windows_before][0]
            print('got new deets')
            driver.switch_to.window(new_window)
            # Wait until the page's search bar is present (page loaded).
            loaded = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, "gnav-search")))
            # Shop sales count, e.g. "1234 sales" -> "1234".
            try:
                sales = loaded.find_elements_by_xpath(
                    "//div[starts-with(@class, 'wt-display-inline-flex-xs wt-align-items-center')]/a/span[1]"
                )
                s = sales[0].text
                num_sales = s.split(" ")[0]
            except:
                num_sales = 0
            # "In N baskets" banner: keep the last digit-run found.
            try:
                basket = loaded.find_elements_by_xpath(
                    "//p[@class='wt-position-relative wt-text-caption']")
                x = basket[0].text
                y = [int(i) for i in x.split() if i.isdigit()]
                for i in y:
                    num_basket = i
            except:
                num_basket = 0
            # Item description from the page's <meta name="description">.
            try:
                description = loaded.find_element_by_xpath(
                    "//meta[@name='description']")
                descriptions = description.get_attribute("content")
            except:
                descriptions = np.nan
            # Arrival estimate: midpoint of the parsed date range vs today.
            try:
                arrival = loaded.find_element_by_xpath(
                    "//*[@id='shipping-variant-div']/div/div[2]/div[1]/div/div[1]/p"
                )
                arrival_range = arrival.text
                start, end = parse(arrival_range)
                average = start + (end - start) / 2
                today = datetime.date.today()
                diff = average.date() - today
                days_to_arrival = diff.days
            except:
                days_to_arrival = np.nan
            # Delivery cost: 0 for 'Free', else first money-like number.
            try:
                delivery = loaded.find_element_by_xpath(
                    "//*[contains(text(), 'Cost to deliver')]/following-sibling::p"
                ).text
                if delivery == 'Free':
                    cost_delivery = 0
                else:
                    match = re.search(r'\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})',
                                      delivery).group(0)
                    cost_delivery = float(match)
            except:
                cost_delivery = np.nan
            # Returns accepted if the 'Accepted' element exists at all.
            try:
                loaded.find_element_by_xpath(
                    "//*[contains(text(), 'Accepted')]")
                returns_accepted = 1
            except:
                returns_accepted = 0
            # Dispatch country: drop the leading "Dispatched from" words.
            try:
                dispatch = loaded.find_element_by_xpath(
                    "//*[@id='shipping-variant-div']/div/div[2]/div[7]").text
                d_split = dispatch.split(" ")[2:]
                d_join = " ".join(d_split)
                dispatch_from = d_join
            except:
                dispatch_from = np.nan
            # Image count from the carousel pagination list.
            try:
                images = loaded.find_element_by_xpath(
                    "//ul[starts-with(@class, 'wt-list-unstyled wt-display-flex-xs')]"
                )
                i_list = images.find_elements_by_xpath(
                    "//li[@class='wt-mr-xs-1 wt-mb-xs-1 wt-bg-gray wt-flex-shrink-xs-0 wt-rounded carousel-pagination-item-v2']"
                )
                count_images = len(i_list)
            except:
                count_images = 1
            driver.close()  # close the window
            driver.switch_to.window(
                windows_before)  # switch_to the parent_window_handle
            print('switched')
        except requests.exceptions.RequestException:  #if anything weird happens...#
            random_sleep_except = random.uniform(240, 360)
            print("I've encountered an error! I'll pause for" +
                  str(random_sleep_except / 60) +
                  " minutes and try again \n")
            time.sleep(
                random_sleep_except)  #sleep the script for x seconds and....#
            continue  #...start the loop again from the beginning#
        else:  #if the try-part works...#
            break  #...break out of the loop#
        # NOTE(review): unreachable — every path above continues or breaks;
        # position reconstructed from a whitespace-mangled source.
        print('broke out of the loop')
    else:  #if x amount of retries on the try-part don't work...#
        raise Exception("Something really went wrong here... I'm sorry."
                        )  #...raise an exception and stop the script#
    return num_sales, num_basket, descriptions, days_to_arrival, cost_delivery, returns_accepted, dispatch_from, count_images
def subpage_data(url):
    """Scrape an ICO detail page and return its key fields.

    Returns a list: [price, goal, date, start, end, token, hype, risk,
    roi, icorate, domain].  Every field falls back to '-' (or '' for
    the date fields) when the page section is missing or malformed, so
    the caller always gets a fully populated row.
    """
    price = '-'
    goal = '-'
    date = ''
    token = '-'
    start = ''
    end = ''
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")

    # Token price, e.g. "ICO Token Price: 1 XYZ = 0.10 USD" -> "0.10".
    try:
        pr = soup.findAll('div', class_='col-12 col-md-6')[0].findAll('li')[2].text
        if 'ICO Token Price:' in pr:
            price = pr.split()[6]
    except Exception:
        price = '-'

    # Fundraising goal (text before the parenthesised part).
    try:
        text = soup.find('div', class_='ico-right-col')
        goal = clean(
            text.find('div', class_="goal").text.split("(")[0].strip())
    except Exception:
        goal = '-'

    # Ticker symbol, e.g. "Ticker: XYZ".
    try:
        tk = soup.findAll('div', class_='col-12 col-md-6')[0].findAll('li')[0].text
        if 'Ticker:' in tk:
            token = tk.split()[1]
    except Exception:
        token = '-'

    # Hype / risk / ROI come from the first three rating widgets.
    try:
        text = soup.find_all("div", {"class": "rating-item"})[0]
        hype = text.find('p', class_="rate").text.strip()
    except Exception:
        hype = '-'
    try:
        text = soup.find_all("div", {"class": "rating-item"})[1]
        risk = text.find('p', class_="rate").text.strip()
    except Exception:
        risk = '-'
    try:
        text = soup.find_all("div", {"class": "rating-item"})[2]
        roi = text.find('p', class_="rate").text.strip()
    except Exception:
        roi = '-'
    try:
        text = soup.find('div', class_='rating-result')
        icorate = text.find('p', class_="ico-rate").text.strip()
    except Exception:
        icorate = '-'

    # Website domain behind the call-to-action button.
    try:
        text = soup.find("div", {"class": "button"}).parent['href']
        spltAr = text.split("://")
        i = (0, 1)[len(spltAr) > 1]
        domain = spltAr[i].split("?")[0].split('/')[0].split(':')[0].lower()
        # BUG FIX: the original called re.sub(r'www/.', '', spltAr) on the
        # *list* spltAr, which raised TypeError (swallowed by the bare
        # except) so domain was always '-'.  Strip a leading 'www.' from
        # the extracted domain string instead.
        domain = re.sub(r'^www\.', '', domain)
    except Exception:
        domain = '-'

    # Token-sale date range from the "Token Sale: ..." heading, with the
    # site's boilerplate phrases stripped out.
    try:
        text = soup.find_all(
            "div", {"class": "col-12 title-h4"})[0].findAll('h4')[0].text.strip()
        text2 = re.sub(r'[\t\n\r]*', '', text)
        text3 = re.sub(r'Token Sale: ', '', text2)
        text4 = re.sub(r'\(.*\)', '', text3)
        text5 = re.sub(r'since', '', text4)
        text6 = re.sub(r'Market & Returns', '', text5)
        date = re.sub(r'period isn\'t set', '', text6)
    except Exception:
        date = ''
    try:
        s, e = parse(date)
        start = s.strftime('%Y-%m-%d')
        end = e.strftime('%Y-%m-%d')
    except Exception:
        # Unparseable or missing range: leave start/end empty.
        pass

    return [price, goal, date, start, end, token, hype, risk, roi,
            icorate, domain]
'td')).find_all('td')[0].text.encode('utf-8').strip() '''if b'date:' in th.lower(): date = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8') elif b'time:' in th.lower(): time = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8') el''' if b'location:' in th.lower(): location = parsed.replace(b'\n', b'').replace(b'\r', b'').decode('utf-8') elif b'\xc2' in th.lower(): description = parsed.decode('utf-8') if any(s in description.lower() for s in strings): print(title + " on " + date + " at " + time + " at " + location) print(description) print(str(date + " at " + time)) start, end = parse(str(date + " at " + time)) print("Start = " + start) print("End = " + end) data = { "Date": date, "Time": time, "Description": description, "Location": location, "Title": title } #uncomment to push to db #db.child("events").push(data);