Example #1
    def parse_date(self, test_string):
        test_string = DataUtils.remove_excess_spaces(test_string)
        # First, try to parse the date according to the specified format
        parsed_date = self.parse_date_string(test_string)
        if parsed_date is not None:
            return parsed_date, parsed_date
        try:
            # If that fails, try to parse the date as a date range string
            return daterangeparser.parse(test_string)

        except pyparsing.ParseException:
            # If that fails, it may be a date range in a format that daterangeparser doesn't recognize
            # Check if the string contains two formatted dates by checking the beginning and end substrings
            # until it finds two strings formatted like dates
            test_start = len(test_string) - 1
            test_end = 0
            start = None
            end = None
            while test_end < len(test_string):
                if start is None:
                    start = self.parse_date_string(test_string[0:test_end])
                if end is None:
                    end = self.parse_date_string(
                        test_string[test_start:len(test_string)])

                if start is not None and end is not None:
                    break

                test_start -= 1
                test_end += 1

            if start is None or end is None:
                raise ValueError('Could not parse date string: ' + test_string)

            return start, end
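
For context, daterangeparser.parse returns a (start, end) pair of datetime objects, and per the library's documentation end is None when the input is a single date rather than a range; that is why the code above returns the parsed date twice, to keep a uniform (start, end) shape. A minimal sketch:

import daterangeparser

start, end = daterangeparser.parse('14th-16th June 2010')
print(start, end)   # 2010-06-14 00:00:00 2010-06-16 00:00:00

start, end = daterangeparser.parse('14th June 2010')
print(end)          # None, single dates come back without an end
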
Example #2
def extract_events_spacy(line, nlp):
    # p is assumed to be a precompiled cleanup regex defined elsewhere in the module
    line = p.sub('', line)
    line = line.replace(',', '').replace('.', '').replace('\n', '').replace('\\', '').replace('/', '').replace('\'', '')
    events = []
    doc = nlp(line)
    for ent in filter(lambda e: e.label_ == 'DATE', list(doc.ents)):
        try:
            (start, end) = parse(ent.text)
        except Exception as e:
            logging.debug(e)
            # could not parse the date, hence ignore it
            continue
        current = ent.root
        while current.dep_ != 'ROOT':
            current = current.head
        desc = ' '.join(filter(None, [
            dep_subtree(current, 'nsubj'),
            dep_subtree(current, 'nsubjpass'),
            dep_subtree(current, 'auxpass'),
            dep_subtree(current, 'amod'),
            dep_subtree(current, 'det'),
            current.text,
            dep_subtree(current, 'acl'),
            dep_subtree(current, 'dobj'),
            dep_subtree(current, 'attr'),
            dep_subtree(current, 'advmod'),
            ]))
        events.append((start, ent.text, desc))
    return events
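
dep_subtree is not shown in this example. A minimal sketch of what it presumably does: return the text of the subtree under the first child of token with the given dependency label, or None when there is no such child, so the filter(None, ...) call above can drop missing pieces.

def dep_subtree(token, dep):
    # Return the subtree text for the first child with dependency `dep`.
    for child in token.children:
        if child.dep_ == dep:
            return ' '.join(t.text for t in child.subtree)
    return None
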
Example #3
    def process_item(self, item, spider):
        start, end = parse(item['date'])
        item['start'] = start.strftime('%Y-%m-%d %H:%M:%S')
        if end is not None:
            item['end'] = end.strftime('%Y-%m-%d %H:%M:%S')
        else:
            item['end'] = "null"

        return item
Example #4
def findDate(eventelem):
    #default value
    datestring = ''
    if eventelem.nextSibling:
        #match parenthesis content at the beginning
        paren_match = re.match(r'\s?\(([^)]+)\)?', str(eventelem.nextSibling))
        if paren_match:
            paren = paren_match.group(1)

            #match date substring
            date_match = re.search(r'([a-zA-Z]+\.?\s?(\d).*(\d))', paren)
            if date_match:
                date = date_match.group(1)
                #get rid of bad dash
                date = re.sub(u"\u2013", "-", date)
                try:
                    parsed = daterangeparser.parse(date)
                    if parsed:
                        datestring = str(parsed[0].date())
                except Exception:
                    print("--Date Parse Exception--")
    return datestring
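
To make the two regexes concrete, here is how they pull a date out of a hypothetical sibling string; both the input and the printed output are illustrative only:

import re

sibling = ' (Jan. 5 - Mar. 7, 2021) trailing text'
paren = re.match(r'\s?\(([^)]+)\)?', sibling).group(1)
date = re.search(r'([a-zA-Z]+\.?\s?(\d).*(\d))', paren).group(1)
print(date)  # Jan. 5 - Mar. 7, 2021
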
Example #5
    def validate(self, value):
        if not isinstance(value, basestring):
            return False
        if re.match(r'[\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+', value):
            range_match = re.findall(
                r'([\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+)', value)
            if len(range_match) == 1:
                self.parsed_date = {
                    "start": range_match[0],
                    "end": range_match[0]
                }
            else:
                self.parsed_date = {
                    "start": range_match[0],
                    "end": range_match[1]
                }
            return True
        elif re.match(
                r'(?:((?<![\d])[\d]{1,2})(?:-|_|\.|\/)([\d]+)|([\d]+)(?:-|_|\.|\/)([\d]{1,2})(?![\d]))',
                value):
            range_match = re.findall(r'([\d]+(?:-|_|\.|\/)[\d]+)', value)
        elif re.match(r'[a-zA-Z\.]+ [\d,]+ [\d]+', value):
            range_match = re.findall(r'([a-zA-Z\.]+ [\d,]+ [\d]+)', value)
        elif re.match(r'[\d]+ [a-zA-Z\.]+ [\d]+', value):
            range_match = re.findall(r'([\d]+ [a-zA-Z\.]+ [\d]+)', value)
        elif re.match(r'[\d]{4}', value):
            range_match = [
                value, value
            ]  # Force pyparsing to handle any YYYY or YYYY-YYYY values as dateutil mangles them

        else:
            return False
        d = None
        if len(range_match) == 1:
            try:
                dx = dateutil.parser.parse(value)
            except ValueError:
                return False
            self.parsed_date = {"start": str(dx), "end": str(dx)}
        else:
            try:
                d = parse(value, allow_implicit=True)
            except ParseException as e:
                return False
            if d is None:
                #return ["Invalid date"]
                return False
            self.parsed_date = {"start": str(d[0]), "end": str(d[1])}

        self.value_parsed = value
        print self.parsed_date
        return True
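
To illustrate the first branch, the fully-delimited pattern finds every date-like token in the value, and the code pairs the first two up as start and end (hypothetical input):

import re

pattern = r'([\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+)'
print(re.findall(pattern, '2020/01/02 to 2020/01/05'))
# ['2020/01/02', '2020/01/05'] -> used as the start/end pair
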
Example #6
def subpage(iurl):
    print "Opening ", iurl
    retry = 0
    while True:
        try:
            r = requests.get(iurl)
            break
        except requests.ConnectionError:
            print "FAILED", retry
            retry = retry + 1
            if retry > 5:
                return "", "", "", "", "", "", ""
            continue
    stew = BeautifulSoup(r.text, "lxml")
    td = stew.find("div", {"class": "content_area"})
    title = td.h1.contents
    tdr = td.find_all("p")
    print "number of p is ", len(tdr)
    fburl = ""
    emurl = ""
    sdate = ""
    edate = ""

    for t in tdr:
        if not t.span:
            continue
        label = "".join(t.span.contents)
        print "Check label ", label
        if label == "More info:":
            url = t.a["href"]
        if label == "Location:":
            where = t.find_all("a")
            country = "".join(where[1].contents)
            city = "".join(where[0].contents)
        if label == "Dates:":
            t.span.extract()
            when = t.get_text()
            sdate, edate = daterangeparser.parse(when)
            sdate = sdate.strftime("%Y-%m-%d")
            edate = edate.strftime("%Y-%m-%d")

            print "Time is ", when
            print "Start is ", sdate
            print "End is ", edate
    return sdate, edate, country, city, url, fburl, emurl
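
The Dates branch relies on BeautifulSoup's extract() removing the label span from the tree before get_text() is called; a minimal illustration with hypothetical markup:

from bs4 import BeautifulSoup

p = BeautifulSoup('<p><span>Dates:</span> 3 - 7 June 2019</p>', 'lxml').p
p.span.extract()                 # remove the 'Dates:' label
print(p.get_text().strip())      # 3 - 7 June 2019
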
Example #7
    def parse_date(self, test_string):
        test_string = DataUtils.remove_excess_spaces(test_string)
        # First, try to parse the date according to the specified format
        parsed_date = self.parse_date_string(test_string)
        if parsed_date is not None:
            return parsed_date, parsed_date
        # If that fails, try to parse the date with fuzzy matching (needed for weird formats or date ranges)
        # timefhuman and daterangeparser are both libraries to do this, but they each support different cases
        try:
            fuzzy_parsed = timefhuman(test_string)
        except Exception:
            fuzzy_parsed = daterangeparser.parse(test_string)
        if len(fuzzy_parsed) == 1:
            return fuzzy_parsed, fuzzy_parsed
        elif len(fuzzy_parsed) == 2:
            return fuzzy_parsed

        # If that fails, it may be a date range in a format that daterangeparser doesn't recognize
        # Check if the string contains two formatted dates by checking the beginning and end substrings
        # until it finds two strings formatted like dates
        test_start = len(test_string) - 1
        test_end = 0
        start = None
        end = None
        while test_end < len(test_string):
            if start is None:
                start = self.parse_date_string(test_string[0:test_end])
            if end is None:
                end = self.parse_date_string(
                    test_string[test_start:len(test_string)])

            if start is not None and end is not None:
                break

            test_start -= 1
            test_end += 1

        if start is None or end is None:
            raise ValueError('Could not parse date string: ' + test_string)

        return start, end
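
The timefhuman fallback is worth a sketch: in the library versions this code appears to target, a single date parses to one datetime while a range parses to a pair, which is what the len() checks above assume. The input string here is hypothetical:

from timefhuman import timefhuman

# A range typically comes back as a (start, end) pair of datetimes.
print(timefhuman('July 17, 2018 at 3 p.m. - 4 p.m.'))
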
Example #8
    def validate(self, value):
        if not isinstance(value, basestring):
            return False
        if re.match(r'[\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+', value):
            range_match = re.findall(r'([\d]+(?:-|_|\.|\/)[\d]+(?:-|_|\.|\/)[\d]+)', value)
            if len(range_match) == 1:
                self.parsed_date = {"start": range_match[0], "end": range_match[0]}
            else:
                self.parsed_date = {"start": range_match[0], "end": range_match[1]}
            return True
        elif re.match(r'(?:((?<![\d])[\d]{1,2})(?:-|_|\.|\/)([\d]+)|([\d]+)(?:-|_|\.|\/)([\d]{1,2})(?![\d]))', value):
            range_match = re.findall(r'([\d]+(?:-|_|\.|\/)[\d]+)', value)
        elif re.match(r'[a-zA-Z\.]+ [\d,]+ [\d]+', value):
            range_match = re.findall(r'([a-zA-Z\.]+ [\d,]+ [\d]+)', value)
        elif re.match(r'[\d]+ [a-zA-Z\.]+ [\d]+', value):
            range_match = re.findall(r'([\d]+ [a-zA-Z\.]+ [\d]+)', value)
        elif re.match(r'[\d]{4}', value):
            range_match = [value, value] # Force pyparsing to handle any YYYY or YYYY-YYYY values as dateutil mangles them

        else:
            return False
        d = None
        if len(range_match) == 1:
            try:
                dx = dateutil.parser.parse(value)
            except ValueError:
                return False
            self.parsed_date = {"start": str(dx), "end": str(dx)}
        else:
            try:
                d = parse(value, allow_implicit=True)
            except ParseException as e:
                return False
            if d is None:
                #return ["Invalid date"]
                return False
            self.parsed_date = {"start": str(d[0]), "end": str(d[1])}

        self.value_parsed = value
        print self.parsed_date
        return True
Example #9
def get_calendar_table(url, fixfun=None):
    """
    Scraps calendar info from the given url.  You can pass a function that
    fixes bad date ranges.
    """

    if fixfun is None:
        fixfun = lambda rg: rg

    res = requests.get(url)
    res.raise_for_status()
    calendar_page = bs4.BeautifulSoup(res.text, "lxml")

    head, body = extract_table_parts(calendar_page)

    # Parse the table into dict of dicts, one for each semester:
    colnames = [col.getText() for col in head.find_all("th")]
    colcnt = len(colnames)

    semesters = {s: {} for s in colnames[1:]}

    for row in body.find_all("tr"):
        for i, cell in enumerate(row.find_all("td")):
            if i == 0:
                #key = unicodedata.normalize("NFKD",cell.getText())
                key = unidecode(cell.getText())
            elif i < colcnt:
                #dates = unicodedata.normalize("NFKD",cell.getText()).strip(" ")
                # Strip stupid comments they sometimes put in there
                stupid = cell.find("strong")
                if stupid:
                    _ = stupid.extract()
                dates = unidecode(cell.getText()).strip(whitespace)
                if dates != "":
                    daterange = parse(dates)
                    semesters[colnames[i]][key] = fixfun(daterange)

    return semesters
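
As an example of a fixfun, here is a hypothetical hook for the common failure where a range spanning the year boundary parses with its end before its start; the URL is illustrative only:

def fix_year_rollover(rg):
    start, end = rg
    if end is not None and end < start:
        # Assume the range rolled over into the next year.
        end = end.replace(year=end.year + 1)
    return start, end

semesters = get_calendar_table('https://example.edu/calendar',
                               fixfun=fix_year_rollover)
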
Example #10
    try:
        c = soup.find_all("div", {"class": "financial_data"})[0].find_all(
            "div", {"class": "row"})[0].find_all(
                "div", {"class": "col_2 expand"})[0].find_all()[2].get_text()
    except:
        pass

    if a == 'Time':
        Time = b + '  ' + c

    print(b, c)
    try:
        test = ' - '.join([
            dt.strptime(i, '%Y-%m-%d').strftime('%d %b %Y')
            for i in c.split(' - ')
        ])
        s, e = parse(test)
        start = s.strftime('%Y-%m-%d')
        end = e.strftime('%Y-%m-%d')
    except:
        pass
    print(start, end)

    #financials

    financials = soup.find_all("div", {"class": "financial_data"})[0].find_all(
        "div", {"class": "data_row"})

    #for row in financials:
    #   row.find_all("div",{"class":"col_2"})[0].prettify()
    #   row.find_all("div",{"class":"col_2"})[1].prettify()
Example #11
def scrape_link_details(driver, link):
    """Opens a link to a listing and scrapes all of the pertinent details. Returns 1) the number of sales made by the shop, 2) the number of this item currently in people's baskets, 3) the description of the item, 4) the average number of days between today and when the item arrives, 5) the cost of delivery, 6) whether returns are accepted, 7) the country where the item is dispatched from, and 8) how many images the listing has.   
    """
    for i in range(3):
        try:
            random_sleep_link = random.uniform(5, 7)
            time.sleep(random_sleep_link)
            windows_before = driver.current_window_handle
            driver.execute_script("window.open('" + link + "');")
            print('opened window')
            windows_after = driver.window_handles
            new_window = [x for x in windows_after if x != windows_before][0]
            print('got new deets')
            driver.switch_to.window(new_window)
            loaded = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, "gnav-search")))

            try:
                sales = loaded.find_elements_by_xpath(
                    "//div[starts-with(@class, 'wt-display-inline-flex-xs wt-align-items-center')]/a/span[1]"
                )
                s = sales[0].text
                num_sales = s.split(" ")[0]
            except:
                num_sales = 0

            try:
                basket = loaded.find_elements_by_xpath(
                    "//p[@class='wt-position-relative wt-text-caption']")
                x = basket[0].text
                y = [int(i) for i in x.split() if i.isdigit()]
                for i in y:
                    num_basket = i
            except:
                num_basket = 0

            try:
                description = loaded.find_element_by_xpath(
                    "//meta[@name='description']")
                descriptions = description.get_attribute("content")
            except:
                descriptions = np.nan

            try:
                arrival = loaded.find_element_by_xpath(
                    "//*[@id='shipping-variant-div']/div/div[2]/div[1]/div/div[1]/p"
                )
                arrival_range = arrival.text
                start, end = parse(arrival_range)
                average = start + (end - start) / 2
                today = datetime.date.today()
                diff = average.date() - today
                days_to_arrival = diff.days
            except:
                days_to_arrival = np.nan

            try:
                delivery = loaded.find_element_by_xpath(
                    "//*[contains(text(), 'Cost to deliver')]/following-sibling::p"
                ).text
                if delivery == 'Free':
                    cost_delivery = 0
                else:
                    match = re.search(r'\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})',
                                      delivery).group(0)
                    cost_delivery = float(match)
            except:
                cost_delivery = np.nan

            try:
                loaded.find_element_by_xpath(
                    "//*[contains(text(), 'Accepted')]")
                returns_accepted = 1
            except:
                returns_accepted = 0

            try:
                dispatch = loaded.find_element_by_xpath(
                    "//*[@id='shipping-variant-div']/div/div[2]/div[7]").text
                d_split = dispatch.split(" ")[2:]
                d_join = " ".join(d_split)
                dispatch_from = d_join
            except:
                dispatch_from = np.nan

            try:
                images = loaded.find_element_by_xpath(
                    "//ul[starts-with(@class, 'wt-list-unstyled wt-display-flex-xs')]"
                )
                i_list = images.find_elements_by_xpath(
                    "//li[@class='wt-mr-xs-1 wt-mb-xs-1 wt-bg-gray wt-flex-shrink-xs-0 wt-rounded carousel-pagination-item-v2']"
                )
                count_images = len(i_list)
            except:
                count_images = 1
            driver.close()  # close the window
            driver.switch_to.window(
                windows_before)  # switch_to the parent_window_handle
            print('switched')

        except requests.exceptions.RequestException:  #if anything weird happens...#
            random_sleep_except = random.uniform(240, 360)
            print("I've encountered an error! I'll pause for" +
                  str(random_sleep_except / 60) + " minutes and try again \n")
            time.sleep(
                random_sleep_except)  #sleep the script for x seconds and....#
            continue  #...start the loop again from the beginning#

        else:  #if the try-part works...#
            print('broke out of the loop')
            break  #...break out of the loop#

    else:  #if x amount of retries on the try-part don't work...#
        raise Exception("Something really went wrong here... I'm sorry."
                        )  #...raise an exception and stop the script#

    return num_sales, num_basket, descriptions, days_to_arrival, cost_delivery, returns_accepted, dispatch_from, count_images
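
The arrival-window midpoint computed above works standalone as well; a minimal sketch with a hypothetical range string:

import datetime
from daterangeparser import parse

start, end = parse('14th-16th June 2025')
average = start + (end - start) / 2       # midpoint of the range
days_to_arrival = (average.date() - datetime.date.today()).days
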
Example #12
def subpage_data(url):
    price = '-'
    goal = '-'
    date = ''
    token = '-'
    start = ''
    end = ''
    html = requests.get(url).content
    #print(html)
    soup = BeautifulSoup(html, "lxml")
    #print(soup)
    try:
        pr = soup.findAll('div',
                          class_='col-12 col-md-6')[0].findAll('li')[2].text
        if 'ICO Token Price:' in pr:
            splitted = pr.split()
            price = splitted[6]
        else:
            price = '-'
    except:
        price = '-'

    try:
        text = soup.find('div', class_='ico-right-col')
        goal = clean(
            text.find('div', class_="goal").text.split("(")[0].strip())
    except:
        goal = '-'

    try:
        tk = soup.findAll('div',
                          class_='col-12 col-md-6')[0].findAll('li')[0].text
        if 'Ticker:' in tk:
            splitted = tk.split()
            token = splitted[1]
        else:
            token = '-'
    except:
        token = '-'

    try:
        text = soup.find_all("div", {"class": "rating-item"})[0]
        hype = text.find('p', class_="rate").text.strip()
    except:
        hype = '-'

    try:
        text = soup.find_all("div", {"class": "rating-item"})[1]
        risk = text.find('p', class_="rate").text.strip()
    except:
        risk = '-'

    try:
        text = soup.find_all("div", {"class": "rating-item"})[2]
        roi = text.find('p', class_="rate").text.strip()
    except:
        roi = '-'

    try:
        text = soup.find('div', class_='rating-result')
        icorate = text.find('p', class_="ico-rate").text.strip()

    except:
        icorate = '-'

    try:
        text = soup.find("div", {"class": "button"}).parent['href']
        spltAr = text.split("://")
        spltAr = [re.sub(r'www\.', '', part) for part in spltAr]
        i = 1 if len(spltAr) > 1 else 0
        domain = spltAr[i].split("?")[0].split('/')[0].split(':')[0].lower()

    except:
        domain = '-'

    try:
        text = soup.find_all(
            "div",
            {"class": "col-12 title-h4"})[0].findAll('h4')[0].text.strip()
        text2 = re.sub(r'[\t\n\r]*', '', text)
        text3 = re.sub(r'Token Sale: ', '', text2)
        text4 = re.sub(r'\(.*\)', '', text3)
        text5 = re.sub(r'since', '', text4)
        text6 = re.sub(r'Market & Returns', '', text5)
        date = re.sub(r'period isn\'t set', '', text6)

    except:
        date = ''

    try:
        s, e = parse(date)
        start = s.strftime('%Y-%m-%d')
        end = e.strftime('%Y-%m-%d')
    except:
        pass

    return [
        price, goal, date, start, end, token, hype, risk, roi, icorate, domain
    ]
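
The chain of re.sub calls that isolates the date heading can be collapsed into a single alternation; a sketch against a hypothetical heading string:

import re

text = 'Token Sale: Mar 15 - Apr 15, 2018 (ended)'
date = re.sub(r"[\t\n\r]+|Token Sale: |\(.*\)|since|Market & Returns|period isn't set",
              '', text).strip()
print(date)  # Mar 15 - Apr 15, 2018
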
Example #13
                        'td')).find_all('td')[0].text.encode('utf-8').strip()
                '''if b'date:' in th.lower():
					date = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8')
				elif b'time:' in th.lower():
					time = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8')
				el'''
                if b'location:' in th.lower():
                    location = parsed.replace(b'\n',
                                              b'').replace(b'\r',
                                                           b'').decode('utf-8')
                elif b'\xc2' in th.lower():
                    description = parsed.decode('utf-8')
            if any(s in description.lower() for s in strings):
                print(title + " on " + date + " at " + time + " at " +
                      location)
                print(description)
                print(str(date + " at " + time))
                start, end = parse(str(date + " at " + time))
                print("Start = " + start)
                print("End = " + end)
                data = {
                    "Date": date,
                    "Time": time,
                    "Description": description,
                    "Location": location,
                    "Title": title
                }

                #uncomment to push to db
                #db.child("events").push(data);