Example #1
    def parse(self):
        self.has_metadata = bool(self.metadata["schemata"])
        self.metadata_count = len(self.metadata["schemata"])
        self.visited = bool(self.html)
        for k, v in parse_schemata(self.__dict__).items():
            setattr(self, k, v)
        try:
            tree = parse_html(self.html, self.base_url)

            def find_one(selector):
                try:
                    return format_text(tree.xpath(selector)[0].text_content())
                except Exception:
                    return ""

            if not self.headline:
                self.headline = find_one("//h1")
            if not self.articlebody:
                self.articlebody = "\n".join([
                    format_text(node.text_content())
                    for node in tree.xpath("//p")
                ])
            print(self.articlebody)
        except Exception as e:
            print(e)
        # self.html = ""
        return self.__dict__
Example #2
    async def extract_schema_objects(self, responses):
        """Iterate through a collection of HTTP response objects, extract any
           embedded json objects from the DOM (possibly an empty list), load those
           data structures into memory, and append them to the response."""
        for response in responses:
            # try:
            html = response.html
            tree = parse_html(html)
            schemata = tree.xpath("//script[contains(@type, 'json')]/text()")
            jsonized = []
            errors = []
            for schema in schemata:
                try:
                    jsonized.append(json.loads(schema))
                except Exception as e:
                    serialized = [f"{e.__class__.__name__} :: {e}", schema]
                    errors.append(serialized)

            response.metadata = {"schemata": jsonized, "errors": errors}
            response.has_metadata = bool(jsonized)
            response.metadata_count = len(jsonized)

        # except Exception as e:
        #     print(e.__class__.__name__, e, response)
        #     response['metadata'] = {"schemata": [], "errors": }
        return responses
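
The docstring above describes pulling embedded JSON (typically JSON-LD) blobs out of <script> tags. A minimal, self-contained sketch of that extraction step, assuming parse_html is lxml.html.fromstring as in Example #16:

import json
from lxml.html import fromstring as parse_html  # assumed alias

SAMPLE = (
    '<html><head><script type="application/ld+json">'
    '{"@type": "NewsArticle", "headline": "Hello"}'
    '</script></head><body></body></html>'
)

def extract_schemata(html):
    # Grab the text of every script tag whose type mentions "json"
    # (matches both application/json and application/ld+json).
    tree = parse_html(html)
    schemata, errors = [], []
    for blob in tree.xpath("//script[contains(@type, 'json')]/text()"):
        try:
            schemata.append(json.loads(blob))
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            errors.append([f"{e.__class__.__name__} :: {e}", blob])
    return schemata, errors

print(extract_schemata(SAMPLE))  # ([{'@type': 'NewsArticle', 'headline': 'Hello'}], [])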
Example #3
def get_url_title(url):
    r""" Request HTML for the page at the URL indicated and return it's <title> property

    >>> get_url_title('mozilla.com').strip()
    'Internet for people, not profit\n    — Mozilla'
    """
    parsed_url = try_parse_url(url)
    if parsed_url is None:
        return None
    try:
        r = requests.get(parsed_url.geturl(),
                         stream=False,
                         allow_redirects=True,
                         timeout=5)
        tree = parse_html(r.content)
        title = tree.findtext('.//title')
        return title
    except ConnectionError:
        logging.error(
            'Unable to connect to internet to retrieve URL {}'.format(
                parsed_url.geturl()))
        logging.error(format_exc())
    except (InvalidURL, InvalidSchema, InvalidHeader, MissingSchema):
        logging.warning('Unable to retrieve URL {}'.format(parsed_url.geturl()))
        logging.error(format_exc())
Example #4
def get_sensor_status():
    """
    Parses PDU status HTML and returns sensor readings.
    """
    url = '/sensors.html'
    res = dispatch_request(url)
    if res[0] != 200:
        raise Exception('Failed to get status')
    data = res[1]
    data = clean_html(data)
    tree = parse_html(data)
    id1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[2]/font')
    id2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[2]/font')
    lab1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[3]/font/b')
    lab2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[3]/font/b')
    temp1 = parse_value(tree,
                       '/html/body/div/div/table[2]/tr[5]/td[4]/font/b/font/b')
    temp2 = parse_value(tree,
                       '/html/body/div/div/table[2]/tr[6]/td[4]/font/b/font/b')
    hum1 = parse_value(tree,
                       '/html/body/div/div/table[2]/tr[5]/td[5]/font/b/font/b')
    hum2 = parse_value(tree,
                       '/html/body/div/div/table[2]/tr[6]/td[5]/font/b/font/b')
    hum1 = hum1.replace(' %', '')
    hum2 = hum2.replace(' %', '')
    temp1 = temp1.replace(' Deg. F', '')
    temp2 = temp2.replace(' Deg. F', '')
    res = [{'id': id1, 'label': lab1, 'temp': temp1, 'hum': hum1},
           {'id': id2, 'label': lab2, 'temp': temp2, 'hum': hum2}, ]
    return res
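
dispatch_request, clean_html, and parse_value are helpers defined elsewhere. As an assumption only, parse_value presumably evaluates the XPath and returns the stripped text of the first match, roughly like this sketch:

def parse_value(tree, xpath):
    # Hypothetical reconstruction: return the text of the first matching node,
    # or an empty string when the XPath matches nothing.
    nodes = tree.xpath(xpath)
    if not nodes:
        return ''
    node = nodes[0]
    return (node.text_content() if hasattr(node, 'text_content') else str(node)).strip()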
Example #5
def summary(response):
    logger.info('Processing summary from %s' % response.url)
    html = parse_html(response.content.replace(b'&nbsp;', b''))
    html.make_links_absolute(response.url)

    titles = html.xpath('.//strong/a/text()')
    if len(titles) == 1:
        title = str(titles[0])
    else:
        title = ''
        logger.warning('Found no title in %s' % response.url)

    bodies = html.xpath('.//div[@class="da_black"]')
    if len(bodies) == 1:
        body = bodies[0].text_content()
    else:
        body = ''
        logger.warning('Found no body in %s' % response.url)

    def xpath(query):
        xs = html.xpath(query)
        if len(xs) == 1:
            return xs[0]
        else:
            logger.warning('Found %d results for "%s", skipping' % (len(xs), query))
            return ''

    record = {
        'article_id': subparsers.article_id(response.url),
    #   'url': response.url,
        'post_date': subparsers.date(xpath('.//em[contains(text(), "Posted:")]/text()')),
        'expiration_date': subparsers.date(xpath('.//em[contains(text(), "Expiration date:")]/text()')),
    #   'title': title,

        'applicant': subparsers.applicant(body),
        'linear_feet': subparsers.linear_feet(body),
        'county': subparsers.county(body),

        'body': subparsers._strip_html_ws(body),
        'attachments': subparsers.attachments(html),
        'hydrologic_unit_codes': subparsers.hucs(body),
        'coastal_use_permits': subparsers.cups(body),
        'water_quality_certifications': subparsers.wqcs(body),
    }
    record.update(subparsers.permit_manager(body))

    maybe_pan = da_number(title)
    if maybe_pan:
        record.update(maybe_pan)

    a = subparsers.applicant(body)
    if a:
        record['applicant'] = a

    fallbacks = subparsers.soup(body)
    record['longitude'] = record['latitude'] = None
    for k in fallbacks:
        if not record[k]:
            record[k] = fallbacks[k]
    return record
def scrape_row(session, row):
    profile_url = urljoin(base_url, row.xpath('.//a/@href')[0])
    constituency, island, group = (None,) * 3
    if 'Parliamentary Secretaries' not in row.xpath('string(//title)'):
        constituency, island, group = ((*i.xpath('./text()'), '')[0].strip()
                                       for i in row.xpath('./td[position() > 1]'))
    name, = row.xpath('.//a/text()')
    last, first = (i.strip()
                   for i in ft.reduce(lambda s, r: s.replace(r, ''),
                                      honorifics, name).split(','))

    session.visit(profile_url)
    html = parse_html(session.html)
    image, = html.xpath('//img[@class = "alignLeft sidePicture"]/@src')
    image = urljoin(base_url, image)
    return (first + ' ' + last,
            last + ', ' + first,
            last,
            first,
            extract_birth_date(html.xpath('//div[text() = "Biography"]'
                                          '/following-sibling::p')),
            image,
            group and group.strip('()'),
            constituency,
            island,
            profile_url)
Example #7
    def run(self):
        with open(self.outfile, "w+", encoding="utf-8") as file:
            for link in self.links:
                try:
                    content = parse_html(self.get_article(link)).find(
                        ".//div[@id='MainContent']")
                    breadcrumbs = content.findall(
                        ".//div[@id='BreadCrumb']/div/a")
                    if len(breadcrumbs) == 0:
                        breadcrumbs = content.findall(
                            ".//div[@class='ThebreadCrumbContainer']//a")
                    categories = [
                        a.text.strip() for a in breadcrumbs[:self.categories]
                    ]
                    title = content.find(
                        ".//div[@class='Details_MainTitle']").text.strip()
                    body = content.find(".//div[@id='detailedBody']")
                    if body is None:
                        body = content.find(
                            ".//div[@class='DetailsArticleSummary']")
                    body = self.whitespaces.sub(" ",
                                                body.text_content()).strip()
                    file.write(
                        "\t".join([link[37:], *categories, title, body]) +
                        "\n")
                    print("Added Article:", title)
                except Exception as error:
                    print("In link", link, "Error", error)
Example #8
def load_youtube(ytid):
    tmp_url = YT_URL + ytid
    tmp_title = parse_html(urlopen(tmp_url)).find('.//title').text
    tmp_submitter_ip = request.remote_addr
    tmp_submitter = gethostbyaddr(tmp_submitter_ip)[0]

    if tmp_title == 'YouTube':
        return 'Invalid youtube id {}'.format(ytid)

    if tmp_submitter_ip not in song_queues:
        q = Queue()
        song_queues[tmp_submitter_ip] = q
    else:
        q = song_queues[tmp_submitter_ip]

    tmp_record = RecordType(title=tmp_title,
                            url=tmp_url,
                            submitter_host=tmp_submitter,
                            submitter_ip=tmp_submitter_ip)
    q.put(tmp_record)
    elements = q.qsize()

    dump_queue()
    return '{}<br/>{}<br/>Queue size:{}'.format(tmp_title, tmp_submitter,
                                                elements)
def gather_people(session):
    session.visit(base_url)
    while True:
        yield from iter(parse_html(session.html)
                        .xpath('//table[@class = "detailTable detailTable_full"]/tbody/tr'))
        next_page = session.find_by_xpath('//a[@title = "Link to next page"]')
        if not next_page:
            break
        next_page.click()
def main():
    global base_url, page_num_of_article, page_start, page_step, page_encoding, page_sleep, count

    count = 0

    last_page      = scraperwiki.sqlite.get_var('last_page', -1)
    latest_article = None
    start_over     = False
    if last_page == -1:
        last_page      = page_start
        latest_article = scraperwiki.sqlite.get_var('latest_article', None)
        start_over     = True

    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0'),
        ('Referer', base_url)
    ]
    urllib2.install_opener(opener)

    error_count    = 0
    num_of_article = page_num_of_article
    while num_of_article == page_num_of_article:
        page_url = build_url(last_page)

        try:
            html = scraperwiki.scrape(page_url)
        except urllib2.URLError, e:
            print 'Cannot reach the server:',
            if hasattr(e, 'reason'): print e.reason
            elif hasattr(e, 'code'): print e.code
            error_count += 1
            if error_count < 3: continue
            else: break

        try:
            html = html.decode(page_encoding)
        except UnicodeDecodeError:
            encoded = ''
            for word in html.split(' '):
                try:
                    encoded += word.decode(page_encoding) + ' '
                except UnicodeDecodeError:
                    pass
            html = encoded.rstrip()

        num_of_article = scrape(parse_html(html), latest_article, start_over)

        page = last_page / page_step
        if (page_start == 0): page += 1

        scraperwiki.sqlite.save_var('last_page', last_page)
        print 'Page', page, ',', num_of_article, 'article(s)'

        last_page += page_step
        if not page_exists(html, last_page): break
        time.sleep(page_sleep)
Example #11
def get_links_for_date(date):
    """Function to retrieve the image links for a given date."""
    comic_url = f"http://www.girlgeniusonline.com/comic.php?date={date}"
    try:
        resp = urlopen(comic_url)
        if (resp.status == 200):
            doc = parse_html(resp.read())
            return doc.xpath('//img[@src and @alt="Comic"]/@src')
    except (TimeoutError, URLError):
        print(f"Error getting comic for {date}")
def main():
    with urlopen(Request(base_url, headers={'User-Agent': 'Mozilla/5.0'})) as r, \
            sqlite3.connect('data.sqlite') as c:
        c.execute('''\
CREATE TABLE IF NOT EXISTS data
(name, sort_name, family_name, given_name, gender, term, area,
 UNIQUE (name, term, area))''')
        c.executemany('''\
INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?, ?, ?)''',
            gather_people(parse_html(r.read().decode())))
def gather_people(session):
    session.visit(base_url)
    while True:
        yield from iter(
            parse_html(session.html).xpath(
                '//table[@class = "detailTable detailTable_full"]/tbody/tr'))
        next_page = session.find_by_xpath('//a[@title = "Link to next page"]')
        if not next_page:
            break
        next_page.click()
Example #14
    def get_extensions(self, with_plain_descriptions=True):
        extensions = json.loads(
            self.make_request('/extensions/').read()
        )['extensions']
        if with_plain_descriptions:
            for extension in extensions:
                extension['description'] = fix_whitespace(parse_html(
                    extension['description']
                ).text_content()).strip()
        return extensions
def collect_rows(session):
    for list_url in list_urls:
        session.visit(urljoin(base_url, list_url))
        while True:
            yield from iter(parse_html(session.html)
                            .xpath('//table[@class = "detailTable detailTable_full"]/tbody/tr'))

            next_page = session.find_by_xpath('//a[@title = "Link to next page"]')
            if not next_page:
                break
            next_page.click()
Example #16
def scrape_courses(response, from_list_of=False):
    from lxml.html import fromstring as parse_html
    # Determine scraped attribute and location of course id
    # depending on whether the sent data is from list_of() or courses() functions
    attr, start, end = ("href", 54, -7) if from_list_of else ("onclick", 87,
                                                              -24)
    # Return dictionary of courses blackboard ids mapped to courses myUDC ids
    return {
        link.text[:7]: link.attrib[attr][start:end]
        for link in parse_html(response).xpath("//a")
    }
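
Example #16 makes the aliasing explicit: parse_html here is lxml.html.fromstring. Most of the other snippets appear to assume the same convention, while Example #35 instead aliases lxml.html.parse, which takes a filename, URL, or file-like object and returns an ElementTree rather than an element. A short sketch of the difference, under that assumption:

from io import StringIO
from lxml.html import fromstring, parse

html = "<html><body><h1>Title</h1></body></html>"

# fromstring: takes a string or bytes and returns the root element directly.
root = fromstring(html)
print(root.xpath('//h1/text()'))            # ['Title']

# parse: takes a filename, URL, or file-like object and returns an ElementTree.
tree = parse(StringIO(html))
print(tree.getroot().xpath('//h1/text()'))  # ['Title']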
def main():
    global base_url, page_num_of_article, page_start, page_step, page_encoding, page_sleep, count

    count = 0

    last_page = scraperwiki.sqlite.get_var('last_page', -1)
    latest_article = None
    start_over = False
    if last_page == -1:
        last_page = page_start
        latest_article = scraperwiki.sqlite.get_var('latest_article', None)
        start_over = True

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0'), ('Referer', base_url)]
    urllib2.install_opener(opener)

    error_count = 0
    num_of_article = page_num_of_article
    while num_of_article == page_num_of_article:
        page_url = build_url(last_page)

        try:
            html = scraperwiki.scrape(page_url)
        except urllib2.URLError, e:
            print 'Cannot reach the server:',
            if hasattr(e, 'reason'): print e.reason
            elif hasattr(e, 'code'): print e.code
            error_count += 1
            if error_count < 3: continue
            else: break

        try:
            html = html.decode(page_encoding)
        except UnicodeDecodeError:
            encoded = ''
            for word in html.split(' '):
                try:
                    encoded += word.decode(page_encoding) + ' '
                except UnicodeDecodeError:
                    pass
            html = encoded.rstrip()

        num_of_article = scrape(parse_html(html), latest_article, start_over)

        page = last_page / page_step
        if (page_start == 0): page += 1

        scraperwiki.sqlite.save_var('last_page', last_page)
        print 'Page', page, ',', num_of_article, 'article(s)'

        last_page += page_step
        if not page_exists(html, last_page): break
        time.sleep(page_sleep)
Example #18
def main():
    with urlopen(Request(base_url, headers={'User-Agent': 'Mozilla/5.0'})) as r, \
            sqlite3.connect('data.sqlite') as c:
        c.execute('''\
CREATE TABLE IF NOT EXISTS data
(name, sort_name, family_name, given_name, gender, term, area,
 UNIQUE (name, term, area))''')
        c.executemany(
            '''\
INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?, ?, ?)''',
            gather_people(parse_html(r.read().decode())))
Example #19
def map_error_response(solr_response):
    if 'response' in solr_response and solr_response['response'].code >= 400:
        real_response = solr_response['response']

        document = parse_html(real_response.body)
        title = tostring(document.xpath('//title').pop(), method='text')
        reason = title.strip()
        body_element = document.xpath('//body').pop()
        raw_body = tostring(body_element, method='text').strip()
        original_message = re.sub(r'(\s+)|(Powered.*$)', ' ', raw_body).strip()
        return {'reason':reason, 'original_message': original_message,
                'response': real_response}

    else:
        return solr_response
Example #20
def get_metadata_from_schol_html(path):
    if (path / 'results.json').exists():
        return

    doc = parse_html(str(path / 'scholarly.html'))

    metadata = {}

    metadata['title'] = {'value': [all_whitespace_to_space(get_text_from_selector(doc, '.title-group')[0])]}
    metadata['doi'] = {'value': [all_whitespace_to_space(get_text_from_selector(doc, '.doi')[0]).replace('doi: ', '')]}
    metadata['date'] = {'value': [all_whitespace_to_space(get_text_from_selector(doc, '.pub-date-epub')[0]).replace('epub: ', '')]}
    metadata['journal'] = {'value': [all_whitespace_to_space(get_text_from_selector(doc, '.journal-title')[0])]}

    with open(str(path / 'results.json'), 'w') as f:
        json.dump(metadata, f, indent=4)
Example #21
def get_all_uses_of_citation(fname_or_etree, doi="", title="", n_sentences=0):
    #print("Looking for %s in %s" % (doi, fname_or_etree))
    if type(fname_or_etree) is not lxml.etree._ElementTree:
        html = parse_html(fname_or_etree)
    else:
        html = fname_or_etree

    # Try searching by DOI first
    div = None
    if doi != "":
        doi_element = get_doi_element(html, doi)
        if doi_element is None:
            div = None
        else:
            div = doi_element.getparent()

    # But if we couldn't find it by DOI, then try the title
    if div is None and title != "":
        title_element = get_title_element(html, title)

        if title_element is None:
            div = None
        else:
            div = title_element.getparent()

    if div is None:
        return
    #print(all_whitespace_to_space(div.text_content()))

    li = div.getparent()
    ref_id = li.find('a').attrib['name']

    #print(ref_id)

    sel = CSSSelector('a[href="#%s"]' % ref_id)
    res = sel(html)
    #print(res)
    text = [get_sentence(r, n_around=n_sentences) for r in res]

    if len(text) == 0:
        # It is in the list of references, but we can't find the citation
        # This is probably because it was something like reference number 4
        # and was cited as [2-5]
        # So we return some text that explains the error
        text = [
            'ERROR: In reference list, but cannot find citation. Check manually.'
        ]

    return text
Example #22
def get_all_uses_of_citation(fname_or_etree, doi="", title="", n_sentences=0):
    #print("Looking for %s in %s" % (doi, fname_or_etree))
    if type(fname_or_etree) is not lxml.etree._ElementTree:
        html = parse_html(fname_or_etree)
    else:
        html = fname_or_etree

    # Try searching by DOI first
    div = None
    if doi != "":
        doi_element = get_doi_element(html, doi)
        if doi_element is None:
            div = None
        else:
            div = doi_element.getparent()

    # But if we couldn't find it by DOI, then try the title
    if div is None and title != "":
        title_element = get_title_element(html, title)

        if title_element is None:
            div = None
        else:
            div = title_element.getparent()

    if div is None:
        return
    #print(all_whitespace_to_space(div.text_content()))

    li = div.getparent()
    ref_id = li.find('a').attrib['name']

    #print(ref_id)

    sel = CSSSelector('a[href="#%s"]' % ref_id)
    res = sel(html)
    #print(res)
    text = [get_sentence(r, n_around=n_sentences) for r in res]

    if len(text) == 0:
        # It is in the list of references, but we can't find the citation
        # This is probably because it was something like reference number 4
        # and was cited as [2-5]
        # So we return some text that explains the error
        text = ['ERROR: In reference list, but cannot find citation. Check manually.']

    return text
def scrape_rows(session, rows):
    for row in rows:
        profile_link = urllib.parse.urljoin(base_url,
                                            row.xpath('.//a/@href')[0])
        constituency, island, group = ([
            *i.xpath('./text()'), ''
        ][0].strip() for i in row.xpath('./td[position() > 1]'))
        name, = row.xpath('.//a/text()')
        last, first = (i.strip() for i in ft.reduce(
            lambda s, r: s.replace(r, ''), honorifics, name).split(','))

        session.visit(profile_link)
        image, = parse_html(session.html)\
            .xpath('//img[@class = "alignLeft sidePicture"]/@src')
        image = urllib.parse.urljoin(base_url, image)
        yield (first + ' ' + last, last + ', ' + first, last, first, image,
               group.strip('()'), constituency, island, profile_link)
def extract_birth_date(text):
    try:
        text = next(p.text_content() for p in text
                    if 'born' in p.text_content())
        # We're arbitrarily limiting it to eight tokens after 'born' so that
        # we don't accidentally pick up dates other than birth dates
        text = ' '.join(text[text.find('born'):].split()[:10])
    except StopIteration:
        return
    with urlopen('http://nlp.stanford.edu:8080/sutime/process',
                 data=urlencode({'q': text, 'rules': 'english'}).encode()) as r:
        date, = parse_html(r.read())\
            .xpath('//h3[text() = "Temporal Expressions"]'
                   '/following-sibling::table[1]//tr[2]/td[2]/text()') or (None,)
        if not date:
            print('Unable to extract birth date from {!r}'.format(text),
                  file=sys.stderr)
        return date
Example #25
def get_outlet_status():
    """
    Parses PDU status HTML and returns outlet statuses.
    """
    url = '/outctrl.html'
    res = dispatch_request(url)
    if res[0] != 200:
        raise Exception('Failed to get status')
    data = res[1]
    data = clean_html(data)
    tree = parse_html(data)
    id1 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[2]/font')
    id2 = parse_value(tree, '/html/body/div/div/table[2]/tr[7]/td[2]/font')
    lab1 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[3]/font/b')
    lab2 = parse_value(tree, '/html/body/div/div/table[2]/tr[7]/td[3]/font/b')
    stat1 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[5]/font')
    stat2 = parse_value(tree, '/html/body/div/div/table[2]/tr[7]/td[5]/font')
    return [{'id': id1, 'label': lab1, 'status': stat1},
            {'id': id2, 'label': lab2, 'status': stat2}, ]
Example #26
    def __init__(self,
                 url,
                 html,
                 row,
                 soup=None,
                 lxml=None,
                 fix_encoding_errors=True):
        self.url = url
        self.sitemap_data = row
        self.html = (fix_text_segment(html.replace("\xa0", " "),
                                      uncurl_quotes=False)
                     if fix_encoding_errors else html)
        try:
            self.soup = soup if soup else BeautifulSoup(self.html)
        except Exception as e:
            raise ValueError(f"{e.__class__.__name__} :: {e}, {self.html}")
        self.meta = Haystack(html)
        #print(json.dumps(self.meta, indent=4))
        try:
            if isinstance(self.html, str):
                self.html = self.html.encode("utf-8")
            self.lxml = lxml if lxml else parse_html(self.html)
        except Exception as e:
            raise ValueError(f"{e.__class__.__name__} :: {e}, {self.html}")
        self.data = {
            "content": self.content,
            "url": self.url,
            "title": self.title,
            "published_at": self.published_at,
            "description": self.summary,
            "author": self.author,
            "image_url": self.image_url,
            "section": self.section,
            "publisher": self.publisher,
            "keywords": self.keywords,
            "metadata": {k: v
                         for k, v in self.meta.data.items()},
        }
        self.data.update(
            {k: row[k]
             for k in self.passthrough_attrs if row and k in row})
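
The encode("utf-8") step before handing the markup to parse_html is worth noting: lxml typically refuses to parse a Python str that carries an explicit encoding declaration, whereas bytes input is always accepted. A small sketch of that failure mode, assuming parse_html is lxml.html.fromstring:

from lxml.html import fromstring as parse_html  # assumed alias

xhtml = '<?xml version="1.0" encoding="utf-8"?><html><body><p>hi</p></body></html>'

try:
    parse_html(xhtml)              # str input with an encoding declaration
except ValueError as e:            # lxml typically rejects this combination
    print(e)

root = parse_html(xhtml.encode("utf-8"))    # bytes input parses fine
print(root.xpath('//p/text()'))             # ['hi']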
Example #27
def extract_examples(docs_html_filepaths, output_directory):
    for docs_html_filepath in docs_html_filepaths:
        with open(docs_html_filepath, 'r') as docs_html_file:
            dom = parse_html(docs_html_file)
        log('Extracting from', docs_html_filepath)

        examples = dom.xpath('//*[@data-example-id]')
        for example in examples:
            example_id = example.get('data-example-id')
            if not example_id:
                continue

            example_filename = example_id + '.html'
            example_filepath = joinpath(output_directory, example_filename)

            example_html = EXAMPLE_TEMPLATE.format(example_html=etree.tostring(example))

            with open(example_filepath, 'w') as example_html_file:
                example_html_file.write(example_html)
                log('Wrote', example_filepath)
Example #28
def main():
    with urlopen(base_url) as r:
        src = r.read().decode('windows-1253')

    now = dt.datetime.now().isoformat()
    con = sqlite3.connect('data.sqlite')
    with con:
        con.execute('''\
CREATE TABLE IF NOT EXISTS first_reading_archive
(src, time_last_scraped, UNIQUE (src))''')
        con.execute('''\
INSERT OR REPLACE INTO first_reading_archive VALUES (?, ?)''', (src, now))
    with con:
        con.execute('''\
CREATE TABLE IF NOT EXISTS first_reading
(number, title, sponsors, committees, date_tabled, time_last_scraped,
 UNIQUE (number, title, date_tabled))''')
        con.executemany('''\
INSERT OR REPLACE INTO first_reading VALUES (?, ?, ?, ?, ?, ?)''',
            gather_docs(parse_html(src), now))
def scrape_rows(session, rows):
    for row in rows:
        profile_link = urllib.parse.urljoin(base_url,
                                            row.xpath('.//a/@href')[0])
        constituency, island, group = ([*i.xpath('./text()'), ''][0].strip()
                                       for i in row.xpath('./td[position() > 1]'))
        name, = row.xpath('.//a/text()')
        last, first = (i.strip()
                       for i in ft.reduce(lambda s, r: s.replace(r, ''),
                                          honorifics, name).split(','))

        session.visit(profile_link)
        image, = parse_html(session.html)\
            .xpath('//img[@class = "alignLeft sidePicture"]/@src')
        image = urllib.parse.urljoin(base_url, image)
        yield (first + ' ' + last,
               last + ', ' + first,
               last,
               first,
               image,
               group.strip('()'),
               constituency,
               island,
               profile_link)
def main():
    global page_num_of_article, page_start, page_step, page_encoding, count

    count = 0

    last_page      = scraperwiki.sqlite.get_var('last_page', page_start)
    latest_article = ''
    if last_page == -1:
        last_page    = page_start
        latest_article = scraperwiki.sqlite.get_var('latest_article', '')

    num_of_article = page_num_of_article
    while num_of_article == page_num_of_article:
        page_url       = build_url(last_page)
        html           = scraperwiki.scrape(page_url).decode(page_encoding)
        num_of_article = scrape(parse_html(html), latest_article)

        scraperwiki.sqlite.save_var('last_page', last_page)
        last_page += page_step

        if not page_exists(html, last_page): break

    scraperwiki.sqlite.save_var('last_page', -1)
    print '%d article(s) have been scraped.' % count
Example #31
def summary(response):
    html = parse_html(response.content.replace(b'&nbsp;', b''))
    html.make_links_absolute(response.url)

    titles = html.xpath('//strong/a/text()')
    if len(titles) == 1:
        title = str(titles[0])
    else:
        title = ''
        logger.warning('Found no title in %s' % response.url)

    bodies = html.xpath('//div[@class="da_black"]')
    if len(bodies) == 1:
        body = bodies[0].text_content()
    else:
        body = ''
        logger.warning('Found no body in %s' % response.url)

    def xpath(query):
        xs = html.xpath(query)
        if len(xs) == 1:
            return xs[0]
        else:
            logger.warning('Found %d results for "%s", skipping' % (len(xs), query))
            return ''

    record = {
        'article_id': subparsers.article_id(response.url),
    #   'url': response.url,
        'post_date': subparsers.date(xpath('//em[contains(text(), "Posted:")]/text()')),
        'expiration_date': subparsers.date(xpath('//em[contains(text(), "Expiration date:")]/text()')),
    #   'title': title,
        'body': body.strip('\r\n '),
        'attachments': subparsers.attachments(html),
        'hydrologic_unit_codes': subparsers.hucs(body),
        'coastal_use_permits': subparsers.cups(body),
        'water_quality_certifications': subparsers.wqcs(body),
    }

    maybe_pan = da_number(title)
    if maybe_pan:
        record.update(maybe_pan)
        applicant, location, character, leftover = subparsers.body(html, url = response.url)
        record.update({
            'applicant': applicant.strip('\r\n '),
            'location': location.strip('\r\n '),
            'character': character.strip('\r\n '),
        })
    else:
        record.update({
            'applicant': '',
            'location': '',
            'character': '',
        })

    fallbacks = pdf.parse(body)
    record['longitude'] = record['latitude'] = None
    for k in fallbacks:
        if not record[k]:
            record[k] = fallbacks[k]
    return record
Example #32
    def get(self, path):
        """Return a pq instance of the lxml parsed document at path."""
        rv = self.client.get(path, follow_redirects=True)
        return parse_html(rv.data)
Example #33
    def get(self, path):
        """Return a pq instance of the lxml parsed document at path."""
        rv = self.client.get(path, follow_redirects=True)
        return parse_html(rv.data)
Example #34
def parse(html_text, today=None):
    html_tree = parse_html(html_text)
    return [
        parse_post(post_el, today=today)
        for post_el in html_tree.cssselect('.sprofile-post')
    ]
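
Example #34 relies on Element.cssselect, which requires the cssselect package to be installed alongside lxml. A tiny sketch of the same selection pattern, using a made-up .sprofile-post snippet and the assumed lxml.html.fromstring alias:

from lxml.html import fromstring as parse_html  # assumed alias

doc = parse_html('<html><body>'
                 '<div class="sprofile-post">one</div>'
                 '<div class="sprofile-post">two</div>'
                 '</body></html>')
print([el.text for el in doc.cssselect('.sprofile-post')])  # ['one', 'two']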
Example #35
    from lxml.html import parse as parse_html
    from lxml.html import tostring as tostring_html
    from urlparse import urlparse
    import sys

    url = sys.argv[1]
    url_obj = urlparse(url)
    base_url = url_obj.scheme + '://' + url_obj.hostname + '/' + ('/'.join(url_obj.path.split('/')[:-1]))
    target_filename = url_obj.path.split('/')[-2] + '.html'

    print 'Base URL:', base_url
    print 'TOC URL:', url

    data = {}

    toc_doc = parse_html(fetch_html(url))

    title_el = toc_doc.xpath('//div[@id="ct_title"]/h1')[0]
    data['title'] = title_el.text
    data['author'] = title_el.getchildren() and title_el.getchildren()[0].text or 'Anonymous'



    chapters = []
    data['chapters'] = chapters
    for el in toc_doc.xpath('//div[@id="catalog_list"]/ul/li/a'):
        ch_url = el.attrib.get('href')
        if ch_url.startswith('http://vip'):
            continue
        print 'Fetching', ch_url
        ch_data = {}
Example #36
def get_sections(curriculum_code):
    r = requests.get(BASE_URL.format(curriculum_code))
    r.raise_for_status()
    tree = parse_html(BytesIO(r.content))
    return list(map(build_section,
                    tree.xpath(TABLES_XPATH)[RELEVANT_SECTIONS]))
Example #37
async def detalhes(linha):
    linha_dash = linha if '-' in linha else linha + '-0'

    # Search for the line number
    pag_query = await fetch_url(f'http://www.emdec.com.br/ABusInf/consultarlinha.asp?linha={linha_dash}&consulta=1')
    for line in pag_query.splitlines():
        pag_query_regex = '\\s*document.JnInformacoes.action = "detalhelinha.asp\\?(.*)";'
        match = re.match(pag_query_regex, line)
        if match:
            url_detalhes = f'http://www.emdec.com.br/ABusInf/detalhelinha.asp?{match.group(1)}'
            break

    pag_detalhes = parse_html(await fetch_url(url_detalhes))
    pag_map = await fetch_url(f'http://www.emdec.com.br/ABusInf/{pag_detalhes.cssselect("#mapFrame")[0].get("src")}')
    map_data = parseMap(pag_map)
    processed_map = await process_map(map_data)

    def schedules(dom):
        ret = {}
        for group in dom.xpath('div'):
            title_node = group.xpath('p')[-1]
            name = {
                u'Horários Sábado': 'saturday',
                u'Horários Sábado (Referência)': 'saturday',
                u'Horários Domingo': 'sunday',
                u'Horários Domingo (Referência)': 'sunday',
                u'Horários Útil': 'weekday',
                u'Horários Útil (Referência)': 'weekday',

            }[strip(title_node.text)]
            trips = []
            ret[name] = {
                'trips': trips,
                'vehicles': int(re.search('\\d+', title_node.tail).group())
            }
            #print(strip(group.xpath('p')[-1].tail))

            for cell in group.xpath('div/table/tr/td'):
                trips.append({
                    'time': strip(cell.xpath('table/tr/td')[0].text),
                    'wheelchair_accessible': bool(cell.xpath('table/tr/td/img'))
                })

        return ret


    def stops(dom):
        return [
            strip(td.text)
            for td in dom.cssselect('div > table > tr > td')
        ]

    def trecho(dom, map_data):
        details = {}
        for tr in dom.xpath('div/table/tr'):
            details[strip(tr.cssselect('td')[0].text)[:-1]] = tr.cssselect('td input')[0].get('value')

        main_panels = dom.xpath('table/tr/td')

        ret = {}
        ret["details"] = details
        #ret["end_location"] = geocode(details["Letreiro"])
        ret["schedules"] = schedules(main_panels[0])
        #ret["stops"] = stops(main_panels[1])
        ret["map"] = map_data
        return ret

    trechos = [
        trecho(div, processed_map[map_index])
        for (div, map_index) in zip(pag_detalhes.cssselect('#tabs > div'), [1,0])
    ]
    trechos = [trecho for trecho in trechos if trecho['map']['shape'] and trecho['details']['Letreiro'] != 'ESPECIAL']

    route_long_name = fix_route_name(linha, get_text(pag_detalhes, 'txtPesquisa').split(' - ', 1)[-1])

    ret = {}
    ret["route_short_name"] = linha
    ret["route_long_name"] = route_long_name
    ret["company"] = get_text(pag_detalhes, 'txtEmpresa')
    ret["comments"] = get_text(pag_detalhes, 'txtObservacao')
    ret["updated"] = strip(pag_detalhes.cssselect('#conteudo font[size="1"]')[0].text.split('\n')[1])
    ret["route_url"] = 'http://www.portalinterbuss.com.br/campinas/linhas/%s' % linha

    # http://www.portalinterbuss.com.br/campinas/layout-da-frota
    ret["route_color"] = {
        '1': '1985E9', # AZUL CLARO
        '2': 'e91919', # VERMELHO
        '3': '0D2447', # VERDE
        '4': '0D2447', # AZUL ESCURO
        '5': '9D9D9D', # BRANCO COM FIGURAS
    }[linha[0]]
    ret["route_text_color"] = {
        '1': 'C6E4FF', # AZUL CLARO
        '2': 'FFDDDD', # VERMELHO
        '3': 'E8FFDC', # VERDE
        '4': 'CFE2FF', # AZUL ESCURO
        '5': 'EFEFEF', # BRANCO COM FIGURAS
    }[linha[0]]
    ret["directions"] = trechos

    print(f"Fetched details from {linha}")
    return ret
Example #38
def summary(response):
    html = parse_html(response.content.replace(b'&nbsp;', b''))
    html.make_links_absolute(response.url)

    titles = html.xpath('//strong/a/text()')
    if len(titles) == 1:
        title = str(titles[0])
    else:
        title = ''
        logger.warning('Found no title in %s' % response.url)

    bodies = html.xpath('//div[@class="da_black"]')
    if len(bodies) == 1:
        body = bodies[0].text_content()
    else:
        body = ''
        logger.warning('Found no body in %s' % response.url)

    def xpath(query):
        xs = html.xpath(query)
        if len(xs) == 1:
            return xs[0]
        else:
            logger.warning('Found %d results for "%s", skipping' %
                           (len(xs), query))
            return ''

    record = {
        'article_id': subparsers.article_id(response.url),
        #   'url': response.url,
        'post_date': subparsers.date(
            xpath('//em[contains(text(), "Posted:")]/text()')),
        'expiration_date': subparsers.date(
            xpath('//em[contains(text(), "Expiration date:")]/text()')),
        #   'title': title,
        'body': body.strip('\r\n '),
        'attachments': subparsers.attachments(html),
        'hydrologic_unit_codes': subparsers.hucs(body),
        'coastal_use_permits': subparsers.cups(body),
        'water_quality_certifications': subparsers.wqcs(body),
    }

    maybe_pan = da_number(title)
    if maybe_pan:
        record.update(maybe_pan)
        applicant, location, character, leftover = subparsers.body(
            html, url=response.url)
        record.update({
            'applicant': applicant.strip('\r\n '),
            'location': location.strip('\r\n '),
            'character': character.strip('\r\n '),
        })
    else:
        record.update({
            'applicant': '',
            'location': '',
            'character': '',
        })

    fallbacks = pdf.parse(body)
    record['longitude'] = record['latitude'] = None
    for k in fallbacks:
        if not record[k]:
            record[k] = fallbacks[k]
    return record