Example #1
0
    def __init__(self, feed, cachename='cache'):
        """Parse one or more RSS feeds and load the pickled article cache.

        feed      -- a single feed location, or a list of them; the entries
                     of a list are parsed one by one and then merged with
                     _interleave().
        cachename -- base name of the cache file, which is read from
                     "<cachedir>/<cachename>.cache".

        If feedparser flags any feed as bozo, self.error is set to True and
        parser, cache, fetch, rss_entries and entries are all set to None.
        """
        self.error = False
        if isinstance(feed, list):
            per_feed = []
            for f in feed:
                p = feedparser.parse(f)
                if p.bozo:
                    self.error = True
                else:
                    per_feed.append(p['entries'])
            # Interleave the feeds
            self.rss_entries = _interleave(per_feed)
        else:
            p = feedparser.parse(feed)
            if p.bozo:
                self.error = True
            else:
                # Reuse the result already in hand instead of parsing the
                # feed a second time.
                self.rss_entries = p['entries']

        if self.error:
            self.parser = None
            self.cache = None
            self.fetch = None
            self.rss_entries = None
            self.entries = None
            return

        self.parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser(
            indexNames=False)
        #self.parser.addIndexOnAttribute('property')
        self.cache = dict()
        self.cachename = cachename
        self.fetch = fetch.Fetcher()
        _cachedir = _config['cachedir']

        # Loading the cache is best effort: a missing or unreadable file
        # simply leaves the empty dict created above in place.
        # NOTE(review): pickle.load() can execute arbitrary code, so the
        # cache directory must only ever contain files this program wrote.
        try:
            with open(f"{_cachedir}/{cachename}.cache", 'rb') as f:
                self.cache = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError):
            pass
        else:
            # Drop entries published more than five days ago.  The stale
            # keys are collected first because deleting from a dict while
            # iterating over it raises RuntimeError.
            # Assumes every value carries a naive datetime under
            # 'published' -- TODO confirm against the code writing the cache.
            try:
                now = datetime.datetime.now()
                max_age = datetime.timedelta(days=5)
                stale = [key for key, item in self.cache.items()
                         if now - item['published'] > max_age]
            except (AttributeError, KeyError, TypeError):
                # Unexpected cache layout: keep what was loaded untouched.
                stale = []
            for key in stale:
                del self.cache[key]

        if not self.rss_entries:
            print("ERROR: empty rss list")
        self.entries = list()
Example #2
0
def league_table(url, cache):
    """Fetch the league table page at *url* and return (league, time, table).

    *cache* maps a url to {'value': ..., 'etag': ...}.  A cached ETag is
    sent as If-None-Match; the cached value is returned when the server
    answers 304 or repeats the cached ETag.  Any other non-200 status is
    printed and None is returned.  After a successful parse the cache
    entry for *url* is replaced.

    Each row of *table* is a list holding the text of columns 2-8 and 10
    followed by a bool telling whether the row's class attribute equals
    "gs-o-table__row--break".
    """
    is_cached = url in cache
    entry = cache[url] if is_cached else None
    headers = {'If-None-Match': entry['etag']} if is_cached else None

    resp = fetch.Fetcher().get(url, headers=headers)
    code = resp.status_code
    if code == 304:
        return entry['value']
    if code != 200:
        print(code)
        return None
    etag = resp.headers.get('etag')
    if entry and etag == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)

    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)

    first_table = parser.getElementsByTagName('table')[0]
    rows = first_table.getAllChildNodes().getElementsByTagName('tr')
    wanted_columns = (2, 3, 4, 5, 6, 7, 8, 10)
    table = []
    # The first and last <tr> are skipped (presumably header and footer).
    for row in rows[1:-1]:
        cells = row.getAllChildNodes().getElementsByTagName('td')
        record = [cells[index].textContent for index in wanted_columns]
        record.append(row.attributes['class'] == "gs-o-table__row--break")
        table.append(record)

    time_tag = parser.getElementsByTagName('time')[0]
    updated = dateutil.parser.isoparse(time_tag.attributes['datetime'])
    league = parser.getElementsByTagName('h1')[0].textContent

    value = (league, updated, table)
    cache[url] = {'value': value, 'etag': etag}
    return value
Example #3
0
def process_records(queue, rule, wb):
    newqueue = []
    for record in queue:
        maybesave(wb, queue)

        url = record.get("url")
        try:
            (fp, filename) = io.get_tempfile()
            f = fetch.Fetcher(mode=record.get("mode"),
                              url=url,
                              filename=filename)
            url = get_url(f, wb, host_filter=rule.get("host_filter"))
            filename = f.filename

            # consider retrying the fetch if it failed
            if f.error and fetch.err.is_temporal(f.error):
                if not record.get("retry"):
                    record["retry"] = True
                    queue.append(record)

            if record.get("mode") == fetch.Fetcher.SPIDER:
                data = open(filename, 'r').read()
                urls = spider.unbox_it_to_ss(spider.findall(data, url))
                urls = urlrewrite.rewrite_urls(url, urls)

                (newqueue, wb) = qualify_urls(url, urls, rule, newqueue, wb)

            if record.get("mode") == fetch.Fetcher.FETCH:
                shutil.move(filename,
                            io.safe_filename(urlrewrite.url_to_filename(url)))

        except (fetch.DuplicateUrlWarning, fetch.UrlRedirectsOffHost):
            pass
        except KeyboardInterrupt:
            q = queue[queue.index(record):]
            q.extend(newqueue)
            save_session(wb, queue=q)
            sys.exit(1)
        except Exception, exc:
            log_exc(exc, url, wb)
        finally:
Example #4
0
def cricket_scorecard_table(url, cache):
    """Fetch a cricket scorecard page and reduce it to a plain dict.

    url   -- page to fetch.
    cache -- mapping of url -> {'value': ..., 'etag': ...}; read to build
             a conditional request and updated after a successful parse.

    Returns the cached value when the server answers 304 or repeats the
    cached ETag, None (after printing the status) for any other non-200
    status, and otherwise a freshly parsed dict with the keys match,
    home_name, home_scores, away_name, away_scores, status, innings, toss
    and venue.  Each innings is a dict with name, runs, overs, batting and
    falls; falls is an empty list when the page has no fall-of-wicket
    table for that innings.
    """
    def _clean_scores(scores):
        # A falsy result from _get_span becomes an empty list; otherwise
        # collapse double spaces and tighten " - " to "-" in every line.
        if not scores:
            return []
        return [s.replace('  ', ' ').replace(' - ', '-').strip()
                for s in scores]

    if url in cache:
        entry = cache[url]
        headers = {'If-None-Match': entry['etag']}
    else:
        entry = None
        headers = None

    f = fetch.Fetcher()
    resp = f.get(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code != 200:
        print(f"{resp.status_code} on {url}")
        return None
    if entry and resp.headers.get('etag') == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)

    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)
    article = parser.getElementsByTagName('article')[0]

    match = _get_span(article, 'sp-c-fixture__title')
    home_fix = article.getElementsByClassName('sp-c-fixture__team--time-home')
    away_fix = article.getElementsByClassName('sp-c-fixture__team--time-away')
    home_name = _get_span(home_fix, 'sp-c-fixture__team-name-trunc', 'abbr')
    away_name = _get_span(away_fix, 'sp-c-fixture__team-name-trunc', 'abbr')
    home_scores = _clean_scores(
        _get_span(home_fix, 'sp-c-fixture__cricket-score',
                  as_list=True, ignore='gs-u-vh'))
    away_scores = _clean_scores(
        _get_span(away_fix, 'sp-c-fixture__cricket-score',
                  as_list=True, ignore='gs-u-vh'))

    status = _get_span(article, 'sp-c-fixture__win-message')

    innings = []
    # Tables are looked up by id for innings 1 to 4; an innings without a
    # batting table is skipped entirely.
    for number in range(1, 5):
        bats = parser.getElementById(f"batting-table{number}")
        if not bats:
            continue
        bats = bats.getChildren()

        title = _get_span(bats, 'gs-u-align-left', 'h2')
        bat_foot = bats.getElementsByTagName('tfoot')
        tot_overs = _get_span(bat_foot, 'qa-overs')
        tot_runs = _get_span(bat_foot, 'qa-runs', ignore='gs-u-vh')
        bat_body = bats.getElementsByTagName('tbody')
        bat_lines = [_get_span(r, 'gs-o-table__cell', as_list=True)
                     for r in bat_body.getElementsByTagName('tr')]

        # The fall-of-wicket table may be absent; dereferencing the None
        # returned by getElementById would raise AttributeError.
        fall_lines = []
        falls = parser.getElementById(f"fall-of-wicket-table{number}")
        if falls:
            falls_body = falls.getChildren().getElementsByTagName('tbody')
            fall_lines = [_get_span(r, 'gs-o-table__cell', as_list=True)
                          for r in falls_body.getElementsByTagName('tr')]

        innings.append(
            dict(
                name=title,
                runs=tot_runs,
                overs=tot_overs,
                batting=bat_lines,
                falls=fall_lines,
            )
        )

    # Pair each <dt> label (colon removed, lower-cased) with its <dd>.
    metas = parser.getElementById('event-meta').getChildren()
    labels = [label.replace(':', '').lower()
              for label in _get_span(metas, tag_name='dt', as_list=True)]
    details = _get_span(metas, tag_name='dd', as_list=True)
    metas = dict(zip(labels, details))

    toss = metas['toss']
    toss = toss.replace(' won the ', ' won ').replace(' and decided to ', ': ')

    # Keep only the text after the last comma of the venue.
    venue = metas['venue']
    if ',' in venue:
        venue = venue.rpartition(',')[2].strip()

    table = dict(
        match=match,
        home_name=home_name,
        home_scores=home_scores,
        away_name=away_name,
        away_scores=away_scores,
        status=status,
        innings=innings,
        toss=toss,
        venue=venue,
    )

    cache[url] = dict(
        value=table,
        etag=resp.headers.get('etag')
    )

    return table
Example #5
0
def football_gossip_entries(url, cache):
    """Return the gossip page at *url* as a list of (head, line, tail) tuples.

    url   -- page to fetch.
    cache -- mapping of url -> {'value': ..., 'etag': ..., 'seen': [...]};
             'seen' holds up to the ten most recent ETags that were
             stored alongside a parsed value.

    A HEAD request is issued first.  The cached value is returned when
    the server answers 304 or reports an ETag already listed in 'seen'.
    Otherwise the page is fetched with GET; a non-200/304 status is
    printed and None is returned.  After a successful parse the cache
    entry for *url* is replaced.
    """
    seen = []
    if url in cache:
        entry = cache[url]
        etag = entry['etag']
        if 'seen' in entry:
            seen = entry['seen']
        headers = {'If-None-Match': etag}
    else:
        etag = None
        entry = None
        headers = None

    f = fetch.Fetcher()

    # "seen" works around the CDN sending different etags occasionally
    resp = f.head(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code == 200:
        newetag = resp.headers.get('etag')
        if newetag in seen:
            cache[url]['etag'] = newetag
            return entry['value']

    resp = f.get(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code != 200:
        print(f"{resp.status_code} on {url}")
        return None
    fresh_etag = resp.headers.get('etag')
    if entry and (fresh_etag == etag):
        return entry['value']
    print(f"Cache miss {url}", flush=True)

    # Build a new list instead of appending in place: `seen` may be the
    # very list stored in the cache entry, and if parsing below raises,
    # the new ETag must not be recorded as already seen (the next call
    # would then return the stale cached value for it).
    seen = (seen + [fresh_etag])[-10:]

    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.content.decode("utf-8"))

    paragraphs = []
    div = parser.getElementById('story-body')
    children = div.getAllChildNodes().getElementsByTagName('p')
    for p in children:
        head = ""
        line = ""
        tail = ""
        # Until the first <b> child is met, bare text goes to the
        # headline; afterwards it goes to the body line.
        first = True
        for c in p.childBlocks:
            if isinstance(c, str):
                if c:
                    if first:
                        head += c
                    else:
                        line += c
            else:
                if (c.nodeName == 'b' and first):
                    head += c.textContent
                    first = False
                elif c.nodeName == 'a':
                    # Only the text of the last link is kept.
                    tail = c.textContent
                else:
                    line += c.textContent

        line = line.replace('  ', ' ').strip()
        line = line.replace('()', '').strip()
        while line.endswith(('(', ')')):
            line = line[:-1]
        head = head.replace('  ', ' ').strip()
        tail = tail.replace('(', '')
        tail = tail.replace(')', '').strip()
        tail = f"({tail})"
        # NOTE(review): tail is at least "()" here, so the `tail` term
        # below never filters anything -- confirm whether paragraphs
        # without a link were meant to be dropped.
        if head and line and tail:
            paragraphs.append((head, line, tail))

    cache[url] = dict(
        value=paragraphs,
        etag=fresh_etag,
        seen=seen
    )
    return paragraphs
Example #6
0
def fixtures_table(url, cache):
    """Fetch the football fixtures page at *url* and return it as a list.

    *cache* maps a url to {'value': ..., 'etag': ...}.  The cached value
    is returned when the server answers 304 or repeats the cached ETag;
    any other non-200 status is printed and None is returned.  After a
    successful parse the cache entry for *url* is replaced.

    The result holds one dict per match block whose upper-cased <h3>
    text is listed in _config['football_fixture_leagues'], with the keys
    league, round_ and matches.  Each match is a dict with home_team,
    away_team, home_goals, away_goals, status and kickoff.
    """
    is_cached = url in cache
    entry = cache[url] if is_cached else None
    headers = {'If-None-Match': entry['etag']} if is_cached else None

    resp = fetch.Fetcher().get(url, headers=headers)
    code = resp.status_code
    if code == 304:
        return entry['value']
    if code != 200:
        print(f"{code} on {url}")
        return None
    etag = resp.headers.get('etag')
    if entry and etag == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)

    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)

    # Applied to the status text in this order.
    status_rewrites = (
        ("Match postponed -", ""),
        ("Match abandoned -", ""),
        (" mins", "min"),
        (' ', ''),
    )

    table = []
    for match_block in parser.getElementsByClassName('qa-match-block'):
        children = match_block.getAllChildNodes()
        league = children.getElementsByTagName('h3')[0].textContent.upper()
        if league not in _config['football_fixture_leagues']:
            continue

        round_tags = children.getElementsByTagName('h4')
        round_ = round_tags[0].textContent if len(round_tags) else None

        first_list = children.getElementsByTagName('ul')[0]
        items = first_list.getAllChildNodes().getElementsByTagName('li')
        block = []
        for item in items:
            nodes = item.getAllChildNodes()
            home_team = _get_span(nodes, 'sp-c-fixture__team-name-trunc',
                                  'abbr', 0)
            away_team = _get_span(nodes, 'sp-c-fixture__team-name-trunc',
                                  'abbr', 1)
            home_goals = _get_span(nodes, 'sp-c-fixture__number--home')
            away_goals = _get_span(nodes, 'sp-c-fixture__number--away')
            kickoff = _get_span(nodes, 'sp-c-fixture__block--time')

            # Fall back to the status element when the aside is empty.
            status = (_get_span(nodes, 'sp-c-fixture__aside')
                      or _get_span(nodes, 'sp-c-fixture__status'))
            if status:
                if "abandoned" in status:
                    home_goals = away_goals = "A"
                    kickoff = None
                for old, new in status_rewrites:
                    status = status.replace(old, new)

            block.append({
                'home_team': home_team,
                'away_team': away_team,
                'home_goals': home_goals,
                'away_goals': away_goals,
                'status': status,
                'kickoff': kickoff,
            })

        table.append({'league': league, 'round_': round_, 'matches': block})

    cache[url] = {'value': table, 'etag': etag}
    return table
Example #7
0
def cricket_fixtures_table(url, cache):
    """Fetch the cricket fixtures page at *url* and return it as a list.

    *cache* maps a url to {'value': ..., 'etag': ...}.  The cached value
    is returned when the server answers 304 or repeats the cached ETag;
    any other non-200 status is printed and None is returned.  After a
    successful parse the cache entry for *url* is replaced.

    The result holds one dict per fixture block whose upper-cased series
    name (taken from the block's second <h3>) is listed in
    _config['cricket_series'], with the keys series and matches.
    """
    is_cached = url in cache
    entry = cache[url] if is_cached else None
    headers = {'If-None-Match': entry['etag']} if is_cached else None

    resp = fetch.Fetcher().get(url, headers=headers)
    code = resp.status_code
    if code == 304:
        return entry['value']
    if code != 200:
        print(f"{code} on {url}")
        return None
    etag = resp.headers.get('etag')
    if entry and etag == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)

    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)

    table = []
    for fixture_block in parser.getElementsByClassName('qa-fixture-block'):
        children = fixture_block.getAllChildNodes()
        series = children.getElementsByTagName('h3')[1].textContent.upper()
        # NOTE(review): series is already upper-cased on the line above.
        if series.upper() not in _config['cricket_series']:
            continue

        first_list = children.getElementsByTagName('ul')[0]
        items = first_list.getAllChildNodes().getElementsByTagName('li')
        block = []
        for item in items:
            nodes = item.getAllChildNodes()
            # With no <a> present the (empty) lookup result itself is kept.
            anchors = nodes.getElementsByTagName('a')
            link = anchors[0].getAttribute('href') if anchors else anchors

            home_team = _get_span(nodes, 'sp-c-head-to-head__team-name-trunc',
                                  'abbr', 0)
            away_team = _get_span(nodes, 'sp-c-head-to-head__team-name-trunc',
                                  'abbr', 1)
            home_score = _get_span(
                nodes, 'sp-c-head-to-head__home-team-scores',
                sub_class='sp-c-head-to-head__cricket-score',
                as_list=True)
            away_score = _get_span(
                nodes, 'sp-c-head-to-head__away-team-scores',
                sub_class='sp-c-head-to-head__cricket-score',
                as_list=True)
            status = _get_span(nodes, 'sp-c-head-to-head__status')
            title = _get_span(nodes, 'sp-c-head-to-head__title')
            venue = _get_span(nodes, 'sp-c-head-to-head__venue')
            score_time = _get_span(nodes, 'qa-score-time')
            home_indicator = _get_span(
                nodes, 'sp-c-head-to-head__team-indicator--home')
            away_indicator = _get_span(
                nodes, 'sp-c-head-to-head__team-indicator--away')

            block.append({
                'home_team': home_team,
                'away_team': away_team,
                'home_score': home_score,
                'away_score': away_score,
                'status': status,
                'link': link,
                'title': title,
                'time': score_time,
                'venue': venue,
                'home_batting': bool(home_indicator),
                'away_batting': bool(away_indicator),
            })

        table.append({'series': series, 'matches': block})

    cache[url] = {'value': table, 'etag': etag}
    return table
Example #8
0
def main(argv=None):
    """Run the requested GISTEMP steps and return a process exit status.

    argv -- full argument vector including the program name; defaults
            to sys.argv.

    Returns 0 when all steps ran, or 2 after writing the message of a
    Fatal error to stderr.
    """
    import sys
    # http://www.python.org/doc/2.4.4/lib/module-time.html
    import time

    if argv is None:
        argv = sys.argv
    options, args = parse_options(argv[1:])

    update_parameters(options.parameter)

    step_list = options.steps
    try:
        rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        if os.getcwd() != rootdir:
            raise Fatal("The GISTEMP procedure must be run from the root "
                        "directory of the project.\nPlease change directory "
                        "to %s and try again." % rootdir)

        # Carry out preflight checks and fetch missing files.
        import fetch
        fetcher = fetch.Fetcher()
        fetcher.fetch()

        # Create all the temporary directories we're going to use.
        for d in ['log', 'result', 'work']:
            mkdir(d)

        step_fn = {
            '0': run_step0,
            '1': run_step1,
            '2': run_step2,
            '3': run_step3,
            '3c': run_step3c,
            '4': run_step4,
            '5': run_step5,
        }

        # Record start time now, and ending times for each step.
        start_time = time.time()

        cannot = [s for s in step_list if s not in step_fn]
        if cannot:
            raise Fatal("Can't run steps %s" % str(cannot))

        # Create a message for stdout.
        if len(step_list) == 1:
            logit = "STEP %s" % step_list[0]
        else:
            assert len(step_list) >= 2
            # Step names are strings (they index step_fn above), so they
            # must be converted before building the contiguous range; a
            # non-numeric name such as '3c' just means "not a plain run".
            try:
                first = int(step_list[0])
                last = int(step_list[-1])
                t = [str(s) for s in range(first, last + 1)]
            except (TypeError, ValueError):
                t = []
            if list(step_list) == t:
                logit = "STEPS %s to %s" % (step_list[0], step_list[-1])
            else:
                logit = "STEPS %s" % ', '.join(step_list)
        log("====> %s  ====" % logit)
        data = None
        for step in step_list:
            data = step_fn[step](data)
        # Consume the data in whatever the last step was, in order to
        # write its output, and hence suck data through the whole
        # pipeline.
        for _ in data:
            pass

        end_time = time.time()
        log("====> Timing Summary ====")
        log("Run took %.1f seconds" % (end_time - start_time))

        return 0
    except Fatal:
        # sys.exc_info() is used instead of "except Fatal, err" /
        # "except Fatal as err" so this parses on every Python version.
        err = sys.exc_info()[1]
        sys.stderr.write(str(err))
        sys.stderr.write('\n')
        return 2
Example #9
0
	def fetcher(self):
		"""Return self._fetcher, creating it on first use.

		While self._fetcher is falsy, the fetch module is imported and a
		fetch.Fetcher built from self.handler is stored and returned.
		"""
		if self._fetcher:
			return self._fetcher
		# Imported here, on first use only.
		import fetch
		self._fetcher = fetch.Fetcher(self.handler)
		return self._fetcher
Example #10
0
def dl_input_files():
    """Create a fetch.Fetcher and call its fetch() method."""
    # Imported inside the function, as in the rest of this module.
    import fetch
    fetch.Fetcher().fetch()