Code example #1
def crawl_all():
    articles = Articles()
    visited = set()
    coroutines = [parser.feed_urls() for parser in parsers]
    for coroutine in asyncio.as_completed(coroutines):
        urls = list(map(canonicalize_url, (yield from coroutine)))
        if len(urls) < 1:
            continue
        parser = get_parser(urls[0])
        log.info('Got {} URLs for {}'.format(len(urls), parser.domain))
        to_get = [parser(x).parse() for x in urls if x not in visited]
        visited = visited.union(urls)
        for get_page in asyncio.as_completed(to_get):
            try:
                page = yield from get_page
                articles.save_entry(page)
            except Exception as e:
                log.error(e)
    urls = get_existing_urls(articles)
    to_get = [get_parser(x)(x).parse() for x in urls if x not in visited]
    log.info("updating {} existing unvisited URLs".format(len(to_get)))
    for get_page in asyncio.as_completed(to_get):
        try:
            page = yield from get_page
            articles.save_entry(page)
        except Exception as e:
            log.error(e)
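The generator-based coroutine above (yield from plus asyncio.as_completed) is the pre-Python 3.5 style. For comparison, here is a minimal sketch of the same fan-out pattern using async/await; parsers, get_parser, canonicalize_url and the Articles store are assumed to behave as in the example, and the rest is illustrative rather than the project's actual code.

import asyncio

async def crawl_all_modern(parsers, get_parser, canonicalize_url, articles, log):
    visited = set()
    feeds = [parser.feed_urls() for parser in parsers]
    for feed in asyncio.as_completed(feeds):
        # Each feed yields raw URLs; canonicalize them before deduplication.
        urls = [canonicalize_url(u) for u in await feed]
        if not urls:
            continue
        parser = get_parser(urls[0])
        log.info('Got {} URLs for {}'.format(len(urls), parser.domain))
        pages = [parser(u).parse() for u in urls if u not in visited]
        visited.update(urls)
        for get_page in asyncio.as_completed(pages):
            try:
                articles.save_entry(await get_page)
            except Exception as e:
                log.error(e)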
Code example #2
	def parser(self, value):
		if isinstance(value, parsers.Parser):
			self._parser = value
		else:
			try:
				self.parser = parsers.get_parser(value)
			except NameError as e:
				raise ValueError("parser value '"+str(value)+"' must be an instance of parsers.Parser or a string allowed by parsers.get_parser.")
Code example #3
File: crawl_by_url.py  Project: pdphuong/soclust
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    parsed_article = parser(url)
    if not parsed_article.real_article:
        return
    return parsed_article
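Several of the snippets here catch KeyError around parsers.get_parser(url), which suggests the lookup is a plain dictionary keyed by the URL's domain. A minimal sketch of such a registry follows; the placeholder parser class and the urlparse-based domain extraction are assumptions, not the project's actual code.

from urllib.parse import urlparse

class NYTParser:
    # Hypothetical placeholder; a real parser class would fetch and parse the page.
    def __init__(self, url):
        self.url = url

PARSERS_BY_DOMAIN = {'www.nytimes.com': NYTParser}

def get_parser(url):
    # An unrecognised domain raises KeyError, which the callers above log and skip.
    return PARSERS_BY_DOMAIN[urlparse(url).netloc]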
Code example #4
 def parser(self, value):
     if isinstance(value, parsers.Parser):
         self._parser = value
     else:
         try:
             self.parser = parsers.get_parser(value)
         except NameError as e:
             raise ValueError(
                 "parser value '" + str(value) +
                 "' must be an instance of parsers.Parser or a string allowed by parsers.get_parser."
             )
Code example #5
 def test_parsing(self):
     """Simple format, simple test"""
     parser = parsers.get_parser('loadavg')
     results = dict()
     parser.parse(self.example, results)
     stats = results['stats']
     self.assertEqual(0.15, stats.one_minute_load)
     self.assertEqual(0.22, stats.five_minute_load)
     self.assertEqual(0.43, stats.fifteen_minute_load)
     self.assertEqual(1, stats.running_threads)
     self.assertEqual(650, stats.total_threads)
     self.assertEqual(3914, stats.last_pid)
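The expected values match a /proc/loadavg line of the form "0.15 0.22 0.43 1/650 3914". A minimal sketch of a parser that would satisfy these assertions, assuming a simple attribute container for the stats (the real parser's structure may differ):

from types import SimpleNamespace

def parse_loadavg(text):
    # /proc/loadavg: 1-, 5- and 15-minute load, running/total threads, last PID.
    one, five, fifteen, threads, last_pid = text.split()
    running, total = threads.split('/')
    return SimpleNamespace(
        one_minute_load=float(one),
        five_minute_load=float(five),
        fifteen_minute_load=float(fifteen),
        running_threads=int(running),
        total_threads=int(total),
        last_pid=int(last_pid),
    )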
Code example #6
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException) as e:
        if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
            return
        logger.error('Exception when parsing %s', url)
        logger.error(traceback.format_exc())
        logger.error('Continuing')
        return
Code example #7
File: scraper.py  Project: caseyg/cooperdiffs
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException) as e:
        if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
            return
        logger.error('Exception when parsing %s', url)
        logger.error(traceback.format_exc())
        logger.error('Continuing')
        return
Code example #8
    def test_parse_meminfo(self):
        print(self.example)

        parser = parsers.get_parser('meminfo')
        results = parser.parse(self.example, dict())
        meminfo = results['meminfo']

        self.assertEqual(45, len(meminfo.meminfo))
        self.assertEqual(20507388 * 1024, meminfo.get('MemTotal'))
        self.assertEqual(8326068 * 1024, meminfo.get('MemFree'))
        self.assertEqual(20559872 * 1024, meminfo.get('DirectMap2M'))

        self.assertEqual(1, meminfo.get('HugePages_Total'))
        self.assertEqual(2, meminfo.get('HugePages_Free'))
        self.assertEqual(3, meminfo.get('HugePages_Rsvd'))
        self.assertEqual(4, meminfo.get('HugePages_Surp'))
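The assertions multiply the /proc/meminfo values by 1024, so the parser evidently converts the "kB" entries to bytes while leaving the unit-less HugePages_* counters alone. A minimal sketch of that per-line conversion (an assumption about the implementation, not the project's code):

def parse_meminfo_line(line):
    # A /proc/meminfo line is either "MemTotal:  20507388 kB" or, for the
    # HugePages_* counters, a bare number with no unit.
    name, _, rest = line.partition(':')
    parts = rest.split()
    value = int(parts[0])
    if len(parts) > 1 and parts[1] == 'kB':
        value *= 1024  # normalise to bytes, as the test expects
    return name.strip(), value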
Code example #9
def load_article(url):
    try:
        parser = get_parser(url)
    except KeyError:
        log.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib.error.HTTPError, Exception) as e:
        if isinstance(e, urllib.error.HTTPError) and e.msg == 'Gone':
            return
        log.error('Exception when parsing %s', url)
        log.error(traceback.format_exc())
        log.error('Continuing')
        return
    if not parsed_article.real_article:
        return
    return parsed_article
Code example #10
def _parse_section(section_name, current_process, current_thread, data, out):

    try:
        parser = parsers.get_parser(section_name)
        parser.parse(data, out)
    except:
        pass

    if current_thread and section_name == 'stat':
        _save_stat(current_thread, out['stat'])
    elif current_process and section_name != '':
        # Hit a new file, consolidate what we have so far.
        if 'smaps' == section_name:
            _save_smaps_region(current_process.maps, out['meminfo'],
                               current_process.pid, data)
        elif 'cmdline' == section_name:
            # Some command lines have a number of empty arguments. Ignore
            # that because it's not interesting here.
            current_process.argv = filter(len, data.strip().split('\0'))
        elif 'stat' == section_name:
            _save_stat(current_process, out['stat'])
        else:
            LOGGER.error('Unrecognised section name: %s' % section_name)
Code example #11
 def test_process_stat_parsing(self):
     parser = parsers.get_parser('uptime')
     stats = parser.parse(self.example, dict())['stats']
     self.assertEqual(84983.36, stats.uptime)
     self.assertEqual(434057.28, stats.uptime_idle)
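/proc/uptime contains two floats, seconds since boot and accumulated idle seconds, which is exactly what the two assertions check. A one-function sketch, assuming a plain dict result (the real parser exposes them as attributes on a stats object):

def parse_uptime(text):
    # /proc/uptime looks like: "84983.36 434057.28"
    uptime, idle = (float(x) for x in text.split())
    return {'uptime': uptime, 'uptime_idle': idle}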
Code example #12
from methods import Bisection, FixedPoint, NewtonRaphson
from parsers import get_parser

if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    function = lambda x: 3143680 - 2 * 2**x - 51200 * x
    derivative = lambda x: -2 * 2**x * 0.69314718 - 51200  # d/dx of 'function': -2*(2**x)*ln(2) - 51200
    error_esperado = 0.00001

    if args.method == 'biseccion':
        method = Bisection(function, args.a, args.b)
    if args.method == 'punto-fijo':
        method = FixedPoint(function, args.a, args.b)
    if args.method == 'newton-raphson':
        method = NewtonRaphson(function, derivative, args.inicial)

    for i, valor in enumerate(method, 1):
        if method.get_error() <= error_esperado:
            break

    print('Se obtuvo el valor ', valor, ' en ', i, ' iteraciones')
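Here get_parser() comes from a local parsers module and returns a command-line parser rather than an article parser: the script reads args.method, args.a, args.b and args.inicial. A minimal argparse sketch consistent with that usage; the option names and types are inferred from the code above and are assumptions, not the project's actual definition.

import argparse

def get_parser():
    parser = argparse.ArgumentParser(description='Root finding by iterative methods')
    parser.add_argument('--method', choices=['biseccion', 'punto-fijo', 'newton-raphson'],
                        required=True)
    parser.add_argument('--a', type=float, help='left endpoint for bracketing methods')
    parser.add_argument('--b', type=float, help='right endpoint for bracketing methods')
    parser.add_argument('--inicial', type=float, help='starting guess for Newton-Raphson')
    return parser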
Code example #13
 def test_stat_parsing(self):
     parser = parsers.get_parser('stat')
     res = parser.parse(self.example, dict())['stat']
     self.assertEqual(res['comm'], 'tracker-miner-f', 'comm not as expected? {0}'.format(res['comm']))
     self.assertEqual(res, self.expected, 'Did not parse /proc/stat correctly?')
Code example #14
File: test_parser.py  Project: fortianyou/newsdiffs
$ python test_parser.py nyt.NYTParser <one of those URLs>
[text of article to store]
"""

import sys
import parsers

try:
    parsername = sys.argv[1]
except IndexError:
    print 'Usage: test_parser.py <modulename>.<classname> [<url_to_check>]'
    sys.exit()

try:
    url = sys.argv[2]
except IndexError:
    url = None

module, classname = parsername.rsplit('.', 1)
parser = getattr(__import__(module, globals(), fromlist=[classname]),
                 classname)

if url:
    assert type(parsers.get_parser(url)) == type(parser)
    parsed_article = parser(url)
    print unicode(parsed_article)
else:
    links = parser.feed_urls()
    links = [link for link in links if parser.filter(link)]
    print '\n'.join(links)
Code example #15
 def setUp(self):
     self.parser = get_parser()
     self.function = lambda x: x**2 - 2
     self.derivative = lambda x: 2 * x