def crawl_all():
    articles = Articles()
    visited = set()
    # Kick off the feed fetch for every registered parser concurrently.
    coroutines = [parser.feed_urls() for parser in parsers]
    for coroutine in asyncio.as_completed(coroutines):
        urls = list(map(canonicalize_url, (yield from coroutine)))
        if len(urls) < 1:
            continue
        parser = get_parser(urls[0])
        log.info('Got {} URLs for {}'.format(len(urls), parser.domain))
        to_get = [parser(x).parse() for x in urls if x not in visited]
        visited = visited.union(urls)
        for get_page in asyncio.as_completed(to_get):
            try:
                page = yield from get_page
                articles.save_entry(page)
            except Exception as e:
                log.error(e)
    # Re-parse stored URLs that did not appear in any feed this run.
    urls = get_existing_urls(articles)
    to_get = [get_parser(x)(x).parse() for x in urls if x not in visited]
    log.info("updating {} existing unvisited URLs".format(len(to_get)))
    for get_page in asyncio.as_completed(to_get):
        try:
            page = yield from get_page
            articles.save_entry(page)
        except Exception as e:
            log.error(e)

def parser(self, value):
    if isinstance(value, parsers.Parser):
        self._parser = value
    else:
        try:
            self.parser = parsers.get_parser(value)
        except NameError:
            raise ValueError(
                "parser value '" + str(value) + "' must be an instance of "
                "parsers.Parser or a string allowed by parsers.get_parser."
            )

def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    parsed_article = parser(url)
    if not parsed_article.real_article:
        return
    return parsed_article

def test_parsing(self):
    """Simple format, simple test"""
    parser = parsers.get_parser('loadavg')
    results = dict()
    parser.parse(self.example, results)
    stats = results['stats']
    self.assertEqual(0.15, stats.one_minute_load)
    self.assertEqual(0.22, stats.five_minute_load)
    self.assertEqual(0.43, stats.fifteen_minute_load)
    self.assertEqual(1, stats.running_threads)
    self.assertEqual(650, stats.total_threads)
    self.assertEqual(3914, stats.last_pid)

def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException) as e:
        if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
            return
        logger.error('Exception when parsing %s', url)
        logger.error(traceback.format_exc())
        logger.error('Continuing')
        return
    if not parsed_article.real_article:
        return
    return parsed_article

def test_parse_meminfo(self):
    print(self.example)
    parser = parsers.get_parser('meminfo')
    results = parser.parse(self.example, dict())
    meminfo = results['meminfo']
    self.assertEqual(45, len(meminfo.meminfo))
    self.assertEqual(20507388 * 1024, meminfo.get('MemTotal'))
    self.assertEqual(8326068 * 1024, meminfo.get('MemFree'))
    self.assertEqual(20559872 * 1024, meminfo.get('DirectMap2M'))
    self.assertEqual(1, meminfo.get('HugePages_Total'))
    self.assertEqual(2, meminfo.get('HugePages_Free'))
    self.assertEqual(3, meminfo.get('HugePages_Rsvd'))
    self.assertEqual(4, meminfo.get('HugePages_Surp'))

def load_article(url):
    try:
        parser = get_parser(url)
    except KeyError:
        log.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib.error.HTTPError, http.client.HTTPException) as e:
        if isinstance(e, urllib.error.HTTPError) and e.msg == 'Gone':
            return
        log.error('Exception when parsing %s', url)
        log.error(traceback.format_exc())
        log.error('Continuing')
        return
    if not parsed_article.real_article:
        return
    return parsed_article

def _parse_section(section_name, current_process, current_thread, data, out):
    try:
        parser = parsers.get_parser(section_name)
        parser.parse(data, out)
    except Exception:
        pass
    if current_thread and section_name == 'stat':
        _save_stat(current_thread, out['stat'])
    elif current_process and section_name != '':
        # Hit a new file, consolidate what we have so far.
        if 'smaps' == section_name:
            _save_smaps_region(current_process.maps, out['meminfo'],
                               current_process.pid, data)
        elif 'cmdline' == section_name:
            # Some command lines have a number of empty arguments. Ignore
            # that because it's not interesting here.
            current_process.argv = filter(len, data.strip().split('\0'))
        elif 'stat' == section_name:
            _save_stat(current_process, out['stat'])
        else:
            LOGGER.error('Unrecognised section name: %s' % section_name)

def test_process_stat_parsing(self):
    parser = parsers.get_parser('uptime')
    stats = parser.parse(self.example, dict())['stats']
    self.assertEqual(84983.36, stats.uptime)
    self.assertEqual(434057.28, stats.uptime_idle)

from methods import Bisection, FixedPoint, NewtonRaphson
from parsers import get_parser

if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()

    function = lambda x: 3143680 - 2 * 2**x - 51200 * x
    # f'(x) = -2 * 2**x * ln(2) - 51200
    derivative = lambda x: -2 * 2**x * 0.69314718 - 51200
    error_esperado = 0.00001

    if args.method == 'biseccion':
        method = Bisection(function, args.a, args.b)
    elif args.method == 'punto-fijo':
        method = FixedPoint(function, args.a, args.b)
    elif args.method == 'newton-raphson':
        method = NewtonRaphson(function, derivative, args.inicial)

    for i, valor in enumerate(method, 1):
        if method.get_error() <= error_esperado:
            break
    print('Se obtuvo el valor ', valor, ' en ', i, ' iteraciones')

def test_stat_parsing(self):
    parser = parsers.get_parser('stat')
    res = parser.parse(self.example, dict())['stat']
    self.assertEqual(res['comm'], 'tracker-miner-f',
                     'comm not as expected? {0}'.format(res['comm']))
    self.assertEqual(res, self.expected, 'Did not parse /proc/stat correctly?')

    $ python test_parser.py nyt.NYTParser <one of those URLs>
    [text of article to store]
"""
import sys

import parsers

try:
    parsername = sys.argv[1]
except IndexError:
    print 'Usage: test_parser.py <modulename>.<classname> [<url_to_check>]'
    sys.exit()
try:
    url = sys.argv[2]
except IndexError:
    url = None

module, classname = parsername.rsplit('.', 1)
parser = getattr(__import__(module, globals(), fromlist=[classname]), classname)

if url:
    assert type(parsers.get_parser(url)) == type(parser)
    parsed_article = parser(url)
    print unicode(parsed_article)
else:
    links = parser.feed_urls()
    links = [link for link in links if parser.filter(link)]
    print '\n'.join(links)

def setUp(self):
    self.parser = get_parser()
    self.function = lambda x: x**2 - 2
    self.derivative = lambda x: 2 * x