def extractLinks(url, depth, maxDepth):
    """Recursively crawl from `url`, storing each new page, until `maxDepth`.

    Relies on module-level collaborators: Utils (URL validation), the shared
    readLinks list (visited set), Crawler (page fetch), Page / dc (storage),
    and MainCrawler.extractLinks for the recursive step.
    """
    # Guard: skip malformed URLs and pages we have already crawled.
    if not Utils.isValidURL(url) or url in readLinks:
        return
    readLinks.append(url)
    info = Crawler.getInfoFromOneWeb(url)
    print('===== Working on: ' + url + ' --- depth: ' + str(depth)
          + ' --- ' + str(len(info['links'])) + ' links were found')
    page = Page(url, '', depth, url, info['links'], [], [])
    dc.addData('collectionoftest', page.returnLikeObject())
    # Recurse one level deeper over every outbound link, unless that would
    # exceed the crawl depth limit.
    nextDepth = depth + 1
    if nextDepth < maxDepth:
        for link in info['links']:
            MainCrawler.extractLinks(link, nextDepth, maxDepth)
def wikit(): num = 0 if len(sys.argv) > 1: db = xodb.open('test_et4', writable=False) last = db.backend.get_doccount() with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f: current = None batch = [] for event, element in etree.iterparse(f, events=('start', 'end')): if event == 'start': if element.tag.endswith('page'): current = Page() redirect = False if element.tag.endswith('title'): current.title = element.text if element.tag.endswith('text'): if element.text: current.text = element.text if element.tag.endswith('redirect'): redirect = True if event == 'end': if element.tag.endswith('page'): # num += 1 # if num < last: # print 'Skipping ', num, " ", last, " ", current.title # element.clear() # continue if current.text and current.title and not redirect: if not current.title.startswith( ('Template:', 'Category:', 'File:')): print "Pumping ", current.title try: batch.append(current) if len(batch) > BATCH_SIZE: source.send(dumps(batch)) batch = [] except Exception: log.exception('wtf') element.clear()
def wikit(): num = 0 if len(sys.argv) > 1: db = xodb.open('test_et4', writable=False) last = db.backend.get_doccount() with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f: current = None batch = [] for event, element in etree.iterparse(f, events=('start', 'end')): if event == 'start': if element.tag.endswith('page'): current = Page() redirect = False if element.tag.endswith('title'): current.title = element.text if element.tag.endswith('text'): if element.text: current.text = element.text if element.tag.endswith('redirect'): redirect = True if event == 'end': if element.tag.endswith('page'): # num += 1 # if num < last: # print 'Skipping ', num, " ", last, " ", current.title # element.clear() # continue if current.text and current.title and not redirect: if not current.title.startswith(('Template:', 'Category:', 'File:')): print "Pumping ", current.title try: batch.append(current) if len(batch) > BATCH_SIZE: source.send(dumps(batch)) batch = [] except Exception: log.exception('wtf') element.clear()
from lxml import etree

from schemas import Page, PageSchema

log = logging.getLogger(__name__)

# Map the Page model onto its xodb schema before indexing the dump.
db = xodb.open('/home/michel/xap/ms_test2')
db.map(Page, PageSchema)

# NOTE(review): `redirects` is never populated in this block -- presumably
# used further down the file; confirm before removing.
redirects = {}

# Incrementally parse the bz2-compressed Wikipedia dump and index every
# non-redirect page that has both a title and text.
with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
    current = None
    redirect = False
    for event, element in etree.iterparse(f, events=('start', 'end')):
        if event == 'start':
            if element.tag.endswith('page'):
                current = Page()
                redirect = False
            if element.tag.endswith('title'):
                current.title = element.text
            if element.tag.endswith('text'):
                if element.text:
                    current.text = element.text
            if element.tag.endswith('redirect'):
                redirect = True
        if event == 'end':
            if element.tag.endswith('page'):
                if current.text and current.title and not redirect:
                    try:
                        db.add(current)
                    except Exception:
                        log.exception('wtf')
                # BUG FIX: the original never cleared finished elements, so
                # iterparse kept the entire multi-GB dump tree in memory.
                # Clearing each completed <page> bounds memory use, matching
                # the wikit() loops elsewhere in this file.
                element.clear()
log = logging.getLogger(__name__)

# Map the Page model onto its xodb schema before indexing the dump.
db = xodb.open('/home/michel/xap/ms_test2')
db.map(Page, PageSchema)

# NOTE(review): `redirects` is never populated in this block -- presumably
# used further down the file; confirm before removing.
redirects = {}

# Incrementally parse the bz2-compressed Wikipedia dump and index every
# non-redirect page that has both a title and text.
with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
    current = None
    redirect = False
    for event, element in etree.iterparse(f, events=('start', 'end')):
        if event == 'start':
            if element.tag.endswith('page'):
                current = Page()
                redirect = False
            if element.tag.endswith('title'):
                current.title = element.text
            if element.tag.endswith('text'):
                if element.text:
                    current.text = element.text
            if element.tag.endswith('redirect'):
                redirect = True
        if event == 'end':
            if element.tag.endswith('page'):
                if current.text and current.title and not redirect:
                    try:
                        db.add(current)
                    except Exception:
                        log.exception('wtf')
                # BUG FIX: the original never cleared finished elements, so
                # iterparse kept the entire multi-GB dump tree in memory.
                # Clearing each completed <page> bounds memory use, matching
                # the wikit() loops elsewhere in this file.
                element.clear()