def parse(self, response):
    # Write the static header of the generated part_tree module.
    headers = ['import os',
               'import sys',
               "sys.path.append(os.path.join('..', '..', '..', '..'))",
               'from collections import defaultdict',
               'import mycat_tree as mt',
               'class PartTree():',
               '    cat_tree = defaultdict(dict)']
    tree_path = os.path.join(self.base_path,
                             'part_tree_%s.py' % time.strftime('%m%d%Y'))
    with open(tree_path, 'w') as f:
        for header in headers:
            f.write(header + '\n')
    # Per-category template: myrootcat/mycat are written blank, to be filled
    # in by hand once the file has been generated.
    headers = ["myrootcat = ''\n",
               "mycat = ''\n",
               "mysubcat = mt.mytree[myrootcat][mycat]\n\n",
               "subcats = ["]
    for cat in init.cats:
        cat = '/' + cat + '/'
        DigikeyCount.total_subcats += len(
            response.xpath(xp.PREP_SUBCAT % cat).extract())
    print_prog('Progress', DigikeyCount.subcats_count,
               DigikeyCount.total_subcats, left_just=15, endwith='\r')
    with open(tree_path, 'a') as f:
        for cat in init.cats:
            cat = '/' + cat + '/'
            cat_name = replace_illchar(
                response.xpath(xp.PREP_CATNAME % cat).extract_first())
            subcat_links = response.xpath(xp.PREP_SUBCAT_LINK % cat).extract()
            subcat_names = response.xpath(xp.PREP_SUBCAT_NAMES % cat).extract()
            for header in headers:
                f.write(' ' * 4 + header)
            for subcat_link, subcat_name in zip(subcat_links, subcat_names):
                f.write(' ' * 15 + "'%s',\n" % replace_illchar(subcat_name))
                item = catsItem()
                item['cat_name'] = cat_name
                item['subcat_name'] = replace_illchar(subcat_name)
                yield scrapy.Request(url='https://www.digikey.com' + subcat_link,
                                     callback=self.update_counts,
                                     meta={'item': item})
            f.write(']\n')
            # mysub_idx is written with one empty slot per sub-category.
            f.write(' ' * 4 + 'mysub_idx = ['
                    + ' ,' * (len(subcat_names) - 1) + ']\n')
            f.write(' ' * 4 + 'for idx, subcat in zip(mysub_idx, subcats):\n')
            f.write(' ' * 8 + "cat_tree['%s'][subcat] = {'myrootcat': myrootcat, "
                    "'mycat': mycat, 'mysubcat': mysubcat[idx]}\n\n" % cat_name)
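
# For reference, one category block of the generated part_tree_<MMDDYYYY>.py
# looks roughly like this after its blanks are filled in by hand (the
# category and sub-category names here are hypothetical; the real file is
# written with myrootcat, mycat, and the mysub_idx slots empty):
#
#     import os
#     import sys
#     sys.path.append(os.path.join('..', '..', '..', '..'))
#     from collections import defaultdict
#     import mycat_tree as mt
#     class PartTree():
#         cat_tree = defaultdict(dict)
#         myrootcat = 'Passives'                 # filled in by hand
#         mycat = 'Capacitors'                   # filled in by hand
#         mysubcat = mt.mytree[myrootcat][mycat]
#
#         subcats = ['Aluminum Electrolytic',
#                    'Ceramic',
#                    ]
#         mysub_idx = [0, 1]                     # filled in by hand
#         for idx, subcat in zip(mysub_idx, subcats):
#             cat_tree['Capacitors'][subcat] = {'myrootcat': myrootcat,
#                                               'mycat': mycat,
#                                               'mysubcat': mysubcat[idx]}
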
def drill_down(self, response):
    subcat_names = response.xpath(xp.DRILL_SUBCAT_NAMES).extract()
    subcat_links = response.xpath(xp.DRILL_SUBCAT_LINKS).extract()
    if subcat_names:
        # Intermediate category page: queue every required sub-category
        # for a deeper crawl.
        for subcat_name, subcat_link in zip(subcat_names, subcat_links):
            if self.cat_required(subcat_link):
                url = urljoin(self.base_url, subcat_link)
                db.prepdb.insert_one({'dist': self.name.split('_')[0],
                                      'link': url,
                                      'path': urlparse(url).path})
                db.prepdb.update_one({'dist': self.name.split('_')[0],
                                      'total_count': {'$exists': 1}},
                                     {'$inc': {'total_count': 1}})
                self.update_prog()
                yield scrapy.Request(url=url, callback=self.drill_down)
    else:
        # Leaf page: record the category/sub-category pair and its part count.
        cat_name = replace_illchar(
            response.xpath(xp.BREAD_CRUMBS1).extract_first())
        subcat_name = replace_illchar(
            response.xpath(xp.BREAD_CRUMBS2).extract()[-1])
        self.__class__.cat_tree[cat_name].add(subcat_name)
        self.update_counts(
            cat_name,
            replace_illchar(response.xpath(xp.PREP_COUNTS).extract_first(),
                            to_type=int),
            subcat_name,
            response.url)
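
# The '$inc' on total_count above only takes effect if a counter document
# already exists for this distributor. A minimal seeding sketch, assuming it
# runs once before the crawl starts (where this happens in the repo is not
# shown in this section; the helper name is hypothetical):
def seed_prep_counter(dist):
    """Hypothetical helper: create the per-distributor counter if missing."""
    if db.prepdb.find_one({'dist': dist, 'total_count': {'$exists': 1}}) is None:
        db.prepdb.insert_one({'dist': dist, 'total_count': 0})
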
def parse(self, response):
    cat = replace_illchar(response.xpath(xp.CAT).extract_first().strip())
    subcat = response.xpath(xp.SUBCAT).extract_first()
    subcat = replace_illchar(
        subcat[subcat.rindex(';') + 1:subcat.rindex('<')].strip())
    table_len = len(response.xpath(xp.TABLE_LEN))
    # Collect the table column titles, keyed by column position.
    table_heads = {}
    idx = xp.HEAD_START_IDX
    while True:
        head = response.xpath(xp.HEAD_TITLE % idx).extract()
        if head:
            table_heads[idx] = head[0]
            idx += 1
        else:
            break
    for idx in range(1, table_len + 1):
        part = {
            'part_num': replace_illchar(
                response.xpath(xp.PART_NUM % idx).extract_first().strip()),
            'dist_num': response.xpath(xp.DIST_NUM % idx).extract_first().strip(),
            'manufac': replace_illchar(
                response.xpath(xp.MANUFAC % idx).extract_first()),
            'descr': response.xpath(xp.DESCR % idx).extract_first().strip(),
            'unit_price': response.xpath(
                xp.UNIT_PRICE % idx).extract_first().strip().replace(',', ''),
            'dist': self.name,
            'dist_partlink': 'http://www.digikey.com'
                             + response.xpath(xp.DIST_LINK % idx).extract_first(),
            'pdf_link': response.xpath(xp.PDF_LINK % idx).extract_first(),
            'min_quan': response.xpath(
                xp.MIN_QUAN % idx).extract_first().strip().replace(',', ''),
            'date_scraped': datetime.datetime.utcnow()
        }
        for title_idx, table_title in table_heads.items():
            misc_data = response.xpath(
                xp.MISC_DATA % (idx, title_idx)).extract_first().strip()
            if misc_data:
                text = '%s %s' % (table_title, misc_data)
                # Special case for Digikey package dimensions: split the
                # combined "L x W" cell into separate length/width elements.
                if ('Size' in text and 'Dimension' in text
                        and 'L' in text and 'W' in text):
                    self.text_to.reset_result()
                    # Explicit decoration: wrap text_to.parse with the error
                    # logger, then call it on the cell text.
                    self.err_to.logger(text, 'info', part['part_num'])(
                        self.text_to.parse)(text)
                    text = ['Size Dimension Length %s%s'
                            % (self.text_to.result['param'][2][0],
                               self.text_to.result['param'][2][1]),
                            'Size Dimension Width %s%s'
                            % (self.text_to.result['param'][3][0],
                               self.text_to.result['param'][3][1])]
                    docs = list(self.docs.make_mongoelem(
                        ext_elems_to_analyze=text))
                else:
                    docs = [list(self.docs.make_mongoelem(
                        ext_elems_to_analyze=[text]))[0]]
                for sub_idx, doc in enumerate(docs):
                    col = '%s.%s' % (title_idx, sub_idx)
                    doc = {**doc,
                           'direct': 'v',
                           'manufac': part['manufac'],
                           'page_num': 0,
                           'part_num': part['part_num'],
                           'dist_num': part['dist_num'],
                           'col': col}
                    doc['word'] += lp.Parser.get_ngrams(doc['part_num']) \
                        + lp.Parser.get_ngrams(doc['manufac'])
                    db.partdb.update_one({'dist_num': part['dist_num'],
                                          'col': col},
                                         {'$setOnInsert': doc},
                                         upsert=True)
        try:
            if part['unit_price'][0] == '$':
                part['unit_price'] = float(part['unit_price'][1:])
            else:
                part['unit_price'] = float(part['unit_price'])
        except IndexError:
            part['unit_price'] = 'See distributor'
        except ValueError:
            pass
        if part['pdf_link']:
            # Digikey serves protocol-relative datasheet links.
            if part['pdf_link'][0:7] == '//media':
                part['pdf_link'] = 'http:' + part['pdf_link']
        else:
            part['pdf_link'] = ''
        part['partnum_manufac_ngram3'] = lp.Parser.get_ngrams(part['part_num']) \
            + lp.Parser.get_ngrams(part['manufac'])
        part['root_cat'] = pt.cat_tree[cat][subcat]['myrootcat']
        part['cat'] = pt.cat_tree[cat][subcat]['mycat']
        part['subcat'] = pt.cat_tree[cat][subcat]['mysubcat']
        db.invendb.update_one({'root_cat': part['root_cat'],
                               'cat': part['cat'],
                               'subcat': part['subcat']},
                              {'$inc': {'total': 1}},
                              upsert=True)
        # Volatile fields are refreshed on every scrape; everything else is
        # only written on first insert.
        always_update = {'unit_price': part['unit_price'],
                         'min_quan': part['min_quan'],
                         'date_scraped': part['date_scraped'],
                         'pdf_link': part['pdf_link'],
                         'dist_partlink': part['dist_partlink']}
        db.distdb.update_one({'part_num': part['part_num'],
                              'manufac': part['manufac']},
                             {'$setOnInsert': {k: v for k, v in part.items()
                                               if k not in always_update},
                              '$set': always_update},
                             upsert=True)
        db.manufacdb.update_one({'manufac': part['manufac']},
                                {'$setOnInsert': {
                                    'manufac': part['manufac'],
                                    'manufac_ngram3':
                                        lp.Parser.get_ngrams(part['manufac'])}},
                                upsert=True)
        processed_pdf = db.metadb.find_one(
            {'upart': part['part_num'] + '__' + part['manufac']})
        if part['pdf_link'] and (not processed_pdf
                                 or processed_pdf['processed'] == 'error'):
            item = partsItem()
            item['part_num'] = part['part_num']
            item['manufac'] = part['manufac']
            yield scrapy.Request(url=part['pdf_link'],
                                 callback=self.save_pdf,
                                 meta={'item': item})
        DigikeySpider.prog_count += 1
        DigikeySpider.batch_count += 1
        print_prog(self.name + ' Total progress',
                   DigikeySpider.prog_count, DigikeySpider.total_count,
                   left_just=20, endwith='')
        print_prog(' ' * 5 + 'Progress',
                   DigikeySpider.batch_count, cfg.BATCH_SIZE,
                   left_just=5, bar_length=20, endwith='\r')
    next_page = response.xpath(xp.NEXT_PAGE).extract_first()
    if next_page:
        next_page = 'https://www.digikey.com' + next_page
        db.cursordb.update_one({'ucat': cat + '__' + subcat,
                                'dist': DigikeySpider.name},
                               {'$set': {'current_link': next_page},
                                '$inc': {'current_count': table_len}})
        if DigikeySpider.batch_count < cfg.BATCH_SIZE:
            yield scrapy.Request(url=next_page, callback=self.parse)
    else:
        # Last page of this sub-category: reset the cursor to the start
        # link and mark the scan complete.
        start_link = db.cursordb.find_one(
            {'ucat': cat + '__' + subcat,
             'dist': DigikeySpider.name})['start_link']
        db.cursordb.update_one({'ucat': cat + '__' + subcat,
                                'dist': DigikeySpider.name},
                               {'$set': {'current_link': start_link,
                                         'scan_complete': True},
                                '$inc': {'current_count': table_len}})
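
# The pagination above keeps its state in one cursor document per
# category/sub-category pair. Its shape, as implied by the reads and writes
# in parse() (the values below are illustrative, not real data):
example_cursor_doc = {
    'ucat': 'Capacitors__Ceramic',  # hypothetical '<cat>__<subcat>' key
    'dist': 'digikey_parts',        # hypothetical spider name
    'start_link': 'https://www.digikey.com/products/en',  # first listing page
    'current_link': 'https://www.digikey.com/products/en',  # resume point
    'current_count': 0,             # rows scraped so far
    'scan_complete': False,         # set once NEXT_PAGE disappears
}
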
def parse(self, response):
    # A '.stop_spider' directory acts as a kill switch for the crawl.
    if os.path.isdir('.stop_spider'):
        print(f'\nStop crawling spider {self.name}')
        raise scrapy.exceptions.CloseSpider('Stopped by user')
    bread_crumbs = response.xpath(xp.BREAD_CRUMBS).extract()
    cat, subcat = bread_crumbs[self.bread_crumb_idx:]
    cat, subcat = replace_illchar(cat), replace_illchar(subcat)
    table_len = len(response.xpath(xp.TABLE_LEN))
    # Collect the table column titles, keyed by column position.
    table_heads = {}
    idx = xp.HEAD_START_IDX
    while True:
        head = response.xpath(xp.HEAD_TITLE % idx).extract()
        if head:
            table_heads[idx] = head[0].strip()
            idx += 1
        else:
            break
    try:
        for idx in range(self.start_row_idx, table_len + 1):
            part = {
                'part_num': replace_illchar(
                    response.xpath(xp.PART_NUM % idx).extract_first().strip()),
                'dist_num': response.xpath(
                    xp.DIST_NUM % idx).extract_first().strip(),
                'manufac': replace_illchar(
                    response.xpath(xp.MANUFAC % idx).extract_first()),
                'descr': response.xpath(xp.DESCR % idx).extract_first().strip(),
                'unit_price': replace_illchar(
                    response.xpath(xp.UNIT_PRICE % idx).extract_first().strip()),
                'dist': self.name,
                'dist_partlink': urljoin(
                    self.base_url,
                    response.xpath(xp.DIST_LINK % idx).extract_first()),
                'pdf_link': response.xpath(xp.PDF_LINK % idx).extract_first(),
                'min_quan': response.xpath(
                    xp.MIN_QUAN % idx).extract_first().strip().replace(',', ''),
                'date_scraped': datetime.datetime.utcnow()
            }
            for title_idx, table_title in table_heads.items():
                misc_data = response.xpath(
                    xp.MISC_DATA % (idx, title_idx)).extract_first().strip()
                if misc_data:
                    text = f'{table_title} {misc_data}'
                    docs = [list(self.docs.make_mongoelem(
                        ext_elems_to_analyze=[text]))[0]]
                    for sub_idx, doc in enumerate(docs):
                        col = '%s.%s' % (title_idx, sub_idx)
                        doc = {**doc,
                               'direct': 'v',
                               'manufac': part['manufac'],
                               'page_num': 0,
                               'part_num': part['part_num'],
                               'dist_num': part['dist_num'],
                               'col': col}
                        doc['word'] += get_ngrams(doc['part_num']) \
                            + get_ngrams(doc['manufac'])
                        db.partdb.update_one({'dist_num': part['dist_num'],
                                              'col': col},
                                             {'$setOnInsert': doc},
                                             upsert=True)
            # Cast the numeric fields; an empty value becomes a placeholder.
            for key, cast in (('unit_price', float), ('min_quan', int)):
                try:
                    part[key] = cast(part[key])
                except ValueError:
                    if not part[key]:
                        part[key] = 'See distributor'
            if not part['pdf_link']:
                part['pdf_link'] = ''
            part['partnum_manufac_ngram3'] = get_ngrams(part['part_num']) \
                + get_ngrams(part['manufac'])
            part['root_cat'] = pt.cat_tree[cat][subcat]['myrootcat']
            part['cat'] = pt.cat_tree[cat][subcat]['mycat']
            part['subcat'] = pt.cat_tree[cat][subcat]['mysubcat']
            db.invendb.update_one({'root_cat': part['root_cat'],
                                   'cat': part['cat'],
                                   'subcat': part['subcat']},
                                  {'$inc': {'total': 1}},
                                  upsert=True)
            # Volatile fields are refreshed on every scrape; everything else
            # is only written on first insert.
            always_update = {'unit_price': part['unit_price'],
                             'min_quan': part['min_quan'],
                             'date_scraped': part['date_scraped'],
                             'pdf_link': part['pdf_link'],
                             'dist_partlink': part['dist_partlink']}
            db.distdb.update_one({'part_num': part['part_num'],
                                  'manufac': part['manufac']},
                                 {'$setOnInsert': {k: v for k, v in part.items()
                                                   if k not in always_update},
                                  '$set': always_update},
                                 upsert=True)
            db.manufacdb.update_one({'manufac': part['manufac']},
                                    {'$setOnInsert': {
                                        'manufac': part['manufac'],
                                        'manufac_ngram3':
                                            get_ngrams(part['manufac'])}},
                                    upsert=True)
            processed_pdf = db.distdb.find_one(
                {'part_num': part['part_num'],
                 'manufac': part['manufac'],
                 'processed': {'$exists': True}})
            if part['pdf_link'] and (not processed_pdf
                                     or processed_pdf['processed'] == 'error'):
                db.distdb.update_one({'part_num': part['part_num'],
                                      'manufac': part['manufac']},
                                     {'$set': {'processed': 'pending',
                                               'page': 0}})
                if cfg.KEEP_GRIDFS_PDF:
                    item = PartsItem()
                    item['part_num'] = part['part_num']
                    item['manufac'] = part['manufac']
                    yield scrapy.Request(url=part['pdf_link'],
                                         callback=self.save_pdf,
                                         meta={'item': item})
                else:
                    self.logger.info('### Saved PDF link %s__%s ###'
                                     % (part['part_num'], part['manufac']))
            self.__class__.prog_count += 1
            self.__class__.batch_count += 1
            print_prog(self.name + ' total progress',
                       self.prog_count, self.total_count,
                       left_just=20, endwith='')
            print_prog(' ' * 5 + 'Progress',
                       self.batch_count, cfg.BATCH_SIZE,
                       left_just=5, bar_length=20, endwith='\r')
        next_page = response.xpath(xp.NEXT_PAGE).extract_first()
        dup_req = False
    except AttributeError:
        # A missing field usually means the page rendered incompletely;
        # re-request the same URL.
        next_page = response.url
        self.logger.debug(f'---> Got invalid data from {response.url}, '
                          f'trying again with {next_page}')
        dup_req = True
    if next_page:
        next_page = urljoin(self.base_url, next_page)
        if dup_req:
            # Retry: update only the link, not the count.
            db.cursordb.update_one({'ucat': cat + '__' + subcat,
                                    'dist': self.name},
                                   {'$set': {'current_link': next_page}})
        else:
            db.cursordb.update_one({'ucat': cat + '__' + subcat,
                                    'dist': self.name},
                                   {'$set': {'current_link': next_page},
                                    '$inc': {'current_count': table_len}})
        if self.batch_count < cfg.BATCH_SIZE:
            yield scrapy.Request(url=next_page, callback=self.parse,
                                 dont_filter=dup_req)
    else:
        # Last page of this sub-category: reset the cursor to the start
        # link and mark the scan complete.
        start_link = db.cursordb.find_one(
            {'ucat': cat + '__' + subcat, 'dist': self.name})['start_link']
        db.cursordb.update_one({'ucat': cat + '__' + subcat,
                                'dist': self.name},
                               {'$set': {'current_link': start_link,
                                         'scan_complete': True},
                                '$inc': {'current_count': table_len}})
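
# save_pdf is not shown in this section. A minimal sketch of such a callback,
# assuming the datasheet bytes go into a GridFS bucket on the same MongoDB
# instance (the database name, bucket, and metadata fields below are
# assumptions, not the repo's actual implementation):
import gridfs
from pymongo import MongoClient

def save_pdf(self, response):
    """Hypothetical sketch: persist the downloaded datasheet to GridFS."""
    item = response.meta['item']
    fs = gridfs.GridFS(MongoClient().parts)  # 'parts' database name is assumed
    fs.put(response.body,
           filename='%s__%s.pdf' % (item['part_num'], item['manufac']),
           metadata={'part_num': item['part_num'],
                     'manufac': item['manufac']})
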