def _signal_kill(self, signum, _):
    signame = signal_names[signum]
    log.msg('Received %s twice, forcing unclean shutdown' % signame,
            level=log.INFO)
    log.log_level = log.SILENT  # disable logging of confusing tracebacks
    reactor.callFromThread(self.engine.kill)
    install_shutdown_handlers(signal.SIG_IGN)
def detail(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    variants_price = hxs.select("//div[@class='fleft catbox pricerate']//span/text()").extract()
    variants_seller = hxs.select("//div[@class='catbox fleft storeimage']/img/@alt").extract()
    items = []
    # extract() always returns a list, so a plain truthiness check covers both
    # the empty and the missing case
    if variants_price and variants_seller:
        for price, seller in zip(variants_price, variants_seller):
            item = BillionPricesIndiaItem()
            item['date'] = time.strftime("%d/%m/%Y")
            item['vendor'] = seller.split(" ")[-1]
            item['product'] = response.url.split('/')[-1].split(".")[0]
            itemprice = re.sub('[,]', '', price).split(" ")[-1]
            item['category'] = "mobiles"
            item['price'] = float(itemprice)
            item['quantity'] = '1'
            item['measure'] = 'pcs'
            item['unitprice'] = float(itemprice)
            items.append(item)
    return items
def process_item(self, item, spider):
    if self.enabled:
        item_name = item['name']
        if item_name in JmItemValidator.scraped_items:
            if JmItemValidator.scraped_items[item_name] == item['stored_date']:
                raise DropItem("Item is already scraped.")
        if isinstance(item, JmPromotionItem):
            log.msg("got JM promotion item.", log.DEBUG)
            item.log_self(log.INFO)
            if item.record_exist(self.conn):
                log.msg("JmPromotionItem already exists in MySQL DB.", log.WARNING)
                raise DropItem("JmPromotionItem is already in db")
        if isinstance(item, JmProductItem):
            log.msg("got JM product item.", log.DEBUG)
            item.log_self(log.INFO)
            if item.record_exist(self.conn):
                log.msg("JmProductItem already exists in MySQL DB.", log.WARNING)
                raise DropItem("JmProductItem is already in db")
            # get the brand_id
            brand_id = self.get_brand_id(item)
            if brand_id is None:
                raise DropItem('brand id is None.')
            item['brand_id'] = brand_id
            log.msg("brand id:%s, brand:%s" % (item['brand_id'], item['brand']),
                    log.DEBUG)
        JmItemValidator.scraped_items[item['name']] = item['stored_date']
    return item
def parse_boletin(self, response):
    hxs = HtmlXPathSelector(response)
    urls = hxs.select("//div[@id='titular']/a/@href").extract()
    log.msg('Found %i items' % (len(urls),))
    for u in urls:
        log.msg('Enqueue %s' % (u,))
        yield Request(u, callback=self.parse_doc)
def parse(self, response):
    log.msg(response.url)
    baseurl = self.__getHostURL(response.url) + "mobiles/"
    for url in response.xpath('//li[@class="page"]/a/@href').extract():
        if url.split("/")[2] not in self.crawledURL:
            yield scrapy.Request(baseurl + url.split("/")[2], callback=self.search)
            self.crawledURL.append(url)
def insertItem(self, item, insert_mode='replace'):
    if hasattr(item, 'returning') and item.returning:
        rtn_clause = "RETURNING %s as return_id" % item.returning
    else:
        rtn_clause = "RETURNING 1 as return_id"
    table_name = item.__class__.__name__
    fieldnms = item.keys()
    values = item.values()
    try:
        keyfields = item.keyFields()
    except AttributeError:
        keyfields = None
    # postgres does not do REPLACE so we run this method
    if insert_mode.lower() == 'replace':
        return self._do_replace(table_name, fieldnms, values, rtn_clause, keyfields)
    field_str = '"%s"' % '", "'.join(fieldnms)
    value_str = ('%s,' * len(values))[:-1]
    sql = ('%s INTO "%s" (%s) VALUES (%s) %s;'
           % (insert_mode, table_name, field_str, value_str, rtn_clause))
    c = self.db.cursor(cursor_factory=DictCursor)
    try:
        c.execute(sql, values)
    except:
        log.msg("insertItem error on %s:\n\t%s" % (table_name, c.mogrify(sql, values)),
                level=log.INFO)  # DEBUG
        raise
    return c.fetchone()['return_id']
def updateRecipe(self, session, recipe, item):
    itemIngredients = item['ingredients']
    categories = []
    if 'recipeCategory' in item:
        categories = item['recipeCategory']
    # Regenerate the ingredients for the recipe
    recipe.ingredients = []
    session.commit()
    recipe.fromdict(item)
    for ing in itemIngredients:
        log.msg(u'Adding ingredient to recipe {0}: {1}'.format(recipe.id, ing))
        ingredient = RecipeIngredients(ingredient=ing)
        ingredient.recipe_id = recipe.id
        session.add(ingredient)
    for cat in categories:
        category = session.query(Category).filter_by(name=cat).first()
        if category is None:
            category = Category(name=cat)
            session.add(category)
    session.commit()
def parse_news(self, response):
    log.msg("Start to parse news " + response.url, level=log.INFO)
    item = SpiderNewsAllItem()
    day = title = _type = keywords = url = article = ''
    url = response.url
    day = response.meta['day']
    title = response.meta['title']
    _type = response.meta['_type']
    response = response.body
    soup = BeautifulSoup(response)
    # try:
    #     items_keywords = soup.find_all(class_='hotword')
    #     for i in range(0, len(items_keywords)):
    #         keywords += items_keywords[i].text.strip() + ' '
    # except:
    #     log.msg("News " + title + " has no keywords!", level=log.INFO)
    try:
        article = soup.find(class_='story').text.strip()
    except:
        log.msg("News " + title + " has no article!", level=log.INFO)
    item['title'] = title
    item['day'] = day
    item['_type'] = _type
    item['url'] = url
    item['keywords'] = keywords
    item['article'] = article
    item['site'] = u'南华早报'
    return item
def parse(self, response):
    url = response.url
    _type = self.get_type_from_url(url)
    items = []
    try:
        response = response.body
        soup = BeautifulSoup(response)
        links = soup.find_all(class_=re.compile('post-area'))
    except:
        items.append(self.make_requests_from_url(url))
        log.msg("Page " + url + " parse ERROR, try again !", level=log.ERROR)
        return items
    need_parse_next_page = True
    if len(links) > 0:
        for i in range(0, len(links)):
            url_news = 'http://www.nanzao.com' + links[i].h2.a['href']
            title = links[i].h2.a.text.strip()
            day = links[i].time['datetime'].replace('-', '')
            need_parse_next_page = self.is_news_not_saved(title, url_news)
            if not need_parse_next_page:
                break
            items.append(self.make_requests_from_url(url_news).replace(
                callback=self.parse_news,
                meta={'_type': _type, 'day': day, 'title': title}))
    if u'下一頁>' in soup.find(class_='paging').text:
        page_next = 'http://www.nanzao.com' + soup.find_all("a", text=u"下一頁>")[0]['href']
        if need_parse_next_page:
            items.append(self.make_requests_from_url(page_next))
    return items
def parse_cmp_gps(self, response):
    data = ''
    cmp = response.meta['Company']
    try:
        data = response.body.decode('GBK')
        if data == '':
            yield cmp
            log.msg(format='companyGps: %(request)s get fail, response is blank.',
                    level=log.ERROR, request=response.url)
            return
    except:
        yield cmp
        log.msg(u'返回企业gps结果为非GBK编码网页', level=log.INFO)
        return
    try:
        match = re.search(r'''lng: (.*),\r''', data, re.I | re.M)
        if match:
            cmp['GisLongitude'] = match.group(1)
        match = re.search(r'''lat: (.*),\r''', data, re.I | re.M)
        if match:
            cmp['GisLatitude'] = match.group(1)
        yield cmp
    except:
        yield cmp
def process_item(self, item, spider):
    for data in item:
        if not data:
            raise DropItem("Missing {0}!".format(data))
    self.collection.update({'url': item['url']}, dict(item), upsert=True)
    log.msg("Question added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
def spidercls_for_request(spidermanager, request, default_spidercls=None,
                          log_none=False, log_multiple=False):
    """Return a spider class that handles the given Request.

    This will look for the spiders that can handle the given request (using
    the spider manager) and return a Spider class if (and only if) there is
    only one Spider able to handle the Request.

    If multiple spiders (or no spider) are found, it will return the
    default_spidercls passed. It can optionally log if multiple or no spiders
    are found.
    """
    snames = spidermanager.find_by_request(request)
    if len(snames) == 1:
        return spidermanager.load(snames[0])
    if len(snames) > 1 and log_multiple:
        log.msg(format='More than one spider can handle: %(request)s - %(snames)s',
                level=log.ERROR, request=request, snames=', '.join(snames))
    if len(snames) == 0 and log_none:
        log.msg(format='Unable to find spider that handles: %(request)s',
                level=log.ERROR, request=request)
    return default_spidercls
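A minimal usage sketch for the helper above, assuming a `spidermanager` object exposing the same `find_by_request`/`load` methods used in the function; the request URL is purely illustrative.

# Hedged usage sketch: resolve which spider class handles an arbitrary URL.
# `spidermanager` is assumed to be the same object passed above; the URL is made up.
from scrapy.http import Request

request = Request('http://example.com/some/page')
spidercls = spidercls_for_request(spidermanager, request,
                                  default_spidercls=None,
                                  log_none=True, log_multiple=True)
if spidercls is not None:
    log.msg('Resolved spider: %s' % spidercls.name, level=log.INFO)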
def parse_follow_page(self, new_id, category, user):
    # category - 'followees' or 'follower'
    user[category] = []
    self.driver.get('http://m.zhihu.com/people/%s/%s' % (new_id, category))
    try:
        follow_list = self.driver.find_element_by_css_selector('.zh-general-list')
    except:
        if settings.DEBUG_INFO:
            log.msg("no %s for %s" % (category, new_id), level=log.INFO)
        return
    script_content = str(follow_list.get_attribute('data-init'))
    _xsrf = str(self.driver.find_element_by_name('_xsrf').get_attribute('value'))
    post_data = ast.literal_eval(script_content)
    post_data['_xsrf'] = _xsrf
    post_data['method'] = 'next'
    del post_data['nodename']
    page = 1
    while True:
        print page
        post_data['params']['offset'] = (page - 1) * settings.FOLLOW_PER_PAGE
        follow_res = self.block_ajax_load(settings.AJAX_URL[category], "post", post_data)
        if not follow_res:
            break
        ex = r'"http://www\.zhihu\.com/people/([^"]+)"'
        extract_ids = re.findall(ex, follow_res)
        if len(extract_ids) == 0:
            break
        for followee_id in extract_ids:
            user[category].append(re.sub(r'\\', '', followee_id))
        page += 1
def ip138_parse(self, response):
    """Parse the information from ip138."""
    detail_list = response.meta["detail_list"]
    num = response.meta["num"]
    item = UnispiderItem()
    try:
        assert response.status == 200, "error in ip138_parse assert"
        sel = response.selector
        district_code = sel.xpath(u"//td[text()='区 号']/following-sibling::td[1]/text()").extract()
        detail_list = for_ominated_data(detail_list, district_code)  # area code
        fox_code = sel.xpath(u"//td[text()='邮 编']/following-sibling::td[1]/text()").extract()
        detail_list = for_ominated_data(detail_list, fox_code)  # postal code
        local = sel.xpath(u"//td[text()='卡号归属地']/following-sibling::td[1]/text()").extract()
        html_parse = HTMLParser.HTMLParser()
        #local = html_parse.unescape(local)  # unescape the whitespace entities in the html
        try:
            local = local[0].strip()
            local = local.split(html_parse.unescape(" "))  # split the home location apart, as requested by Lao Ma
        except Exception, e:
            log.msg("归属地_error %s" % "|".join(detail_list), level=log.ERROR)
        detail_list = for_ominated_data(detail_list, local)  # card number home location
        try:
            item["content"] = "|".join(detail_list)
            yield item
        except Exception, e:
            log.msg("error detail_list join num=%s, info=%s" % (num, "\001".join(detail_list)))
def parse(self, response):
    # We'll be given a page here and will have to return
    # all (up to 12) book items
    # let's get the category first
    pgCat = response.xpath('//h2[@id="s-result-count"]/span/a/text()').extract()
    if len(pgCat) == 0:
        # sometimes it isn't wrapped in a span
        pgCat = response.xpath('//h2[@id="s-result-count"]/a/text()').extract()
        if len(pgCat) == 0:
            log.msg("Couldn't parse base categories: " + response.url,
                    level=log.WARNING)
    pgCat = ".".join(pgCat)
    try:
        pgCat = pgCat + "." + response.xpath('//h2[@id="s-result-count"]/span/span/text()').extract()[0]
    except IndexError:
        try:
            pgCat = pgCat + "." + response.xpath('//h2[@id="s-result-count"]/span/text()').extract()[0]
        except IndexError:
            log.msg("Couldn't parse category title: " + response.url,
                    level=log.WARNING)
    for result in response.xpath('//div[starts-with(@id, "result_")]'):
        baseLink = result.xpath('div[@class="data"]/h3/a')
        url = baseLink.xpath('@href').extract()[0]
        try:
            name = baseLink.xpath('text()').extract()[0]
        except IndexError:
            name = baseLink.xpath('span[@title]').extract()[0]
        retVal = AmazonBookOverviewItem()
        retVal['url'] = url
        retVal['name'] = name
        retVal['category'] = pgCat
        yield retVal
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = request.headers.get('Referer')
        msg = 'Image (unknown-error): Error downloading %s from %s referred in <%s>: %s' \
              % (self.MEDIA_NAME, request, referer, str(failure))
        log.msg(msg, level=log.WARNING, spider=info.spider)
    raise ImageException
def __init__(self, *args, **kwargs):
    msg = "Django settings used: %s" % os.environ.get("DJANGO_SETTINGS_MODULE")
    log.msg(msg, log.INFO)
    super(DjangoBaseSpider, self).__init__(None, **kwargs)
    self._check_mandatory_vars()
def __init__(self): """ Constructor """ # Configure the connection self.configure() if self.config['replica_set'] is not None: connection = MongoReplicaSetClient( self.config['uri'], replicaSet=self.config['replica_set'], w=self.config['write_concern'], fsync=self.config['fsync'], read_preference=ReadPreference.PRIMARY_PREFERRED) else: # Connecting to a stand alone MongoDB connection = MongoClient( self.config['uri'], fsync=self.config['fsync'], read_preference=ReadPreference.PRIMARY) # Set up the collection database = connection[self.config['database']] self.collection = database[self.config['collection']] log.msg('Connected to MongoDB {0}, using "{1}/{2}"'.format( self.config['uri'], self.config['database'], self.config['collection'])) # Ensure unique index if self.config['unique_key']: self.collection.ensure_index(self.config['unique_key'], unique=True) log.msg('Ensuring index for key {0}'.format( self.config['unique_key']))
def parse(self, response):
    print 'do parse function'
    if response.body.find('feedBackUrlCallBack') != -1:
        data = json.loads(re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I).group(1))
        userinfo = data.get('userinfo', '')
        if len(userinfo):
            log.msg('user id %s' % userinfo['userid'], level=log.INFO)
            assert userinfo['userid'] == self.username
            self.logined = True
            bootstrap = settings.get('BOOTSTRAP')
            log.msg('bootstrap from %s' % bootstrap, level=log.INFO)
            # FIXME: use last scheduled time instead of today, otherwise queue filter will not work
            today = datetime.now()
            if bootstrap == 'file':
                lines = tuple(codecs.open('items.txt', 'r', 'utf-8'))
                for line in lines:
                    if line.startswith("#"):
                        continue
                    start = _epoch()
                    url = QueryFactory.create_timerange_query(
                        urllib.quote(line.encode('utf8')), start, today)
                    request = Request(url=url, callback=self.parse_weibo, meta={
                        'query': line,
                        'start': start.strftime("%Y-%m-%d %H:%M:%S"),
                        'end': today.strftime("%Y-%m-%d %H:%M:%S"),
                        'last_fetched': today.strftime("%Y-%m-%d %H:%M:%S")})
                    yield request
        else:
            self.log('login failed: errno=%s, reason=%s'
                     % (data.get('errno', ''), data.get('reason', '')))
def parse(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    items = []
    variants_date = hxs.select("//span[@class='normal']//text()").extract()
    variants_price = hxs.select("//table[@id='objContPreviousPrices_grdPreviousPrices']//tr//td[@class='normal']//text()").extract()
    price_items = self.__group_iter(variants_price, 4)
    av_price = []
    for price_list in price_items:
        av_price.append(reduce(lambda x, y: float(x) + float(y) / float(len(price_list)),
                               price_list, 0))
    for price, date in zip(variants_price, variants_date):
        item = BillionPricesIndiaItem()
        quantity = '1 lt'
        item['date'] = date
        item['vendor'] = "ioc"
        item['product'] = "gasoline"
        item['category'] = "oil and gas"
        value, measure, unitprice = self.__unit_price(price, quantity)
        item['price'] = price
        item['quantity'] = value
        item['measure'] = measure
        item['unitprice'] = unitprice
        items.append(item)
    return items
def dl_success(self, response, request, item, spider):
    referer = request.headers.get('Referer')
    if response.status != 200:
        msg = ('{cls}: Got ({status}) downloading {request} referred in '
               '{referer}'.format(cls=self.__class__.__name__,
                                  status=response.status,
                                  request=request, referer=referer))
        raise TrackPipelineDropItem(msg)
    if not response.body:
        msg = ('Empty response body: {request} referred in '
               '<{referer}>'.format(request=request, referer=referer))
        raise TrackPipelineDropItem(msg)
    log.msg(format='Downloaded: %(request)s referred in <%(referer)s>',
            level=log.DEBUG, spider=spider, request=request, referer=referer)
    spider.crawler.stats.inc_value(
        '{}/file_download_count'.format(self.__class__.__name__),
        spider=spider)
    key_name = '{}/track.mp3'.format(item['track_id'])
    dfd = threads.deferToThread(self.s3.set_from_string, key_name, response.body)
    dfd.addCallback(self.ul_success, item, spider)
    return dfd
def _sent_failed(self, failure, to, cc, subject, nattachs):
    errstr = str(failure.value)
    log.msg(format='Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
                   'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
                   '- %(mailerr)s',
            level=log.ERROR, mailto=to, mailcc=cc, mailsubject=subject,
            mailattachs=nattachs, mailerr=errstr)
def process_item(self, item, spider):
    info = dict(item)
    # TODO: revisit how Python converts between list and str encodings here;
    # the Chinese text is confusing
    jsonstr = json.dumps(info).decode('unicode_escape').replace(" ", "").replace("\n", "")
    log.msg(jsonstr)
    self.file.write("%s,%s,%s\n" % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                                    jsonstr, str(jsonstr == self.result)))
    if item['is_text'] == '':
        return item
    obj = json.loads(jsonstr)
    # 2016-11-01 22:30:00: the official site renamed the field
    # "新款-经批准后发售" to "新款" while the button stays disabled,
    # so the check now relies on the button attribute instead.
    if obj["submit_button"] != ["disabled"]:
        self.mail_sender.send_mail(jsonstr)
        self.file.write("sendmail!")
    # print info["submit_button"] == [n.encode("utf-8") for n in "disabled"]  -- why aren't these equal?
    # if jsonstr != self.result:
    #     self.mail_sender.send_mail(jsonstr)
    #     self.file.write("sendmail!")
    return item
def process_request(self, request, spider):
    if spider.to_be_killed:
        log.msg("Spider has been killed, ignoring request to %s" % request.url,
                log.DEBUG, spider=spider)
        # raise IgnoreRequest()
        return request
    else:
        return None
def parse_synonyms(self, sel):
    """
    This function scrapes the list of Names and Identifiers
    :param sel: a Selector object of the whole page
    :return: a list of Requests
    """
    requests = []
    synonyms = []
    # Exact type for this is unknown, but equivalent to Validated by Expert
    for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
        name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
        synonyms.append(self.new_synonym(syn, name, 'expert'))
    # These synonyms are labeled by ChemSpider as "Validated by Experts"
    for syn in sel.xpath('//p[@class="syn"][strong]'):
        name = syn.xpath('strong/text()').extract()[0]
        synonyms.append(self.new_synonym(syn, name, 'expert'))
    # These synonyms are labeled by ChemSpider as "Validated by Users"
    for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_confirmed"]]'):
        name = syn.xpath('span[@class="synonym_confirmed"]/text()').extract()[0]
        synonyms.append(self.new_synonym(syn, name, 'user'))
    # These synonyms are labeled as "Non-validated" and assumed unreliable
    for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
        name = syn.xpath('span[@class=""]/text()').extract()[0]
        synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))
    # [TODO] - confirm if English User-Validated synonyms are OK too
    for syn in synonyms:
        if syn['category'] == 'expert' and syn['language'] == 'English':
            log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
            self._spider.get_synonym_requests(syn['name'])
    return requests
def _debug_set_cookie(self, response, spider):
    if self.debug:
        cl = response.headers.getlist("Set-Cookie")
        if cl:
            msg = "Received cookies from: %s" % response + os.linesep
            msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
            log.msg(msg, spider=spider, level=log.DEBUG)
def import_to_ckan(created_files):
    importer = CKANImporter()
    for f in created_files:
        m = 'Importing %s' % str(f)
        log.msg(m, level=log.DEBUG)
        importer.import_package(f['archivo'], f['modalidad'])
        log.msg("Paso el importer", level=log.DEBUG)
def parse_parts2(self, response):
    log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
    ua = response.request.headers['User-Agent']
    log.msg("\tua: %s" % ua, level=log.DEBUG)
    for part in response.css('table.parts > tbody > tr'):
        il = ItemLoader(item=CarPart(), selector=part)
        il.add_xpath('shop_city', "td[@class='shop']/a/text()")
        il.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")
        shop_url = il.get_xpath("td[@class='shop']/a/@href", TakeFirst())
        photo_url = il.get_xpath("td[@class='photo']/a/@href", TakeFirst())
        il.add_value('shop_url', urljoin(self.main_url, shop_url))
        il.add_value('ext_link', urljoin(self.main_url, photo_url))
        il.add_xpath('info', "td[@class='info']//text()")
        il.add_xpath('price', "td[@class='price']//text()")
        il.add_value('brand', response.meta.get('brand'))
        il.add_value('model', response.meta.get('model'))
        il.add_value('car_part', response.meta.get('car_part'))
        il.add_value('category', response.meta.get('category'))
        item = il.load_item()
        if item.is_valid():
            yield item
def _debug_cookie(self, request, spider):
    if self.debug:
        cl = request.headers.getlist("Cookie")
        if cl:
            msg = "Sending cookies to: %s" % request + os.linesep
            msg += os.linesep.join("Cookie: %s" % c for c in cl)
            log.msg(msg, spider=spider, level=log.DEBUG)
def __init__(self):
    run_time = str(time.strftime("%y%m%d%H%M%s"))
    result_table = run_time + '_result'
    self.result_table = result_table
    log.msg("Scraped data will be stored in table: %s" % result_table, level=log.INFO)
    self.conn = self.db_conn
    self.cursor = self.conn.cursor()
    # -- init result_table
    self.cursor.execute(
        " CREATE TABLE " + result_table + """(
            id int AUTO_INCREMENT,
            locality varchar(255),
            price FLOAT,
            size FLOAT,
            date date,
            city varchar(255),
            rent_buy_new varchar(255),
            item_link text,
            city_link text,
            total varchar(255),
            PRIMARY KEY (id)
        ) """
    )
    self.conn.commit()
def process_item(self, item, spider):
    address = item['address'] or ''
    if not address or address == 'ABSCONDED':
        log.msg('Item has no address, skip geocode', level=log.WARNING)
        return item
    log.msg('Geocoding address: "%s"' % address)
    if str(address) in self.geocoder_cache:
        log.msg('Geolocation found in cache, using')
        loc = self.geocoder_cache.get(str(address))
    else:
        try:
            geo_response = self.geocoder.geocode(address)
            log.msg('Location found')
            log.msg(str(geo_response), level=log.DEBUG)
            loc = {
                'address': geo_response.address,
                'latitude': geo_response.latitude,
                'longitude': geo_response.longitude
            }
        except:
            log.msg('GEOCODING ERROR', level=log.ERROR)
            return item
    item['address'] = loc['address']
    item['lat'] = loc['latitude']
    item['lng'] = loc['longitude']
    log.msg('Writing geolocation object to cache')
    log.msg(str(loc), level=log.DEBUG)
    self.geocoder_cache[str(address)] = loc
    # self.geocoder_cache.sync()
    return item
def open(self, spider):
    log.msg('Starting frontier', log.INFO)
    if not self.frontier.manager.auto_start:
        self.frontier.start()
logger = logging.getLogger('scrapy')

from IPython.core.debugger import Tracer


class RandomProxy(object):
    def __init__(self, settings):
        # Tracer()()
        self.proxy_list = settings.get('PROXY_LIST')
        fin = open(self.proxy_list)
        self.proxies = {}
        # read the file once; calling readlines() twice would leave the second
        # loop with an exhausted file object
        lines = fin.readlines()
        if len(lines) == 0:
            Tracer()()
            log.msg('The proxy_list is empty')
            return
        for line in lines:
            parts = re.match('(\w+://)(\w+:\w+@)?(.+)', line)
            if parts is None:
                Tracer()()
                log.msg('Did not read the line')
                return
            # Cut trailing @
            if parts.group(2):
                user_pass = parts.group(2)[:-1]
            else:
                user_pass = ''
def media_downloaded(self, response, request, info): """ Handler for success downloads. """ referer = request.headers.get('Referer') if response.status != 200: log.msg( format= '%(medianame)s (code: %(status)s): Error downloading %(medianame)s from %(request)s referred in <%(referer)s>', level=log.WARNING, spider=info.spider, medianame=self.MEDIA_NAME, status=response.status, request=request, referer=referer) raise BookFileException(request.url, '%s: download-error' % (request.url, )) if not response.body: log.msg( format= '%(medianame)s (empty-content): Empty %(medianame)s from %(request)s referred in <%(referer)s>: no-content', level=log.WARNING, spider=info.spider, medianame=self.MEDIA_NAME, request=request, referer=referer) raise BookFileException(request.url, '%s: empty-content' % (request.url, )) status = 'cached' if 'cached' in response.flags else 'downloaded' log.msg( format= '%(medianame)s (%(status)s): Downloaded %(medianame)s from %(request)s referred in <%(referer)s>', level=log.DEBUG, spider=info.spider, medianame=self.MEDIA_NAME, status=status, request=request, referer=referer) if self.is_valid_content_type(response): raise BookFileException( request.url, '%s: invalid-content_type' % (request.url, )) filename = self.get_file_name(request, response) if not filename: raise BookFileException(request.url, '%s: noaccess-filename' % (request.url, )) self.inc_stats(info.spider, status) try: key = self.file_key( request.url) #return the SHA1 hash of the file url book_file_id, checksum = self.store.persist_file( key, response.body, info, filename) except BookFileException as exc: whyfmt = '%(medianame)s (error): Error processing %(medianame)s from %(request)s referred in <%(referer)s>: %(errormsg)s' log.msg(format=whyfmt, level=log.WARNING, spider=info.spider, medianame=self.MEDIA_NAME, request=request, referer=referer, errormsg=str(exc)) raise return { 'url': request.url, 'book_file_id': book_file_id, 'checksum': checksum }
def close_spider(self, spider, reason):
    if self._dump:
        log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self.get_stats()),
                spider=spider)
    self._persist_stats(self.get_stats(), spider)
def process_response(self, request, response, spider): print('catch url now: %s' % response.url) if response.status == 418: # 如果收到 418,重新发送这个请求 if request.meta['repeat_times'] < self.__repeat_times: request.meta['repeat_times'] = request.meta['repeat_times'] + 1 log.msg(message=time.strftime( "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") + spider.name + ": restart crawl url:" + response.url, level=log.INFO) # time.sleep(self.__sleep_time) return request else: # 如果重复爬取次数已经到达上限,标记该response为'stop_catch'并将其返回至 print(spider.name + " cannot catch this url!") log.msg(message=time.strftime( "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") + spider.name + ": having repeating %d times! url:%s. stop catch!" % self.__repeat_times, level=log.INFO) request.meta['stop_catch'] = True return response try: # 检查返回的json数据是否为空,若为空则表明该页不存在数据或者是爬取数据失败 # 根据相应的情况对response对象进行处理 parse_json = json.loads(response.text) if parse_json['ok'] == 0: if request.meta['repeat_times'] < self.__repeat_times: request.meta[ 'repeat_times'] = request.meta['repeat_times'] + 1 print( "[WeiboSpiderRetryMiddleware] catch empty json file! retry! url:%s" % request.url) log.msg( message=time.strftime( "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") + "Middleware: catch empty json file! retry! url:%s, retry times:%d" % (request.url, request.meta['repeat_times']), level=log.INFO) return request else: # log.msg(message=time.strftime("%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") + # spider.name + " cannot catch this url! url: " + request.url, level=log.INFO) raise IgnoreRequest else: request.meta['parse_json'] = parse_json return response except json.JSONDecodeError: log.msg(message=time.strftime( "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") + "catch html file!", level=log.INFO) if request.meta['repeat_times'] < self.__repeat_times: request.meta['repeat_times'] = request.meta['repeat_times'] + 1 print( "[WeiboSpiderRetryMiddleware] catch empty json file! retry! url:%s" % request.url) log.msg( message=time.strftime( "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") + "Middleware: catch empty json file! retry! url:%s, retry times:%d" % (request.url, request.meta['repeat_times']), level=log.INFO) return request else: print(spider.name + " cannot catch this url! url: " + request.url) raise IgnoreRequest
def __init__(self):
    log.msg('Initializing geocoder pipeline')
    self.geocoder = GoogleV3()
    self.geocoder_cache = {}
def process_response(self, request, response, spider):
    """Handle the returned response."""
    # If the response status is not 200, re-issue the current request
    # through a fresh proxy.
    if response.status != 200:
        log.msg('-' * 10, level=log.ERROR)
        log.msg(response.url, level=log.ERROR)
        log.msg(request.body.encode('utf-8'), level=log.ERROR)
        log.msg(response.status, level=log.ERROR)
        log.msg(request.meta['proxy'], level=log.ERROR)
        log.msg('proxy block!', level=log.ERROR)
        log.msg('-' * 10, level=log.ERROR)
        proxy = self.get_random_proxy()
        # attach a new proxy to the current request
        request.meta['proxy'] = 'http://%s' % proxy
        return request
    return response
def process_car(self, cursor, item): """ insert & update Cars """ def process_make(cursor, make): """ check if the make & model exist in the _variations, if they do not, inserting it into _hold to manually process later """ # make make = urllib.unquote_plus(make) sql = "select id from master_makes_variations use index(idx_make) where make = %s;" parameters = (make) cursor.execute(sql, parameters) result = cursor.fetchone() make_id = None if not result: sql = "select id from master_makes_hold use index (idx_make) where make = %s;" parameters = (make) cursor.execute(sql, parameters) result = cursor.fetchone() if not result: sql = "insert into master_makes_hold(make) values (%s);" parameters = (make) cursor.execute(sql, parameters) cursor.execute('commit;') log.msg('[UNFOUND] make - %s' % make, level=log.INFO) return cursor.lastrowid else: return None else: log.msg('[FOUND] make - %s' % make, level=log.INFO) return result['id'] def process_model(cursor, model, make_id): """ check if the model exists in the _variations, if it does not, inserting it into _hold to manually process later """ model = urllib.unquote_plus(model) sql = "select id from master_models_variations use index (idx_model) where model = %s;" parameters = (model) cursor.execute(sql, parameters) result = cursor.fetchone() if not result: sql = "select id from master_models_hold use index (idx_model) where model = %s;" parameters = (model) cursor.execute(sql, parameters) result = cursor.fetchone() if not result: sql = "insert into master_models_hold(model, fk_make) values (%s, %s);" parameters = (model, str(make_id)) cursor.execute(sql, parameters) cursor.execute('commit;') log.msg('[UNFOUND] model - %s' % model, level=log.INFO) else: log.msg('[FOUND] model - %s' % model, level=log.INFO) # Check if Car's Vin is existed sql = "select RowNum from master_vin use index(Idx_VIN) where VIN = %s limit 1;" if item.get('vin') is not None: parameters = (item.get('vin')) else: parameters = ("") cursor.execute(sql, parameters) result = cursor.fetchone() if result: # Vin is duplicated, then set target table is _history target_table = "_history" else: # Vin is new, then set target table is _cars target_table = "_cars" # check if Car's ID is existed sql = "".join(("select id from ", item.get('site'), "_cars where id = %s limit 1;")) parameters = (item.get('url_id')) cursor.execute(sql, parameters) result = cursor.fetchone() if not result: # joining site and target_table to choose correct data table and then insert a new Car sql = "".join(("insert into ", item.get('site'), target_table ,"(id, description, `year`, make, trim, model, price, bodystyle,\ exterior_color, interior_color, `engine`, stock_id, vin, mileage, transmission, drive_type, doors, fuel, cab, stereo, dealer, street_number, \ street_name, city, state, zip_code, phone, source_url, found_by) \ values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")) parameters = ( item.get('url_id'), item.get('description'), item.get('year'), item.get('make'), item.get('trim'), item.get('model'), item.get('price'), item.get('body_style'), item.get('exterior_color'), item.get('interior_color'), item.get('engine'), item.get('stock_id'), item.get('vin'), item.get('mileage'), item.get('transmission'), item.get('drive_type'), item.get('doors'), item.get('fuel_type'), item.get('cab_type'), item.get('stereo'), item.get('dealer'), item.get('street_number'), item.get('street_name'), item.get('city'), item.get('state'), item.get('zip_code'), 
item.get('phone'), item.get('source_url'), item.get('found_by') ) cursor.execute(sql, parameters) log.msg('[ADDED] %s at %s EST' % (item['description'], datetime.now(timezone('US/Eastern')) .strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO) # call make post-processing make_id = None if item.get('make') != "": make_id = process_make(cursor, item.get('make')) if item.get('model') != "" and make_id is not None: process_model(cursor, item.get('model'), make_id) else: log.msg("[WARNING] Multiple Checking - %s" % item['url_id'], level=log.INFO)
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from superdeals.items import SuperdealsItem
import socket

ip = socket.gethostbyname(socket.gethostname())

from scrapy import log
log.msg('ip= ' + ip)


class SuperDealsSpider(BaseSpider):
    """ Base spider which defines the url's to be scraped """
    name = "superdeal"
    allowed_domains = [
        "http://www.homeshop18.com/",
        "http://www.flipkart.com",
        "http://www.infibeam.com/",
        "http://www.tradus.com/",
        "http://www.indiatimes.com"
    ]
    start_urls = [
        "http://www.homeshop18.com/superdeals/",
        "http://www.flipkart.com/offers/electronics",
        "http://www.flipkart.com/offers/fashion",
        "http://www.flipkart.com/offers/books-and-more",
        "http://www.infibeam.com/Hot_Deals/search",
        "http://www.tradus.com/deals",
        "http://shopping.indiatimes.com/deals/"
def process_request(self, request, spider):
    ua = random.choice(self.user_agent_list)
    if ua:
        log.msg('Current UserAgent: ' + ua, level=log.INFO)
        request.headers.setdefault('User-Agent', ua)
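A downloader middleware like the one above only takes effect once it is registered in the project settings; a minimal sketch, assuming the hypothetical module path `myproject.middlewares.RandomUserAgentMiddleware` and using the pre-1.0 Scrapy location of the stock UserAgentMiddleware, which matches the `scrapy.log` API used throughout these snippets.

# settings.py -- sketch; the module path and the priority value are assumptions
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    # disable the built-in user agent middleware so the random one wins
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
}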
def parse_product(self, response): #inspect_response(response, self) #return hxs = HtmlXPathSelector(response) loader = ProductLoader(item=Product(), response=response) loader.add_value('url', response.url) tmp = hxs.select('//span[@class="product_code"]/text()').extract() if tmp: loader.add_value('identifier', tmp[0].strip()) loader.add_value('sku', tmp[0]) else: log.msg('### No product ID at ' + response.url, level=log.INFO) return #tmp = hxs.select('//input[@name="productId"]/@value').extract() #if tmp: # loader.add_value('sku', tmp[0]) name = '' tmp = hxs.select('//span[@itemprop="name"]/text()').extract() if tmp: name = tmp[0].strip() loader.add_value('name', name) else: log.msg('### No name at ' + response.url, level=log.INFO) #price price = 0 stock = 0 tmp = hxs.select('//span[@itemprop="price"]/text()').extract() if not tmp: tmp = hxs.select( '//table[@id="product-info-table"]/tr[@class="price"]/td/span[1]/text()' ).extract() if tmp: price = extract_price(tmp[0].strip().replace(',', '')) loader.add_value('price', price) stock = 1 #stock #stock = 0 #tmp = hxs.select('//td[strong="In Stock: "]/text()').extract() #if tmp and 'yes' in ''.join(tmp).lower(): # stock = 1 loader.add_value('stock', stock) #image_url tmp = hxs.select('//img[@id="product_photo"]//@src').extract() if tmp: url = urljoin(response.url, tmp[0].strip()) loader.add_value('image_url', url) #brand tmp = hxs.select( '//span[@itemprop="description"]//b[1]/text()').extract() if tmp: loader.add_value('brand', tmp[0].replace('Collection', '').strip()) #category tmp = hxs.select('//div[@class="breadbox"]/div[1]/a/text()').extract() if len(tmp) > 1: for s in tmp[1:]: loader.add_value('category', s) #shipping_cost if Decimal(price) < 49.95: loader.add_value('shipping_cost', '8.95') product = loader.load_item() options = hxs.select( '//table[@id="options_table"]//select/option[@value!="0"]') #No options currently. if not options: if not product.get('identifier', None): log.msg('### No product ID at ' + response.url, level=log.INFO) else: if not product['identifier'] in self.id_seen: self.id_seen.append(product['identifier']) yield product else: log.msg('### Duplicate product ID at ' + response.url, level=log.INFO) return #process options for sel in options: ### item = copy.deepcopy(product) tmp = sel.select('./text()').extract() if tmp: item['identifier'] += '-' + tmp[0].replace(' ', '_') item['name'] = name + ' - ' + tmp[0] if not item.get('identifier', None): log.msg('### No product ID at ' + response.url, level=log.INFO) else: if not item['identifier'] in self.id_seen: self.id_seen.append(item['identifier']) yield item else: log.msg('### Duplicate product ID at ' + response.url, level=log.INFO)
sql_insert = """insert into """ + table + """(`url`, `source`, `title`, `time`, `content`, `types`) values (%s, %s, %s, %s, %s, %s)""" try: cursor.execute(sql_insert, (item['url'], item['source'], item['title'], item['time'], item['content'], item['types'])) self.conn.commit() log.msg("successfully commit url: %s" % item['url'], level=log.INFO) except MySQLdb.Error, e: print "MySQLdb.Error %d: %s" % (e.args[0], e.args[1]) self.conn.rollback() log.msg("except for DBWriterPipeline", level=log.WARNING) finally: log.msg("passing DBWriterPipeline, content len=%d" % len(item['content']), level=log.INFO) # return item def open_spider(self, spider): log.msg("call open_spider...", level=log.INFO) self.conn = MySQLdb.connect(user='******', passwd='root', db='news', host='localhost', charset='utf8', use_unicode=True) def close_spider(self, spider): log.msg("call close_spider...", level=log.INFO) self.conn.close()
def process_request(self, request, spider):
    useragent = self._useragents[spider]
    rp = self.robot_parser(request, spider)
    if rp and not rp.can_fetch(useragent, request.url):
        log.msg("Forbidden by robots.txt: %s" % request, log.DEBUG)
        raise IgnoreRequest
def parse_parliament_steps(self, response): """ Callback function to parse the additional 'Parlamentarisches Verfahren' page """ law_item = response.meta['law_item'] phases = LAW.PHASES.xt(response) for phase in phases: # Create phase if we don't have it yet phase_item, created = Phase.objects.get_or_create( title=phase['title']) if created: log.msg(u"Created Phase {}".format( green(u'[{}]'.format(phase_item.title)))) # Create steps for step in phase['steps']: step_item, created = Step.objects.update_or_create( title=step['title']['text'], sortkey=step['sortkey'], date=step['date'], protocol_url=step['protocol_url'][0] if step['protocol_url'] else u'', law=law_item, phase=phase_item, source_link=response.url) step_item.save() # Save statements for this step, if applicable if 'statements' in step['title']: for stmnt in step['title']['statements']: # Find the person pq = Person.objects.filter( source_link__endswith=stmnt['person_source_link']) if pq.exists() and pq.count() == 1: person_item = pq.first() st_data = { 'speech_type': stmnt['statement_type'], 'protocol_url': stmnt['protocol_link'][0] if stmnt['protocol_link'] else None } st_item, st_created = Statement.objects.update_or_create( index=stmnt['index'], person=person_item, step=step_item, defaults=st_data) # if st_created: # log.msg(u"Created Statement by {} on {}".format( # green( # u'[{}]'.format(person_item.full_name)), # step_item.date)) # else: # log.msg(u"Updated Statement by {} on {}".format( # green( # u'[{}]'.format(person_item.full_name)), # step_item.date)) else: # We can't save statements if we can't find the # Person log.msg( red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}" ).format( green(u'[{}]'.format( stmnt['person_name'])), blue("[{}]".format( stmnt['person_source_link'])), red("{}").format( "" if pq.exists() else " not"), "" if pq.count() > 1 else ", but {} persons matching found!". format(pq.count()))) continue
def close_spider(self, spider):
    log.msg("call close_spider...", level=log.INFO)
    self.conn.close()
def close_spider(self, spider):
    if self.sender:
        log.msg('disconnect zmq')
        self.sender.term()
def parse(self, response): self.SCRAPED_COUNTER += 1 LLP = LegislativePeriod.objects.get( roman_numeral=response.url.split('/')[-4]) # Extract fields ts = GENERIC.TIMESTAMP.xt(response) title = LAW.TITLE.xt(response) parl_id = LAW.PARL_ID.xt(response) status = LAW.STATUS.xt(response) if not self.IGNORE_TIMESTAMP and not self.has_changes( parl_id, LLP, response.url, ts): self.logger.info( green(u"Skipping Law {} of {}, no changes: {}".format( self.SCRAPED_COUNTER, self.TOTAL_COUNTER, title))) return # Extract foreign keys category = LAW.CATEGORY.xt(response) description = LAW.DESCRIPTION.xt(response) # Create category if we don't have it yet cat, created = Category.objects.get_or_create(title=category) if created: log.msg(u"Created category {}".format( green(u'[{}]'.format(category)))) # Create and save Law law_data = { 'title': title, 'status': status, 'description': description, 'ts': ts } law_item, law_created = Law.objects.update_or_create( parl_id=parl_id, legislative_period=LLP, source_link=response.url, defaults=law_data) # Attach foreign keys law_item.keywords = self.parse_keywords(response) law_item.category = cat law_item.documents = self.parse_docs(response) law_item.save() # Log our progress if law_created: logtext = u"[{} of {}] Created {} with id {}, LLP {} @ {}" else: logtext = u"[{} of {}] Updated {} with id {}, LLP {} @ {}" logtext = logtext.format(self.SCRAPED_COUNTER, self.TOTAL_COUNTER, red(title), cyan(u"[{}]".format(parl_id)), green(str(LLP)), blue(response.url)) log.msg(logtext, level=log.INFO) response.meta['law_item'] = law_item # is the tab 'Parlamentarisches Verfahren available?' if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'): self.parse_parliament_steps(response) if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'): self.parse_pre_parliament_steps(response)
def close_spider(self, spider):
    if self.cnn:
        log.msg('disconnect mongodb')
        self.cnn.close()
        self.cnn = None
def parse_episode(self, response): try: log.msg('parse_episode %s' % response.request.url) thumb_url = response.request.meta['thumb'] cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] items = [] #show_id show_id = Util.get_iqiyi_showid(response.request.url) #print "show_id: %s" % show_id #space maybe exist: "albumId:326754200" or "albumId: 326754200" albumid = response.selector.re(re.compile(r'albumId: ?(\d+)')) #video info title = response.xpath( '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()' ).extract() if not title: title = response.xpath( '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()' ).extract() if not title: title = response.xpath( '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()' ).extract() if not title: title = response.xpath( '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()' ).extract() category = response.xpath( '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract( ) if not category: category = response.xpath( '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()' ).extract() if not category: category = response.xpath( '//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract() if not category: category = response.xpath( '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()' ).extract() upload_time = response.xpath( '//div[@class="crumb_bar"]/span[3]/span/text()').extract() if not upload_time: upload_time = response.xpath( '//div[@class="crumb_bar"]/span[2]/span/text()').extract() tag = response.xpath( '//span[@id="widget-videotag"]/descendant::*/text()').extract( ) if not tag: tag = response.xpath( '//span[@class="mod-tags_item vl-block"]/descendant::*/text()' ).extract() if not tag: tag = response.xpath( '//div[@class="crumb_bar"]/span[2]/a/text()').extract() ep_item = EpisodeItem() if title: ep_item['title'] = "".join([t.strip() for t in title]) if show_id: ep_item['show_id'] = show_id if tag: ep_item['tag'] = "|".join([t.strip() for t in tag]) if upload_time: ep_item['upload_time'] = upload_time[0].strip() #if category: # ep_item['category'] = category[0].strip() if thumb_url: ep_item['thumb_url'] = thumb_url[0].strip() ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url #ep_item['cat_id'] = cat_id ep_item['category'] = cat_name ep_item['format_id'] = '2' ep_item['audit'] = audit ep_item['priority'] = priority if albumid: items.append( Request(url=self.playlength_url + albumid[0], callback=self.parse_playlength, meta={ 'item': ep_item, 'albumid': albumid[0] })) else: items.append(ep_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
def insert_profile(self, item): cursor = self.cursor() # udpate crawled_url cursor.execute('select sno from crawled_urls use index(url) where url = %s limit 1', (item['profile_url'])) result = cursor.fetchone() if result: crawled_urls_sno = result['sno'] else: cursor.execute("insert into crawled_urls (url) values (%s)", (item['profile_url'])) log.msg(" - [P] %s" % item['profile_url'], level=log.DEBUG) crawled_urls_sno = cursor.lastrowid cursor.execute('select sno from linkedin_profiles use index(profile_url) where profile_url = %s limit 1', (item['profile_url'])) result = cursor.fetchone() if not result: cursor.execute(\ "insert into linkedin_profiles (crawled_urls_sno, profile_url, title, first_name, last_name, locality, region,\ country, desc_short, profile_pic, num_connection, email, phone, twitter_username, department, recommendations,\ im, address, birthday, marital_status, created)\ values (%s, %s, %s, %s, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s, %s)", ( crawled_urls_sno, item.get('profile_url'), item.get('title'), item.get('first_name'), item.get('last_name'), item.get('locality'), item.get('region'), item.get('country'), item.get('desc_short'), item.get('profile_pic'), item.get('num_connection'), item.get('email'), item.get('phone'), item.get('twitter_username'), item.get('department'), item.get('recomandations'), item.get('im'), item.get('address'), item.get('birthday'), item.get('marital_status'), datetime.now() ) ) profile_sno = cursor.lastrowid # education for edu in item['education']: cursor.execute(\ 'insert into linkedin_education (profile_sno, date_start, date_end, degree, organization)\ values (%s,%s,%s,%s,%s)', ( profile_sno, self.string_to_date(edu.get('date_start')), self.string_to_date(edu.get('date_end')), edu.get('degree'), edu.get('organization') ) ) # experience for exp in item['experience']: cursor.execute(\ 'insert into linkedin_experience (profile_sno, date_start, date_end, title, organization, description)\ values (%s,%s,%s,%s,%s, %s)', ( profile_sno, self.string_to_date(exp.get('date_start')), self.string_to_date(exp.get('date_end')), exp.get('title'), exp.get('organization'), exp.get('description') ) ) # skills for ski in item['skills']: cursor.execute(\ 'insert into linkedin_skills (profile_sno, skill, no_endorsements, first_skill_ind)\ values (%s,%s, %s, %s)', ( profile_sno, ski.get('skill'), ski.get('no_endorsements'), ski.get('first_skill_ind'), ) ) # specialties for spe in item['specialties']: cursor.execute(\ 'insert into linkedin_specialities (profile_sno, specialty)\ values (%s,%s)', ( profile_sno, spe.get('specialty', None) ) ) # websites for w in item['websites']: cursor.execute(\ 'insert into linkedin_websites (profile_sno, website, cate)\ values (%s, %s, %s)', ( profile_sno, w.get('website'), w.get('cate') ) ) # interests for w in item['interests']: cursor.execute(\ 'insert into linkedin_interests (profile_sno, interest)\ values (%s,%s)', ( profile_sno, w.get('interest') ) ) # groups for w in item['groups']: cursor.execute(\ 'insert into linkedin_groups (profile_sno, group_url, organization)\ values (%s,%s,%s)', ( profile_sno, w.get('group_url'), w.get('organization') ) ) # honors for w in item['honors']: cursor.execute(\ 'insert into linkedin_honors (profile_sno, honor)\ values (%s,%s)', ( profile_sno, w.get('honor'), ) ) cursor.execute('commit') log.msg(" - [Added] profile: %s" % item['profile_url'], level=log.INFO) # update crawled_urls cursor.execute(\ 'update crawler_urls use index(found_urls) set stat= %s where found_urls = %s limit 
1', ('C', item['profile_url']) ) log.msg(" - [W->C] %s" % item['profile_url'], level=log.DEBUG)
def close_spider(self, spider):
    if self.cnn:
        log.msg('disconnect mysql')
        self.cur.close()
        self.cnn.close()
        self.cnn = self.cur = None
def process_request(self, request, spider):
    user_agent = random.choice(self.user_agent_list)
    if user_agent:
        log.msg('Current UserAgent:' + user_agent)
        request.headers.setdefault('User-Agent', user_agent)
def update_profile(self, item): #print item cursor = self.cursor() cursor.execute('select sno from linkedin_profiles use index(profile_url) where profile_url = %s limit 1', (item['profile_url'])) result = cursor.fetchone() if not result: #print "insert mode" return self.insert_profile(item) profile_sno = result['sno'] # Updating sql = 'update linkedin_profiles set ' for key in item.keys(): if isinstance(item[key], list): continue sql += '%s = "%s", ' %( key, item[key]) #sql = sql[:len(sql) -2] sql += ' updated = "%s"' % datetime.now() sql += ' where sno = %s limit 1' % profile_sno MySQLdb.escape_string(sql) cursor.execute(sql) cursor.execute('commit') # education cursor.execute('delete from linkedin_education where profile_sno = %s', profile_sno) for edu in item['education']: cursor.execute(\ 'insert into linkedin_education (profile_sno, date_start, date_end, degree, organization)\ values (%s,%s,%s,%s,%s)', ( profile_sno, self.string_to_date(edu.get('date_start')), self.string_to_date(edu.get('date_end')), edu.get('degree'), edu.get('organization') ) ) # experience cursor.execute('delete from linkedin_experience where profile_sno = %s', profile_sno) for exp in item['experience']: cursor.execute(\ 'insert into linkedin_experience (profile_sno, date_start, date_end, title, organization, description)\ values (%s,%s,%s,%s,%s, %s)', ( profile_sno, self.string_to_date(exp.get('date_start')), self.string_to_date(exp.get('date_end')), exp.get('title'), exp.get('organization'), exp.get('description') ) ) # skills cursor.execute('delete from linkedin_skills where profile_sno = %s', profile_sno) for ski in item['skills']: cursor.execute(\ 'insert into linkedin_skills (profile_sno, skill, no_endorsements, first_skill_ind)\ values (%s,%s, %s, %s)', ( profile_sno, ski.get('skill'), ski.get('no_endorsements'), ski.get('first_skill_ind'), ) ) # specialties cursor.execute('delete from linkedin_specialities where profile_sno = %s', profile_sno) for spe in item['specialties']: cursor.execute(\ 'insert into linkedin_specialities (profile_sno, specialty)\ values (%s,%s)', ( profile_sno, spe.get('specialty', None) ) ) # websites cursor.execute('delete from linkedin_websites where profile_sno = %s', profile_sno) for w in item['websites']: cursor.execute(\ 'insert into linkedin_websites (profile_sno, website, cate)\ values (%s,%s, %s)', ( profile_sno, w.get('website'), w.get('cate') ) ) # interests cursor.execute('delete from linkedin_interests where profile_sno = %s', profile_sno) for w in item['interests']: cursor.execute(\ 'insert into linkedin_interests (profile_sno, interest)\ values (%s,%s)', ( profile_sno, w.get('interest') ) ) # groups cursor.execute('delete from linkedin_groups where profile_sno = %s', profile_sno) for w in item['groups']: cursor.execute(\ 'insert into linkedin_groups (profile_sno, group_url, organization)\ values (%s,%s,%s)', ( profile_sno, w.get('group_url'), w.get('organization') ) ) # honors cursor.execute('delete from linkedin_honors where profile_sno = %s', profile_sno) for w in item['honors']: cursor.execute(\ 'insert into linkedin_honors (profile_sno, honor)\ values (%s,%s)', ( profile_sno, w.get('honor'), ) ) cursor.execute('commit') log.msg(" - [Updated] profile: %s" % item['profile_url'], level=log.INFO) log.msg(" - [Updated] crawled_urls: %s" % item['profile_url'], level=log.DEBUG)
def process_item(self, item, spider):
    self.db[self.collection_name].insert(dict(item))
    log.msg("Movie added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
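An item pipeline such as the MongoDB one above only runs if it is enabled in the project settings; a minimal sketch, assuming the hypothetical module path `myproject.pipelines.MongoDBPipeline`.

# settings.py -- sketch; the module path and the priority value are assumptions
ITEM_PIPELINES = {
    'myproject.pipelines.MongoDBPipeline': 300,
}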
def parse_Data(self, response): log.msg('Going to parse data for URL: %s' % response.url[20:], level=log.INFO) league = response.meta['league'] jsonResp = json.loads(response.body) jsonData = jsonResp['FetchSubcategoryBetgroupGroupingsResult'] if jsonData == 'null': log.msg('Null response for leauge %s at site: %s' % (league, response.url), level=log.ERROR) return None try: jsonEvents = jsonData['scbgg_c'][0]['m_c'] except (KeyError, TypeError): log.msg('No events for league %s with id %s' '. Is jsonData empty?' % (league['name'].encode('utf-8'), league['id']), level=log.ERROR) log.msg(jsonData, level=log.ERROR) return None items = [] for jsonEvent in jsonEvents: l = EventLoader(item=EventItem2(), response=response) l.add_value('sport', u'Football') l.add_value('bookie', self.name) dateTime = jsonEvent['dd'] l.add_value('dateTime', dateTime) eventName = jsonEvent['n'] if eventName: teams = eventName.lower().split(' - ') l.add_value('teams', teams) # MO prices MOdict = {'marketName': 'Match Odds'} home_price = draw_price = away_price = None for jsonOdd in jsonEvent['ms_c']: if jsonOdd['dn'] == u'1': home_price = jsonOdd['os'] elif jsonOdd['dn'] == u'X': draw_price = jsonOdd['os'] elif jsonOdd['dn'] == u'2': away_price = jsonOdd['os'] MOdict['runners'] = [ { 'runnerName': 'HOME', 'price': home_price }, { 'runnerName': 'DRAW', 'price': draw_price }, { 'runnerName': 'AWAY', 'price': away_price }, ] # Add markets l.add_value('markets', [ MOdict, ]) # Load item items.append(l.load_item()) if not items: items = None return items
def search(self, response):
    log.msg(response.url)
    for url in response.xpath('//li[@class="list_view"]//a/@href').extract():
        if url not in self.detailedCrawled:
            yield scrapy.Request(url, callback=self.detail)
            self.detailedCrawled.append(url)
def log(self, message, spider, level=log.DEBUG):
    """Log the given messages at the given log level. Stolen from Spider."""
    # prepend the name of this class to message
    message = '[' + self.__class__.__name__ + '] ' + message
    log.msg(message, spider=spider, level=level)
def parse_Data(self, response): log.msg('Going to parse data for URL: %s' % response.url[20:], level=log.INFO) l = EventLoader(item=EventItem2(), response=response) l.add_value('sport', u'Football') l.add_value('bookie', self.name) dateTime = take_first( response.xpath('//div[@id="center_content"]/' 'div[@class="coupon_header scrollable"]/' 'div[@class="coupon_header_titles"]/' 'h4/span/text()').extract()) l.add_value('dateTime', dateTime) eventName = take_first( response.xpath('//div[@id="center_content"]/' 'div[@class="coupon_header scrollable"]/' 'div[@class="coupon_header_titles"]/' 'h1/@title').extract()) if eventName: teams = eventName.lower().split(' v ') l.add_value('teams', teams) # Markets mkts = response.xpath( '//div[@class="single_markets" or @class="multiple_markets"]/' 'div[starts-with(@id, "coupon")]') allmktdicts = [] for mkt in mkts: marketName = take_first(mkt.xpath('h4/text()').extract()) mdict = {'marketName': marketName, 'runners': []} runners = mkt.xpath( 'table[not(@class="has_group_date")]/' 'tbody/tr[not(@class="header")]/td[@class="outcome_td"]') for runner in runners: runnerName = take_first( runner.xpath('span/@data-outcome_description').extract()) price = take_first( runner.xpath( 'span/a/span[@class="price"]/text()').extract()) mdict['runners'].append({ 'runnerName': runnerName, 'price': price }) allmktdicts.append(mdict) # Do some Betvic specific post processing and formating for mkt in allmktdicts: if 'Match Betting' in mkt['marketName']: mkt['marketName'] = 'Match Odds' for runner in mkt['runners']: if teams[0] in runner['runnerName'].lower(): runner['runnerName'] = 'HOME' elif teams[1] in runner['runnerName'].lower(): runner['runnerName'] = 'AWAY' elif 'Draw' in runner['runnerName']: runner['runnerName'] = 'DRAW' elif 'Correct Score - 90 Mins' in mkt['marketName']: mkt['marketName'] = 'Correct Score' for runner in mkt['runners']: if teams[1] in runner['runnerName'].lower(): runner['reverse_tag'] = True else: runner['reverse_tag'] = False # Add markets l.add_value('markets', allmktdicts) # Load item return l.load_item()
# NYT_INT_HOME = NYInternationalHomeSpider()
# FT = FinancialTimeSpider()
# HBR = HBRSpider()
# HN = HNSpider()
# DISCOVER_MAG = DiscoverMagSpider()
# TC = TechCrunchSpider()

# config init
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()

# crawler init
crawler.crawl(CRAIG)
# crawler.crawl(MIT_TECH)
# crawler.crawl(NYT_HOME)
# crawler.crawl(NYT_INT_HOME)
# crawler.crawl(FT)
# crawler.crawl(HBR)
# crawler.crawl(HN)
# crawler.crawl(DISCOVER_MAG)
# crawler.crawl(TC)

# crawler start
crawler.start()
log.start()
log.msg('Reactor activated...')
reactor.run()
log.msg('Reactor stopped.')