Example #1
 def _signal_kill(self, signum, _):
     signame = signal_names[signum]
     log.msg('Received %s twice, forcing unclean shutdown' % signame, \
         level=log.INFO)
     log.log_level = log.SILENT # disable logging of confusing tracebacks
     reactor.callFromThread(self.engine.kill)
     install_shutdown_handlers(signal.SIG_IGN)
Example #2
    def detail(self, response):
        log.msg(response.url)
        hxs = HtmlXPathSelector(response)
        variants_price=hxs.select("//div[@class='fleft catbox pricerate']//span/text()").extract()
        variants_seller=hxs.select("//div[@class='catbox fleft storeimage']/img/@alt").extract()
        quantitylist=[]
        pricelist=[]
        items=[]


        if variants_price and variants_seller:
            for price, seller in zip(variants_price, variants_seller):
                item = BillionPricesIndiaItem()
                item['date'] = time.strftime("%d/%m/%Y")
                item['vendor'] = seller.split(" ")[-1:][0]
                item['product'] = response.url.split('/')[-1].split(".")[0]
                itemprice=re.sub('[,]', '', price).split(" ")[-1:][0]
                item['category'] = "mobiles"
                item['price'] = float(itemprice)
                item['quantity'] = '1'
                item['measure']= 'pcs'
                item['unitprice']=float(itemprice)

                items.append(item)
        return items
Example #3
    def process_item(self, item, spider):
        if self.enabled:
            item_name = item['name']
            if item_name in JmItemValidator.scraped_items:
                if JmItemValidator.scraped_items[item_name] == item['stored_date']:
                    raise DropItem("Item is scraped.")

            if isinstance(item, JmPromotionItem):
                log.msg("get JM promotion item.", log.DEBUG)
                item.log_self(log.INFO)
                if item.record_exist(self.conn):
                    log.msg("JmPromotionItem is already exists in mysql DB.", log.WARNING)
                    raise DropItem("JmPromotionItem is already in db")

            if isinstance(item, JmProductItem):
                log.msg("get JM product item.", log.DEBUG)
                item.log_self(log.INFO)
                if item.record_exist(self.conn):
                    log.msg("JmProductItem is already exists in mysql DB.", log.WARNING)
                    raise DropItem("JmProductItem is already in db")

                # get the brand_id
                brand_id = self.get_brand_id(item)
                if brand_id is None:
                    raise DropItem('brand id is None.')

                item['brand_id'] = brand_id
                log.msg("brand id:%s, brand:%s" % (item['brand_id'], item['brand']), log.DEBUG)
            JmItemValidator.scraped_items[item['name']] = item['stored_date']
        return item
Example #4
 def parse_boletin(self, response):
     hxs = HtmlXPathSelector(response)
     urls = hxs.select("//div[@id='titular']/a/@href").extract()
     log.msg('Found %i items' % (len(urls),))
     for u in urls:
         log.msg('Enqueue %s' % (u,))
         yield Request(u, callback=self.parse_doc)
Example #5
 def parse(self, response):
     log.msg(response.url)
     baseurl=self.__getHostURL(response.url)+"mobiles/"
     for url in response.xpath('//li[@class="page"]/a/@href').extract():
         if url.split("/")[2] not in self.crawledURL:
             yield scrapy.Request(baseurl + url.split("/")[2], callback=self.search)
             self.crawledURL.append(url.split("/")[2])
Example #6
    def insertItem (self, item, insert_mode = 'replace'):
        if hasattr(item, 'returning') and item.returning:
            rtn_clause = "RETURNING %s as return_id" % item.returning
        else:
            rtn_clause = "RETURNING 1 as return_id"

        table_name = item.__class__.__name__
        fieldnms = item.keys()
        values = item.values()
        try:
            keyfields = item.keyFields()
        except AttributeError:
            keyfields = None

        # postgres does not do REPLACE so we run this method
        if insert_mode.lower() == 'replace':
            return self._do_replace(table_name,
                                    fieldnms,
                                    values,
                                    rtn_clause,
                                    keyfields)

        field_str = '"%s"' % '", "'.join(fieldnms)
        value_str = ('%s,' * len(values))[:-1]
        sql = ('%s INTO "%s" (%s) VALUES (%s) %s;'
               % (insert_mode, table_name, field_str, value_str, rtn_clause))
        c = self.db.cursor(cursor_factory=DictCursor)
        try:
            c.execute (sql, values)
        except:
            log.msg ("insertItem error on %s:\n\t%s"
                     % (table_name, c.mogrify(sql, values)),
                     level=log.INFO)#DEBUG
            raise
        return c.fetchone()['return_id']
Example #7
def updateRecipe(self, session, recipe, item):
  itemIngredients = item['ingredients']
  categories = []

  if 'recipeCategory' in item:
    categories = item['recipeCategory']

  # Regenerate the ingredients for the recipe
  recipe.ingredients = []
  session.commit()

  recipe.fromdict(item)

  for ing in itemIngredients:
    log.msg(u'Adding ingredient to recipe {0}: {1}'.format(recipe.id, ing))
    ingredient = RecipeIngredients(ingredient=ing)
    ingredient.recipe_id = recipe.id
    session.add(ingredient)

  for cat in categories:
    category = session.query(Category).filter_by(name=cat).first()

    if category is None:
      category = Category(name=cat)
      session.add(category)
      session.commit()
Example #8
 def parse_news(self, response):
     log.msg("Start to parse news " + response.url, level=log.INFO)
     item = SpiderNewsAllItem()
     day = title = _type = keywords = url = article = ''
     url = response.url
     day = response.meta['day']
     title = response.meta['title']
     _type = response.meta['_type']
     response = response.body
     soup = BeautifulSoup(response)
     # try:
     #     items_keywords = soup.find_all(class_='hotword')
     #     for i in range(0, len(items_keywords)):
     #         keywords += items_keywords[i].text.strip() + ' '
     # except:
     #     log.msg("News " + title + " dont has keywords!", level=log.INFO)
     try:
         article = soup.find(class_='story').text.strip()
     except:
         log.msg("News " + title + " dont has article!", level=log.INFO)
     item['title'] = title
     item['day'] = day
     item['_type'] = _type
     item['url'] = url
     item['keywords'] = keywords
     item['article'] = article
     item['site'] = u'南华早报'
     return item
Example #9
    def parse(self, response):
        url = response.url
        _type = self.get_type_from_url(url)
        items = []
        try:
            response = response.body
            soup = BeautifulSoup(response)
            links = soup.find_all(class_=re.compile('post-area'))
        except:
            items.append(self.make_requests_from_url(url))
            log.msg("Page " + url + " parse ERROR, try again !", level=log.ERROR)
            return items
        need_parse_next_page = True
        if len(links) > 0:
            for i in range(0, len(links)):
                url_news = 'http://www.nanzao.com' + links[i].h2.a['href']
                title = links[i].h2.a.text.strip()
                day = links[i].time['datetime'].replace('-', '')
                need_parse_next_page = self.is_news_not_saved(title, url_news)
                if not need_parse_next_page:
                    break
                items.append(self.make_requests_from_url(url_news).replace(callback=self.parse_news, meta={'_type': _type, 'day': day, 'title': title}))
            if u'下一頁>' in soup.find(class_='paging').text:
                page_next = 'http://www.nanzao.com' + soup.find_all("a", text=u"下一頁>")[0]['href']
                if need_parse_next_page:
                    items.append(self.make_requests_from_url(page_next))
            return items
Example #10
 def parse_cmp_gps(self, response):
     data = ''
     cmp = response.meta['Company']
     try:
         data = response.body.decode('GBK')
         if data == '':
             yield cmp
             log.msg(format='companyGps %(request)s get failed: response is blank.', level=log.ERROR, request=response.url)
             return
     except:
         yield cmp
         log.msg(u'Company GPS response is not a GBK-encoded page', level=log.INFO)
         return
     try:
         #
         match = re.search(r'''lng: (.*),\r''', data, re.I|re.M)
         if match:
             cmp['GisLongitude'] = match.group(1)
         #
         match = re.search(r'''lat: (.*),\r''', data, re.I|re.M)
         if match:
             cmp['GisLatitude'] = match.group(1)
         yield cmp
     except:
         yield cmp
Example #11
 def process_item(self, item, spider):
     # drop the item if any field is empty
     for data in item:
         if not item[data]:
             raise DropItem("Missing {0}!".format(data))
     # upsert the whole item keyed on its url
     self.collection.update({'url': item['url']}, dict(item), upsert=True)
     log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
     return item
Example #12
def spidercls_for_request(spidermanager, request, default_spidercls=None,
                          log_none=False, log_multiple=False):
    """Return a spider class that handles the given Request.

    This will look for the spiders that can handle the given request (using
    the spider manager) and return a Spider class if (and only if) there is
    only one Spider able to handle the Request.

    If multiple spiders (or no spider) are found, it will return the
    default_spidercls passed. It can optionally log if multiple or no spiders
    are found.
    """
    snames = spidermanager.find_by_request(request)
    if len(snames) == 1:
        return spidermanager.load(snames[0])

    if len(snames) > 1 and log_multiple:
        log.msg(format='More than one spider can handle: %(request)s - %(snames)s',
                level=log.ERROR, request=request, snames=', '.join(snames))

    if len(snames) == 0 and log_none:
        log.msg(format='Unable to find spider that handles: %(request)s',
                level=log.ERROR, request=request)

    return default_spidercls
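
A minimal usage sketch for the helper above, assuming only what its docstring states; the DefaultSpider fallback class and the pick_spidercls wrapper are illustrative, not part of the original code:

from scrapy.http import Request

class DefaultSpider(object):
    """Hypothetical fallback used when zero or many spiders match."""
    name = 'default'

def pick_spidercls(spidermanager, url):
    # Build a request for the URL and resolve the single spider class that
    # claims it, logging the ambiguous/missing cases instead of failing.
    request = Request(url)
    return spidercls_for_request(spidermanager, request,
                                 default_spidercls=DefaultSpider,
                                 log_none=True, log_multiple=True)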
Example #13
 def parse_follow_page(self,new_id,category,user):
     # category - 'followees' or 'follower'
     user[category] = []
     self.driver.get('http://m.zhihu.com/people/%s/%s' % (new_id,category))
     try:
         follow_list = self.driver.find_element_by_css_selector('.zh-general-list')
     except:
         if settings.DEBUG_INFO : log.msg("no %s for %s" % (category,new_id),level=log.INFO)
         return
     script_content = str(follow_list.get_attribute('data-init'))
     _xsrf = str(self.driver.find_element_by_name('_xsrf').get_attribute('value'))
     post_data = ast.literal_eval(script_content)
     post_data['_xsrf'] = _xsrf
     post_data['method'] = 'next'
     del post_data['nodename']
     
     page = 1
     while True:
         print page
         post_data['params']['offset']=(page-1)*settings.FOLLOW_PER_PAGE
         follow_res = self.block_ajax_load(settings.AJAX_URL[category],"post",post_data)
         if not follow_res:
             break
         ex = r'"http://www\.zhihu\.com/people/([^"]+)"'
         extract_ids = re.findall(ex,follow_res)
         if len(extract_ids) == 0:
             break
         for followee_id in extract_ids:
             user[category].append(re.sub(r'\\','',followee_id))
         page += 1
Example #14
 def ip138_parse(self, response):
     """
     Parse the information from ip138
     """
     detail_list = response.meta["detail_list"]
     num = response.meta["num"]
     item = UnispiderItem() 
     try:
         assert response.status == 200, "error in ip138_parse assert"
         sel = response.selector
         district_code = sel.xpath(u"//td[text()='区 号']/following-sibling::td[1]/text()").extract()
         detail_list = for_ominated_data(detail_list,district_code)   # area code
         fox_code = sel.xpath(u"//td[text()='邮 编']/following-sibling::td[1]/text()").extract()
         detail_list = for_ominated_data(detail_list,fox_code)   # postal code
         local = sel.xpath(u"//td[text()='卡号归属地']/following-sibling::td[1]/text()").extract()
         html_parse = HTMLParser.HTMLParser()
         #local = html_parse.unescape(local)  # decode the non-breaking spaces in the HTML
         try:
             local = local[0].strip()
             local = local.split(html_parse.unescape("&nbsp;")) # split the home location apart, as Lao Ma requested
         except Exception,e:
             log.msg("归属地_errror %"%"|".join(detail_list),level=log.ERROR) 
         detail_list = for_ominated_data(detail_list, local)  # card-number home location
         try:
             item["content"] = "|".join(detail_list)
             yield item
         except Exception,e:
             log.msg("error detail_list join num=%s, info=%s" %(num, "\001".join(detail_list)))
Example #15
    def parse(self, response):
        # We'll be given a page here and will have to return
        # all (up to 12) book items

        # let's get the category first
        pgCat = response.xpath('//h2[@id="s-result-count"]/span/a/text()').extract()
        if len(pgCat) == 0: 
            # sometimes it isn't wrapped in a span
            pgCat = response.xpath('//h2[@id="s-result-count"]/a/text()').extract()
            if len(pgCat) == 0:
                log.msg("Couldn't parse base categories: " + response.url, level=log.WARNING)
        pgCat = ".".join(pgCat)
        try:
            pgCat = pgCat + "." + response.xpath('//h2[@id="s-result-count"]/span/span/text()').extract()[0]
        except IndexError:
            try:
                pgCat = pgCat + "." + response.xpath('//h2[@id="s-result-count"]/span/text()').extract()[0]
            except IndexError:
                log.msg("Couldn't parse category title: " + response.url, level=log.WARNING)

        for result in response.xpath('//div[starts-with(@id, "result_")]'):
            baseLink = result.xpath('div[@class="data"]/h3/a')
            url = baseLink.xpath('@href').extract()[0]
            try:
                name = baseLink.xpath('text()').extract()[0]
            except IndexError:
                name = baseLink.xpath('span[@title]').extract()[0]
            retVal = AmazonBookOverviewItem()
            retVal['url'] = url
            retVal['name'] = name
            retVal['category'] = pgCat
            yield retVal
Example #16
 def media_failed(self, failure, request, info):
     if not isinstance(failure.value, IgnoreRequest):
         referer = request.headers.get('Referer')
         msg = 'Image (unknown-error): Error downloading %s from %s referred in <%s>: %s' \
                 % (self.MEDIA_NAME, request, referer, str(failure))
         log.msg(msg, level=log.WARNING, spider=info.spider)
     raise ImageException
Example #17
 def __init__(self, *args, **kwargs):
     msg = "Django settings used: %s" % os.environ.get("DJANGO_SETTINGS_MODULE")
     log.msg(msg, log.INFO)
     
     super(DjangoBaseSpider,  self).__init__(None, **kwargs)
     
     self._check_mandatory_vars()
Example #18
    def __init__(self):
        """ Constructor """
        # Configure the connection
        self.configure()

        if self.config['replica_set'] is not None:
            connection = MongoReplicaSetClient(
                self.config['uri'],
                replicaSet=self.config['replica_set'],
                w=self.config['write_concern'],
                fsync=self.config['fsync'],
                read_preference=ReadPreference.PRIMARY_PREFERRED)
        else:
            # Connecting to a stand alone MongoDB
            connection = MongoClient(
                self.config['uri'],
                fsync=self.config['fsync'],
                read_preference=ReadPreference.PRIMARY)

        # Set up the collection
        database = connection[self.config['database']]
        self.collection = database[self.config['collection']]
        log.msg('Connected to MongoDB {0}, using "{1}/{2}"'.format(
            self.config['uri'],
            self.config['database'],
            self.config['collection']))

        # Ensure unique index
        if self.config['unique_key']:
            self.collection.ensure_index(self.config['unique_key'], unique=True)
            log.msg('Ensuring index for key {0}'.format(
                self.config['unique_key']))
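
A sketch of the self.config dict this constructor appears to expect; the key names are taken from the code above, while the concrete values are placeholders:

# Assumed shape of the pipeline's configuration (values are examples only)
config = {
    'uri': 'mongodb://localhost:27017',  # MongoDB connection URI
    'database': 'scraping',              # target database name (placeholder)
    'collection': 'items',               # target collection name (placeholder)
    'replica_set': None,                 # set a name to use MongoReplicaSetClient
    'write_concern': 0,                  # only used on the replica-set path
    'fsync': False,
    'unique_key': None,                  # e.g. 'url' to enforce a unique index
}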
Example #19
  def parse(self, response):
    print 'do parse function'
    if response.body.find('feedBackUrlCallBack') != -1:
      data = json.loads(re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I).group(1))
      userinfo = data.get('userinfo', '')
      if len(userinfo):
        log.msg('user id %s' % userinfo['userid'], level=log.INFO)
        assert userinfo['userid'] == self.username
        self.logined = True

        bootstrap = settings.get('BOOTSTRAP')
        log.msg('bootstrap from %s' % bootstrap, level=log.INFO)
        # FIXME: use last scheduled time instead of today, otherwise queue filter will not work
        today = datetime.now()
        if bootstrap == 'file':
          lines = tuple(codecs.open('items.txt', 'r', 'utf-8'))
          for line in lines:
            if line.startswith("#"):
              continue
            start = _epoch()
            url = QueryFactory.create_timerange_query(urllib.quote(line.encode('utf8')), start, today)
            request = Request(url=url, callback=self.parse_weibo, meta={
              'query': line,
              'start': start.strftime("%Y-%m-%d %H:%M:%S"),
              'end': today.strftime("%Y-%m-%d %H:%M:%S"),
              'last_fetched': today.strftime("%Y-%m-%d %H:%M:%S")})
            yield request
      else:
        self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))
Example #20
    def parse(self, response):
        log.msg(response.url)
        hxs = HtmlXPathSelector(response)
        items=[]
        variants_date=hxs.select("//span[@class='normal']//text()").extract()
        variants_price=hxs.select("//table[@id='objContPreviousPrices_grdPreviousPrices']//tr//td[@class='normal']//text()").extract()

        price_items=self.__group_iter(variants_price,4)
        av_price=[]
        for price_list in price_items:
             av_price.append(reduce(lambda x, y: float(x) + float(y) / float(len(price_list)), price_list, 0))
        for price, date in zip(variants_price, variants_date):
            item = BillionPricesIndiaItem()
            quantity='1 lt'
            item['date'] = date
            item['vendor'] = "ioc"
            item['product'] = "gasoline"
            item['category'] = "oil and gas"

            value,measure,unitprice=self.__unit_price(price,quantity)
            item['price'] = price
            item['quantity'] = value
            item['measure']= measure
            item['unitprice']=unitprice


            items.append(item)
        return items
Example #21
    def dl_success(self, response, request, item, spider):

        referer = request.headers.get('Referer')

        if response.status != 200:
            msg = ('{cls}: Got ({status}) downloading {request} referred in '
                   '{referer}'.format(cls=self.__class__.__name__,
                                      status=response.status,
                                      request=request, referer=referer))
            raise TrackPipelineDropItem(msg)

        if not response.body:
            msg = ('Empty response body: {request} referred in '
                   '<{referer}>'.format(request=request, referer=referer))
            raise TrackPipelineDropItem(msg)

        log.msg(format='Downloaded: %(request)s referred in <%(referer)s>',
                level=log.DEBUG, spider=spider, request=request,
                referer=referer)

        spider.crawler.stats.inc_value(
            '{}/file_download_count'.format(self.__class__.__name__),
            spider=spider)

        key_name = '{}/track.mp3'.format(item['track_id'])

        dfd = threads.deferToThread(self.s3.set_from_string, key_name,
                                    response.body)
        dfd.addCallback(self.ul_success, item, spider)

        return dfd
Example #22
 def _sent_failed(self, failure, to, cc, subject, nattachs):
     errstr = str(failure.value)
     log.msg(format='Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
                    'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
                    '- %(mailerr)s',
             level=log.ERROR, mailto=to, mailcc=cc, mailsubject=subject,
             mailattachs=nattachs, mailerr=errstr)
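
The format=/keyword-argument call above is the legacy scrapy.log API; Scrapy 1.0 replaced it with the standard logging module. A rough equivalent of the same error line using plain logging (a sketch, not code from the project):

import logging

logger = logging.getLogger(__name__)

def _sent_failed(failure, to, cc, subject, nattachs):
    # Same fields as above, using logging's lazy %-style formatting
    logger.error('Unable to send mail: To=%s Cc=%s Subject="%s" Attachs=%d - %s',
                 to, cc, subject, nattachs, str(failure.value))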
Example #23
    def process_item(self, item, spider):
        info = dict(item)
        # Need to look further into how Python converts between list and str; the Chinese here really confused me
        jsonstr = json.dumps(info).decode('unicode_escape').replace(" ", "").replace("\n", "")
        log.msg(jsonstr)
        self.file.write(
            "%s,%s,%s\n" % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), jsonstr, str(jsonstr == self.result)))
        if (item['is_text'] == ''):
            return item
        obj = json.loads(jsonstr)
        """2016-11-1 22:30:00 官网修改字段 新款-经批准后发售 为 新款,按钮仍为disable
        更换为依据按钮属性判断
        """
        if (obj["submit_button"] != ["disabled"]):
            self.mail_sender.send_mail(jsonstr)
            self.file.write(
                "sendmail!")
        # print info["submit_button"] == [n.encode("utf-8") for n in "disabled"] 这里为什么不相等呢?


        # if (jsonstr != self.result):
        #     self.mail_sender.send_mail(jsonstr)
        #     self.file.write(
        #         "sendmail!")
        return item
Example #24
    def process_request(self, request, spider):
        if spider.to_be_killed:
            log.msg("Spider has been killed, ignoring request to %s" % request.url, log.DEBUG, spider=spider)
#            raise IgnoreRequest()
            return request
        else:
            return None
Example #25
    def parse_synonyms(self, sel):
        """
        This function scrapes the list of Names and Identifiers
        :param sel: a Selector object of the whole page
        :return: a list of Requests
        """
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = syn.xpath('strong/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = syn.xpath(
                'span[@class="synonym_confirmed"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = syn.xpath('span[@class=""]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                self._spider.get_synonym_requests(syn['name'])

        return requests
Example #26
 def _debug_set_cookie(self, response, spider):
     if self.debug:
         cl = response.headers.getlist("Set-Cookie")
         if cl:
             msg = "Received cookies from: %s" % response + os.linesep
             msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
             log.msg(msg, spider=spider, level=log.DEBUG)
Example #27
def import_to_ckan(created_files):
    importer = CKANImporter()
    for f in created_files:
        m = 'Importing %s' % str(f)
        log.msg(m, level=log.DEBUG)
        importer.import_package(f['archivo'], f['modalidad'])
        log.msg("Paso el importer", level=log.DEBUG)
Example #28
    def parse_parts2(self, response):
        log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
        ua = response.request.headers['User-Agent']
        log.msg("\tua: %s" % ua, level=log.DEBUG)

        for part in response.css('table.parts > tbody > tr'):
            il = ItemLoader(item=CarPart(), selector=part)
            il.add_xpath('shop_city', "td[@class='shop']/a/text()")
            il.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")

            shop_url = il.get_xpath("td[@class='shop']/a/@href", TakeFirst())
            photo_url = il.get_xpath("td[@class='photo']/a/@href", TakeFirst())
            il.add_value('shop_url', urljoin(self.main_url, shop_url))
            il.add_value('ext_link', urljoin(self.main_url, photo_url))

            il.add_xpath('info', "td[@class='info']//text()")
            il.add_xpath('price', "td[@class='price']//text()")

            il.add_value('brand', response.meta.get('brand'))
            il.add_value('model', response.meta.get('model'))
            il.add_value('car_part', response.meta.get('car_part'))
            il.add_value('category', response.meta.get('category'))

            item = il.load_item()
            if item.is_valid():
                yield item
Example #29
 def _debug_cookie(self, request, spider):
     if self.debug:
         cl = request.headers.getlist("Cookie")
         if cl:
             msg = "Sending cookies to: %s" % request + os.linesep
             msg += os.linesep.join("Cookie: %s" % c for c in cl)
             log.msg(msg, spider=spider, level=log.DEBUG)
Example #30
    def __init__(self):
        run_time = str(time.strftime("%y%m%d%H%M%S"))

        result_table = run_time + '_result'
        self.result_table = result_table
        log.msg("Scraped data will be stored in table: %s" % result_table, level=log.INFO)
        
        self.conn = self.db_conn
        self.cursor = self.conn.cursor()

        #-- init result_table
        self.cursor.execute( " CREATE TABLE " + result_table + """(
                id             int              AUTO_INCREMENT,
                locality       varchar(255),
                price          FLOAT,
                size           FLOAT,
                date           date, 
                city           varchar(255), 
                rent_buy_new   varchar(255), 
                item_link      text, 
                city_link      text, 
                total          varchar(255), 
                PRIMARY KEY (id)
            ) """
        )
        self.conn.commit()
Example #31
    def process_item(self, item, spider):
        address = item['address'] or ''
        if not address or address == 'ABSCONDED':
            log.msg('Item has no address, skip geocode', level=log.WARNING)
            return item
        log.msg('Geocoding address: "%s"' % address)

        if str(address) in self.geocoder_cache:
            log.msg('Geolocation found in cache, using')
            loc = self.geocoder_cache.get(str(address))
        else:
            try:
                geo_response = self.geocoder.geocode(address)
                log.msg('Location found')
                log.msg(str(geo_response), level=log.DEBUG)
                loc = {
                    'address': geo_response.address,
                    'latitude': geo_response.latitude,
                    'longitude': geo_response.longitude
                }
            except:
                log.msg('GEOCODING ERROR', level=log.ERROR)
                return item
        item['address'] = loc['address']
        item['lat'] = loc['latitude']
        item['lng'] = loc['longitude']
        log.msg('Writing geolocation object to cache')
        log.msg(str(loc), level=log.DEBUG)
        self.geocoder_cache[str(address)] = loc
        # self.geocoder_cache.sync()
        return item
Example #32
 def open(self, spider):
     log.msg('Starting frontier', log.INFO)
     if not self.frontier.manager.auto_start:
         self.frontier.start()
Example #33
logger = logging.getLogger('scrapy')

from IPython.core.debugger import Tracer

class RandomProxy(object):
    def __init__(self, settings):
        # Tracer()()
        self.proxy_list = settings.get('PROXY_LIST')
        fin = open(self.proxy_list)

        self.proxies = {}
        # read the file once; readlines() exhausts the file handle
        lines = fin.readlines()
        if len(lines) == 0:
            Tracer()()
            log.msg('The proxy_list is empty')
            return
        for line in lines:
            parts = re.match('(\w+://)(\w+:\w+@)?(.+)', line)

            if parts is None:
                Tracer()()
                log.msg('Did not read the line')
                return

            # Cut trailing @
            if parts.group(2):
                user_pass = parts.group(2)[:-1]
            else:
                user_pass = ''
Example #34
    def media_downloaded(self, response, request, info):
        """
            Handler for success downloads.
        """

        referer = request.headers.get('Referer')

        if response.status != 200:
            log.msg(
                format=
                '%(medianame)s (code: %(status)s): Error downloading %(medianame)s from %(request)s referred in <%(referer)s>',
                level=log.WARNING,
                spider=info.spider,
                medianame=self.MEDIA_NAME,
                status=response.status,
                request=request,
                referer=referer)
            raise BookFileException(request.url,
                                    '%s: download-error' % (request.url, ))

        if not response.body:
            log.msg(
                format=
                '%(medianame)s (empty-content): Empty %(medianame)s from %(request)s referred in <%(referer)s>: no-content',
                level=log.WARNING,
                spider=info.spider,
                medianame=self.MEDIA_NAME,
                request=request,
                referer=referer)
            raise BookFileException(request.url,
                                    '%s: empty-content' % (request.url, ))

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        log.msg(
            format=
            '%(medianame)s (%(status)s): Downloaded %(medianame)s from %(request)s referred in <%(referer)s>',
            level=log.DEBUG,
            spider=info.spider,
            medianame=self.MEDIA_NAME,
            status=status,
            request=request,
            referer=referer)

        if self.is_valid_content_type(response):
            raise BookFileException(
                request.url, '%s: invalid-content_type' % (request.url, ))

        filename = self.get_file_name(request, response)

        if not filename:
            raise BookFileException(request.url,
                                    '%s: noaccess-filename' % (request.url, ))

        self.inc_stats(info.spider, status)

        try:
            key = self.file_key(
                request.url)  #return the SHA1 hash of the file url
            book_file_id, checksum = self.store.persist_file(
                key, response.body, info, filename)
        except BookFileException as exc:
            whyfmt = '%(medianame)s (error): Error processing %(medianame)s from %(request)s referred in <%(referer)s>: %(errormsg)s'
            log.msg(format=whyfmt,
                    level=log.WARNING,
                    spider=info.spider,
                    medianame=self.MEDIA_NAME,
                    request=request,
                    referer=referer,
                    errormsg=str(exc))
            raise

        return {
            'url': request.url,
            'book_file_id': book_file_id,
            'checksum': checksum
        }
Example #35
 def close_spider(self, spider, reason):
     if self._dump:
         log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self.get_stats()), \
             spider=spider)
     self._persist_stats(self.get_stats(), spider)
Example #36
 def process_response(self, request, response, spider):
     print('catch url now: %s' % response.url)
     if response.status == 418:
         # if we receive a 418, resend this request
         if request.meta['repeat_times'] < self.__repeat_times:
             request.meta['repeat_times'] = request.meta['repeat_times'] + 1
             log.msg(message=time.strftime(
                 "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") +
                     spider.name + ": restart crawl url:" + response.url,
                     level=log.INFO)
             # time.sleep(self.__sleep_time)
             return request
         else:
             # if the retry count has reached the limit, mark this response as 'stop_catch' and return it
             print(spider.name + " cannot catch this url!")
             log.msg(message=time.strftime(
                 "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") +
                     spider.name +
                     ": having repeating %d times! url:%s. stop catch!" %
                     self.__repeat_times,
                     level=log.INFO)
             request.meta['stop_catch'] = True
             return response
     try:
         # check whether the returned JSON is empty; an empty result means the page has no data or the crawl failed
         # handle the response object according to the situation
         parse_json = json.loads(response.text)
         if parse_json['ok'] == 0:
             if request.meta['repeat_times'] < self.__repeat_times:
                 request.meta[
                     'repeat_times'] = request.meta['repeat_times'] + 1
                 print(
                     "[WeiboSpiderRetryMiddleware] catch empty json file! retry! url:%s"
                     % request.url)
                 log.msg(
                     message=time.strftime(
                         "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ")
                     +
                     "Middleware: catch empty json file! retry! url:%s, retry times:%d"
                     % (request.url, request.meta['repeat_times']),
                     level=log.INFO)
                 return request
             else:
                 # log.msg(message=time.strftime("%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") +
                 #                 spider.name + " cannot catch this url! url: " + request.url, level=log.INFO)
                 raise IgnoreRequest
         else:
             request.meta['parse_json'] = parse_json
             return response
     except json.JSONDecodeError:
         log.msg(message=time.strftime(
             "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") +
                 "catch html file!",
                 level=log.INFO)
         if request.meta['repeat_times'] < self.__repeat_times:
             request.meta['repeat_times'] = request.meta['repeat_times'] + 1
             print(
                 "[WeiboSpiderRetryMiddleware] catch empty json file! retry! url:%s"
                 % request.url)
             log.msg(
                 message=time.strftime(
                     "%Y-%m-%d %H:%M:%S [WeiboSpiderRetryMiddleware] ") +
                 "Middleware: catch empty json file! retry! url:%s, retry times:%d"
                 % (request.url, request.meta['repeat_times']),
                 level=log.INFO)
             return request
         else:
             print(spider.name + " cannot catch this url! url: " +
                   request.url)
             raise IgnoreRequest
Example #37
 def __init__(self):
     log.msg('Initializing geocoder pipeline')
     self.geocoder = GoogleV3()
     self.geocoder_cache = {}
Example #38
 def process_response(self, request, response, spider):
     '''Handle the returned response'''
     # if the response status is not 200, rebuild the current request
     if response.status != 200:
         log.msg('-' * 10, level=log.ERROR)
         log.msg(response.url, level=log.ERROR)
         log.msg(request.body.encode('utf-8'), level=log.ERROR)
         log.msg(response.status, level=log.ERROR)
         log.msg(request.meta['proxy'], level=log.ERROR)
         log.msg('proxy block!', level=log.ERROR)
         log.msg('-' * 10, level=log.ERROR)
         proxy = self.get_random_proxy()
         # attach a proxy to the current request
         request.meta['proxy'] = 'http://%s' % proxy
         return request
     return response
Example #39
    def process_car(self, cursor, item):
        """ insert & update Cars """

        def process_make(cursor, make):
            """ check if the make & model exist in the _variations, if they do not, inserting it into _hold to manually process later """

            # make
            make = urllib.unquote_plus(make)
            sql = "select id from master_makes_variations use index(idx_make) where make = %s;"
            parameters = (make)
            cursor.execute(sql, parameters)
            result = cursor.fetchone()
            make_id = None
            if not result:
                sql = "select id from master_makes_hold use index (idx_make) where make = %s;"
                parameters = (make)
                cursor.execute(sql, parameters)
                result = cursor.fetchone()
                if not result:
                    sql = "insert into master_makes_hold(make) values (%s);"
                    parameters = (make)
                    cursor.execute(sql, parameters)
                    cursor.execute('commit;')
                    log.msg('[UNFOUND] make - %s' % make, level=log.INFO)
                    return cursor.lastrowid
                else:
                    return None
            else:
                log.msg('[FOUND] make - %s' % make, level=log.INFO)
                return result['id']

        def process_model(cursor, model, make_id):
            """ check if the model exists in the _variations, if it does not, inserting it into _hold to manually process later """            

            model = urllib.unquote_plus(model)

            sql = "select id from master_models_variations use index (idx_model) where model = %s;"
            parameters = (model)
            cursor.execute(sql, parameters)
            result = cursor.fetchone()
            if not result:
                sql = "select id from master_models_hold use index (idx_model) where model = %s;"
                parameters = (model)
                cursor.execute(sql, parameters)
                result = cursor.fetchone()
                if not result:
                    sql = "insert into master_models_hold(model, fk_make) values (%s, %s);"
                    parameters = (model, str(make_id))
                    cursor.execute(sql, parameters)
                    cursor.execute('commit;')
                    log.msg('[UNFOUND] model - %s' % model, level=log.INFO)
            else:
                log.msg('[FOUND] model - %s' % model, level=log.INFO)

        # Check whether the car's VIN already exists
        sql = "select RowNum from master_vin use index(Idx_VIN) where VIN = %s limit 1;"
        if item.get('vin') is not None:
            parameters = (item.get('vin'))
        else:
            parameters = ("")
        cursor.execute(sql, parameters)
        result = cursor.fetchone()
        if result:
            # VIN is a duplicate, so target the _history table
            target_table = "_history"
        else:
            # VIN is new, so target the _cars table
            target_table = "_cars"

        # check whether the car's ID already exists
        sql = "".join(("select id from ", item.get('site'), "_cars where id = %s limit 1;"))
        parameters = (item.get('url_id'))
        cursor.execute(sql, parameters)
        result = cursor.fetchone()
        if not result:
            
            # joining site and target_table to choose correct data table and then insert a new Car
            sql = "".join(("insert into ", item.get('site'), target_table ,"(id, description, `year`, make, trim, model, price, bodystyle,\
                exterior_color, interior_color, `engine`, stock_id, vin, mileage, transmission, drive_type, doors, fuel, cab, stereo, dealer, street_number, \
                street_name, city, state, zip_code, phone, source_url, found_by) \
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"))
            parameters = (
                item.get('url_id'), item.get('description'), item.get('year'), item.get('make'), item.get('trim'),
                item.get('model'), item.get('price'), item.get('body_style'), item.get('exterior_color'), item.get('interior_color'),
                item.get('engine'), item.get('stock_id'), item.get('vin'), item.get('mileage'), item.get('transmission'), item.get('drive_type'),
                item.get('doors'), item.get('fuel_type'), item.get('cab_type'), item.get('stereo'), item.get('dealer'), item.get('street_number'), 
                item.get('street_name'), item.get('city'), item.get('state'), item.get('zip_code'), item.get('phone'), item.get('source_url'),
                item.get('found_by')

            )
            cursor.execute(sql, parameters)
            log.msg('[ADDED] %s at %s EST' % (item['description'], datetime.now(timezone('US/Eastern'))
                    .strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO)

            # call make post-processing
            make_id = None
            if item.get('make') != "":
                make_id = process_make(cursor, item.get('make'))

            if item.get('model') != "" and make_id is not None:
                process_model(cursor, item.get('model'), make_id)
        else:
            log.msg("[WARNING] Multiple Checking - %s" % item['url_id'], level=log.INFO)    
Example #40
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from superdeals.items import SuperdealsItem

import socket

ip = socket.gethostbyname(socket.gethostname())

from scrapy import log

log.msg('ip= ' + ip)


class SuperDealsSpider(BaseSpider):
    """
	Base spider which defines the url's to be scraped
	"""
    name = "superdeal"
    allowed_domains = [
        "homeshop18.com", "flipkart.com", "infibeam.com",
        "tradus.com", "indiatimes.com"
    ]
    start_urls = [
        "http://www.homeshop18.com/superdeals/",
        "http://www.flipkart.com/offers/electronics",
        "http://www.flipkart.com/offers/fashion",
        "http://www.flipkart.com/offers/books-and-more",
        "http://www.infibeam.com/Hot_Deals/search",
        "http://www.tradus.com/deals", "http://shopping.indiatimes.com/deals/"
Example #41
 def process_request(self, request, spider):
     ua = random.choice(self.user_agent_list)
     if ua:
         log.msg('Current UserAgent: ' + ua, level=log.INFO)
         request.headers.setdefault('User-Agent', ua)
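
A downloader middleware like this only runs if it is enabled in the project settings; a typical entry is sketched below, where the module path, class name, and priority are assumptions rather than values from the example (the built-in middleware path shown is the Scrapy >= 1.0 one):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    # optionally disable the built-in user-agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}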
Example #42
    def parse_product(self, response):
        #inspect_response(response, self)
        #return
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        tmp = hxs.select('//span[@class="product_code"]/text()').extract()
        if tmp:
            loader.add_value('identifier', tmp[0].strip())
            loader.add_value('sku', tmp[0])
        else:
            log.msg('### No product ID at ' + response.url, level=log.INFO)
            return
        #tmp = hxs.select('//input[@name="productId"]/@value').extract()
        #if tmp:
        #    loader.add_value('sku', tmp[0])
        name = ''
        tmp = hxs.select('//span[@itemprop="name"]/text()').extract()
        if tmp:
            name = tmp[0].strip()
            loader.add_value('name', name)
        else:
            log.msg('### No name at ' + response.url, level=log.INFO)
        #price
        price = 0
        stock = 0
        tmp = hxs.select('//span[@itemprop="price"]/text()').extract()
        if not tmp:
            tmp = hxs.select(
                '//table[@id="product-info-table"]/tr[@class="price"]/td/span[1]/text()'
            ).extract()
        if tmp:
            price = extract_price(tmp[0].strip().replace(',', ''))
            loader.add_value('price', price)
            stock = 1
        #stock
        #stock = 0
        #tmp = hxs.select('//td[strong="In Stock: "]/text()').extract()
        #if tmp and 'yes' in ''.join(tmp).lower():
        #    stock = 1
        loader.add_value('stock', stock)
        #image_url
        tmp = hxs.select('//img[@id="product_photo"]//@src').extract()
        if tmp:
            url = urljoin(response.url, tmp[0].strip())
            loader.add_value('image_url', url)
        #brand
        tmp = hxs.select(
            '//span[@itemprop="description"]//b[1]/text()').extract()
        if tmp:
            loader.add_value('brand', tmp[0].replace('Collection', '').strip())
        #category
        tmp = hxs.select('//div[@class="breadbox"]/div[1]/a/text()').extract()
        if len(tmp) > 1:
            for s in tmp[1:]:
                loader.add_value('category', s)
        #shipping_cost
        if Decimal(price) < 49.95:
            loader.add_value('shipping_cost', '8.95')

        product = loader.load_item()

        options = hxs.select(
            '//table[@id="options_table"]//select/option[@value!="0"]')
        #No options currently.
        if not options:
            if not product.get('identifier', None):
                log.msg('### No product ID at ' + response.url, level=log.INFO)
            else:
                if not product['identifier'] in self.id_seen:
                    self.id_seen.append(product['identifier'])
                    yield product
                else:
                    log.msg('### Duplicate product ID at ' + response.url,
                            level=log.INFO)
            return
        #process options
        for sel in options:  ###
            item = copy.deepcopy(product)
            tmp = sel.select('./text()').extract()
            if tmp:
                item['identifier'] += '-' + tmp[0].replace(' ', '_')
                item['name'] = name + ' - ' + tmp[0]

            if not item.get('identifier', None):
                log.msg('### No product ID at ' + response.url, level=log.INFO)
            else:
                if not item['identifier'] in self.id_seen:
                    self.id_seen.append(item['identifier'])
                    yield item
                else:
                    log.msg('### Duplicate product ID at ' + response.url,
                            level=log.INFO)
Example #43
        sql_insert = """insert into """ + table + """(`url`, `source`, `title`,
                    `time`, `content`, `types`) values (%s, %s, %s, %s, %s, %s)"""
        try:
            cursor.execute(sql_insert,
                           (item['url'], item['source'], item['title'],
                            item['time'], item['content'], item['types']))
            self.conn.commit()
            log.msg("successfully commit url: %s" % item['url'],
                    level=log.INFO)
        except MySQLdb.Error, e:
            print "MySQLdb.Error %d: %s" % (e.args[0], e.args[1])
            self.conn.rollback()
            log.msg("except for DBWriterPipeline", level=log.WARNING)
        finally:
            log.msg("passing DBWriterPipeline, content len=%d" %
                    len(item['content']),
                    level=log.INFO)
            # return item

    def open_spider(self, spider):
        log.msg("call open_spider...", level=log.INFO)
        self.conn = MySQLdb.connect(user='******',
                                    passwd='root',
                                    db='news',
                                    host='localhost',
                                    charset='utf8',
                                    use_unicode=True)

    def close_spider(self, spider):
        log.msg("call close_spider...", level=log.INFO)
        self.conn.close()
Example #44
 def process_request(self, request, spider):
     useragent = self._useragents[spider]
     rp = self.robot_parser(request, spider)
     if rp and not rp.can_fetch(useragent, request.url):
         log.msg("Forbidden by robots.txt: %s" % request, log.DEBUG)
         raise IgnoreRequest
Example #45
    def parse_parliament_steps(self, response):
        """
        Callback function to parse the additional 'Parlamentarisches Verfahren'
        page
        """
        law_item = response.meta['law_item']

        phases = LAW.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))))

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    protocol_url=step['protocol_url'][0]
                    if step['protocol_url'] else u'',
                    law=law_item,
                    phase=phase_item,
                    source_link=response.url)
                step_item.save()

                # Save statements for this step, if applicable
                if 'statements' in step['title']:
                    for stmnt in step['title']['statements']:
                        # Find the person
                        pq = Person.objects.filter(
                            source_link__endswith=stmnt['person_source_link'])
                        if pq.exists() and pq.count() == 1:
                            person_item = pq.first()
                            st_data = {
                                'speech_type':
                                stmnt['statement_type'],
                                'protocol_url':
                                stmnt['protocol_link'][0]
                                if stmnt['protocol_link'] else None
                            }

                            st_item, st_created = Statement.objects.update_or_create(
                                index=stmnt['index'],
                                person=person_item,
                                step=step_item,
                                defaults=st_data)
                            # if st_created:
                            #     log.msg(u"Created Statement by {} on {}".format(
                            #         green(
                            #             u'[{}]'.format(person_item.full_name)),
                            #         step_item.date))
                            # else:
                            #     log.msg(u"Updated Statement by {} on {}".format(
                            #         green(
                            #             u'[{}]'.format(person_item.full_name)),
                            #         step_item.date))
                        else:
                            # We can't save statements if we can't find the
                            # Person
                            log.msg(
                                red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}"
                                    ).format(
                                        green(u'[{}]'.format(
                                            stmnt['person_name'])),
                                        blue("[{}]".format(
                                            stmnt['person_source_link'])),
                                        red("{}").format(
                                            "" if pq.exists() else " not"),
                                        "" if pq.count() > 1 else
                                        ", but {} persons matching found!".
                                        format(pq.count())))
                            continue
Example #46
 def close_spider(self, spider):
     log.msg("call close_spider...", level=log.INFO)
     self.conn.close()
Example #47
 def close_spider(self, spider):
     if self.sender:
         log.msg('disconnect zmq')
         self.sender.term()
Example #48
    def parse(self, response):
        self.SCRAPED_COUNTER += 1

        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

        # Extract fields
        ts = GENERIC.TIMESTAMP.xt(response)
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        status = LAW.STATUS.xt(response)

        if not self.IGNORE_TIMESTAMP and not self.has_changes(
                parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"Skipping Law {} of {}, no changes: {}".format(
                    self.SCRAPED_COUNTER, self.TOTAL_COUNTER, title)))
            return

        # Extract foreign keys
        category = LAW.CATEGORY.xt(response)
        description = LAW.DESCRIPTION.xt(response)

        # Create category if we don't have it yet
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # Create and save Law
        law_data = {
            'title': title,
            'status': status,
            'description': description,
            'ts': ts
        }
        law_item, law_created = Law.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            source_link=response.url,
            defaults=law_data)

        # Attach foreign keys
        law_item.keywords = self.parse_keywords(response)
        law_item.category = cat
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Log our progress
        if law_created:
            logtext = u"[{} of {}] Created {} with id {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated {} with id {}, LLP {} @ {}"

        logtext = logtext.format(self.SCRAPED_COUNTER, self.TOTAL_COUNTER,
                                 red(title), cyan(u"[{}]".format(parl_id)),
                                 green(str(LLP)), blue(response.url))
        log.msg(logtext, level=log.INFO)

        response.meta['law_item'] = law_item

        # is the tab 'Parlamentarisches Verfahren available?'
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)

        if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
            self.parse_pre_parliament_steps(response)
Example #49
 def close_spider(self, spider):
     if self.cnn:
         log.msg('disconnect mongodb')
         self.cnn.close()
         self.cnn = None
Example #50
    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            thumb_url = response.request.meta['thumb']
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []

            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)
            #print "show_id:    %s" % show_id
            #space maybe exist: "albumId:326754200" or "albumId: 326754200"
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            #video info
            title = response.xpath(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()'
            ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()'
                ).extract()

            category = response.xpath(
                '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract(
                )
            if not category:
                category = response.xpath(
                    '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()'
                ).extract()
            if not category:
                category = response.xpath(
                    '//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath(
                    '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()'
                ).extract()

            upload_time = response.xpath(
                '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/span/text()').extract()

            tag = response.xpath(
                '//span[@id="widget-videotag"]/descendant::*/text()').extract(
                )
            if not tag:
                tag = response.xpath(
                    '//span[@class="mod-tags_item vl-block"]/descendant::*/text()'
                ).extract()
            if not tag:
                tag = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/a/text()').extract()

            ep_item = EpisodeItem()

            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            #if category:
            #    ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            #ep_item['cat_id'] = cat_id
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority

            if albumid:
                items.append(
                    Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={
                                'item': ep_item,
                                'albumid': albumid[0]
                            }))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
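The parse_playlength callback referenced above is not part of this example; a minimal sketch of how such a callback could recover the partially built item from the request meta (parsing of the album payload itself is omitted, since its format is not shown here):

    def parse_playlength(self, response):
        try:
            # pull the partially built item and album id back out of meta
            ep_item = response.meta['item']
            albumid = response.meta['albumid']
            # parsing of the play-length payload would go here
            return ep_item
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)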
Ejemplo n.º 51
0
    def insert_profile(self, item):
        cursor = self.cursor()

        # update crawled_urls
        cursor.execute('select sno from crawled_urls use index(url) where url = %s limit 1', (item['profile_url']))
        result = cursor.fetchone()
        if result:
            crawled_urls_sno = result['sno']
        else:
            cursor.execute("insert into crawled_urls (url) values (%s)", (item['profile_url']))
            log.msg(" - [P] %s" % item['profile_url'], level=log.DEBUG)
            crawled_urls_sno = cursor.lastrowid

        cursor.execute('select sno from linkedin_profiles use index(profile_url) where profile_url = %s limit 1', (item['profile_url']))
        result = cursor.fetchone()
        if not result:

            cursor.execute(\
                    "insert into linkedin_profiles (crawled_urls_sno, profile_url, title, first_name, last_name, locality, region,\
                    country, desc_short, profile_pic, num_connection, email, phone, twitter_username, department, recommendations,\
                    im, address, birthday, marital_status, created)\
                    values (%s, %s, %s, %s, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s, %s)",
                    (
                        crawled_urls_sno, item.get('profile_url'), item.get('title'),
                        item.get('first_name'), item.get('last_name'), item.get('locality'),
                        item.get('region'), item.get('country'), item.get('desc_short'),
                        item.get('profile_pic'), item.get('num_connection'), item.get('email'),
                        item.get('phone'), item.get('twitter_username'), item.get('department'),
                        item.get('recomandations'),
                        item.get('im'), item.get('address'), item.get('birthday'),
                        item.get('marital_status'),
                        datetime.now() 
                    )
                )

            profile_sno = cursor.lastrowid

            # education
            for edu in item['education']:
                cursor.execute(\
                        'insert into linkedin_education (profile_sno, date_start, date_end, degree, organization)\
                        values (%s,%s,%s,%s,%s)',
                        (
                            profile_sno, self.string_to_date(edu.get('date_start')), self.string_to_date(edu.get('date_end')),
                            edu.get('degree'), edu.get('organization')
                        )
                    )

            # experience
            for exp in item['experience']:
                cursor.execute(\
                        'insert into linkedin_experience (profile_sno, date_start, date_end, title, organization, description)\
                        values (%s,%s,%s,%s,%s, %s)',
                        (
                            profile_sno, self.string_to_date(exp.get('date_start')), self.string_to_date(exp.get('date_end')),
                            exp.get('title'), exp.get('organization'), exp.get('description')
                        )
                    )
            # skills
            for ski in item['skills']:
                cursor.execute(\
                        'insert into linkedin_skills (profile_sno, skill, no_endorsements, first_skill_ind)\
                        values (%s,%s, %s, %s)',
                        (
                            profile_sno,
                            ski.get('skill'),
                            ski.get('no_endorsements'),
                            ski.get('first_skill_ind'),
                        )
                    )
            # specialties
            for spe in item['specialties']:
                cursor.execute(\
                        'insert into linkedin_specialities (profile_sno, specialty)\
                        values (%s,%s)',
                        (
                            profile_sno,
                            spe.get('specialty', None)
                        )
                    )

            # websites
            for w in item['websites']:
                cursor.execute(\
                        'insert into linkedin_websites (profile_sno, website, cate)\
                        values (%s, %s, %s)',
                        (
                            profile_sno,
                            w.get('website'),
                            w.get('cate')
                        )
                    )
            # interests
            for w in item['interests']:
                cursor.execute(\
                        'insert into linkedin_interests (profile_sno, interest)\
                        values (%s,%s)',
                        (
                            profile_sno,
                            w.get('interest')
                        )
                    )
            # groups
            for w in item['groups']:
                cursor.execute(\
                        'insert into linkedin_groups (profile_sno, group_url, organization)\
                        values (%s,%s,%s)',
                        (
                            profile_sno,
                            w.get('group_url'),
                            w.get('organization')
                        )
                    )
            # honors
            for w in item['honors']:
                cursor.execute(\
                        'insert into linkedin_honors (profile_sno, honor)\
                        values (%s,%s)',
                        (
                            profile_sno,
                            w.get('honor'),
                        )
                    )
            cursor.execute('commit')
            log.msg(" - [Added] profile: %s" % item['profile_url'], level=log.INFO)

        # update crawled_urls
        cursor.execute(\
                'update crawler_urls use index(found_urls) set stat= %s where found_urls = %s limit 1',
                ('C', item['profile_url'])
        )
        log.msg(" - [W->C] %s" % item['profile_url'], level=log.DEBUG)
Ejemplo n.º 52
0
 def close_spider(self, spider):
     if self.cnn:
         log.msg('disconnect mysql')
         self.cur.close()
         self.cnn.close()
         self.cnn = self.cur = None
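The matching open_spider that would establish self.cnn and self.cur is not shown; a minimal sketch, assuming MySQLdb and hypothetical connection settings (DictCursor matches the dict-style row access, e.g. result['sno'], used in the other examples):

 def open_spider(self, spider):
     import MySQLdb
     import MySQLdb.cursors
     log.msg('connect mysql')
     # host, user, password and database name are placeholders
     self.cnn = MySQLdb.connect(host='localhost', user='scrapy',
                                passwd='secret', db='crawler', charset='utf8',
                                cursorclass=MySQLdb.cursors.DictCursor)
     self.cur = self.cnn.cursor()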
Ejemplo n.º 53
0
 def process_request(self, request, spider):
     user_agent = random.choice(self.user_agent_list)
     if user_agent:
         log.msg('Current UserAgent:' + user_agent)
         request.headers.setdefault('User-Agent', user_agent)
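A middleware like this only runs once it is registered in the project settings; a minimal sketch, assuming a hypothetical module path and that user_agent_list is filled from a USER_AGENT_LIST setting (both names are placeholders):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
}

USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15',
]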
Ejemplo n.º 54
0
    def update_profile(self, item):
        #print item
        cursor = self.cursor()

        cursor.execute('select sno from linkedin_profiles use index(profile_url) where profile_url = %s limit 1', (item['profile_url']))
        result = cursor.fetchone()
        if not result:
            #print "insert mode"
            return self.insert_profile(item)
        profile_sno = result['sno']
        # Updating: build a parameterized UPDATE over the scalar fields so the
        # driver handles quoting and escaping of the values
        columns = [key for key in item.keys() if not isinstance(item[key], list)]
        assignments = ['%s = %%s' % col for col in columns] + ['updated = %s']
        sql = 'update linkedin_profiles set ' + ', '.join(assignments)
        sql += ' where sno = %s limit 1'
        params = [item[col] for col in columns] + [datetime.now(), profile_sno]

        cursor.execute(sql, params)
        cursor.execute('commit')

        # education
        cursor.execute('delete from linkedin_education where profile_sno = %s', profile_sno)
        for edu in item['education']:
            cursor.execute(\
                    'insert into linkedin_education (profile_sno, date_start, date_end, degree, organization)\
                    values (%s,%s,%s,%s,%s)',
                    (
                        profile_sno, self.string_to_date(edu.get('date_start')), self.string_to_date(edu.get('date_end')),
                        edu.get('degree'), edu.get('organization')
                    )
                )

        # experience
        cursor.execute('delete from linkedin_experience where profile_sno = %s', profile_sno)
        for exp in item['experience']:
            cursor.execute(\
                    'insert into linkedin_experience (profile_sno, date_start, date_end, title, organization, description)\
                    values (%s,%s,%s,%s,%s, %s)',
                    (
                        profile_sno, self.string_to_date(exp.get('date_start')), self.string_to_date(exp.get('date_end')),
                        exp.get('title'), exp.get('organization'), exp.get('description')
                    )
                )
        # skills
        cursor.execute('delete from linkedin_skills where profile_sno = %s', profile_sno)
        for ski in item['skills']:
            cursor.execute(\
                    'insert into linkedin_skills (profile_sno, skill, no_endorsements, first_skill_ind)\
                    values (%s,%s, %s, %s)',
                    (
                        profile_sno,
                        ski.get('skill'),
                        ski.get('no_endorsements'),
                        ski.get('first_skill_ind'),
                    )
                )
        # specialties
        cursor.execute('delete from linkedin_specialities where profile_sno = %s', profile_sno)
        for spe in item['specialties']:
            cursor.execute(\
                    'insert into linkedin_specialities (profile_sno, specialty)\
                    values (%s,%s)',
                    (
                        profile_sno,
                        spe.get('specialty', None)
                    )
                )
        # websites
        cursor.execute('delete from linkedin_websites where profile_sno = %s', profile_sno)
        for w in item['websites']:
            cursor.execute(\
                    'insert into linkedin_websites (profile_sno, website, cate)\
                    values (%s,%s, %s)',
                    (
                        profile_sno,
                        w.get('website'),
                        w.get('cate')
                    )
                )
        # interests
        cursor.execute('delete from linkedin_interests where profile_sno = %s', profile_sno)
        for w in item['interests']:
            cursor.execute(\
                    'insert into linkedin_interests (profile_sno, interest)\
                    values (%s,%s)',
                    (
                        profile_sno,
                        w.get('interest')
                    )
                )
        # groups
        cursor.execute('delete from linkedin_groups where profile_sno = %s', profile_sno)
        for w in item['groups']:
            cursor.execute(\
                    'insert into linkedin_groups (profile_sno, group_url, organization)\
                    values (%s,%s,%s)',
                    (
                        profile_sno,
                        w.get('group_url'),
                        w.get('organization')
                    )
                )
        # honors
        cursor.execute('delete from linkedin_honors where profile_sno = %s', profile_sno)
        for w in item['honors']:
            cursor.execute(\
                    'insert into linkedin_honors (profile_sno, honor)\
                    values (%s,%s)',
                    (
                        profile_sno,
                        w.get('honor'),
                    )
                )

        cursor.execute('commit')           
        log.msg(" - [Updated] profile: %s" % item['profile_url'], level=log.INFO)

        log.msg(" - [Updated] crawled_urls: %s" % item['profile_url'], level=log.DEBUG)
Ejemplo n.º 55
0
 def process_item(self, item, spider):
     self.db[self.collection_name].insert(dict(item))
     log.msg("Movie added to MongoDB database!",
             level=log.DEBUG,
             spider=spider)
     return item
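For this pipeline to receive items, it has to be listed in ITEM_PIPELINES, and self.db / self.collection_name have to be set up somewhere; a minimal sketch of one common way to wire that, with hypothetical module path, setting names and collection name:

# settings.py (sketch)
ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'movies'

# pipelines.py (sketch)
import pymongo

class MongoPipeline(object):
    collection_name = 'movie_items'  # placeholder collection name

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        pipeline.mongo_uri = crawler.settings.get('MONGO_URI')
        pipeline.mongo_db = crawler.settings.get('MONGO_DATABASE', 'items')
        return pipeline

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()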
Ejemplo n.º 56
0
    def parse_Data(self, response):

        log.msg('Going to parse data for URL: %s' % response.url[20:],
                level=log.INFO)

        league = response.meta['league']
        jsonResp = json.loads(response.body)
        jsonData = jsonResp['FetchSubcategoryBetgroupGroupingsResult']

        if jsonData == 'null':
            log.msg('Null response for league %s at site: %s' %
                    (league, response.url),
                    level=log.ERROR)
            return None

        try:
            jsonEvents = jsonData['scbgg_c'][0]['m_c']
        except (KeyError, TypeError):
            log.msg('No events for league %s with id %s'
                    '. Is jsonData empty?' %
                    (league['name'].encode('utf-8'), league['id']),
                    level=log.ERROR)
            log.msg(jsonData, level=log.ERROR)
            return None

        items = []
        for jsonEvent in jsonEvents:

            l = EventLoader(item=EventItem2(), response=response)
            l.add_value('sport', u'Football')
            l.add_value('bookie', self.name)

            dateTime = jsonEvent['dd']
            l.add_value('dateTime', dateTime)

            eventName = jsonEvent['n']
            if eventName:
                teams = eventName.lower().split(' - ')
                l.add_value('teams', teams)

            # MO prices
            MOdict = {'marketName': 'Match Odds'}
            home_price = draw_price = away_price = None
            for jsonOdd in jsonEvent['ms_c']:
                if jsonOdd['dn'] == u'1':
                    home_price = jsonOdd['os']
                elif jsonOdd['dn'] == u'X':
                    draw_price = jsonOdd['os']
                elif jsonOdd['dn'] == u'2':
                    away_price = jsonOdd['os']
            MOdict['runners'] = [
                {
                    'runnerName': 'HOME',
                    'price': home_price
                },
                {
                    'runnerName': 'DRAW',
                    'price': draw_price
                },
                {
                    'runnerName': 'AWAY',
                    'price': away_price
                },
            ]

            # Add markets
            l.add_value('markets', [
                MOdict,
            ])

            # Load item
            items.append(l.load_item())

        if not items:
            items = None
        return items
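This callback relies on the requesting code having stashed the league in the request meta; a minimal sketch of how such a request might be issued (data_url and the league values are placeholders, while the 'name'/'id' keys match what the callback reads):

        # sketch: issue the data request with the league carried along in meta
        yield Request(url=data_url,
                      callback=self.parse_Data,
                      meta={'league': {'name': league_name, 'id': league_id}})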
Ejemplo n.º 57
0
 def search(self, response):
     log.msg(response.url)
     for url in response.xpath('//li[@class="list_view"]//a/@href').extract():
         if url not in self.detailedCrawled:
             yield scrapy.Request(url, callback=self.detail)
             # record the detail URL in the same set we check, so it is not re-crawled
             self.detailedCrawled.append(url)
Ejemplo n.º 58
0
 def log(self, message, spider, level=log.DEBUG):
     """Log the given messages at the given log level.  Stolen from Spider."""
     # prepend the name of this class to message
     message = '[' + self.__class__.__name__ + '] ' + message
     log.msg(message, spider=spider, level=level)
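Note that the scrapy.log module used throughout these examples was deprecated in Scrapy 1.0 in favour of the standard logging module; roughly the same helper written against spider.logger might look like this sketch:

import logging

def log(self, message, spider, level=logging.DEBUG):
    """Log the given message, prefixed with the name of this class."""
    message = '[' + self.__class__.__name__ + '] ' + message
    spider.logger.log(level, message)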
Ejemplo n.º 59
0
    def parse_Data(self, response):

        log.msg('Going to parse data for URL: %s' % response.url[20:],
                level=log.INFO)

        l = EventLoader(item=EventItem2(), response=response)
        l.add_value('sport', u'Football')
        l.add_value('bookie', self.name)

        dateTime = take_first(
            response.xpath('//div[@id="center_content"]/'
                           'div[@class="coupon_header scrollable"]/'
                           'div[@class="coupon_header_titles"]/'
                           'h4/span/text()').extract())

        l.add_value('dateTime', dateTime)

        eventName = take_first(
            response.xpath('//div[@id="center_content"]/'
                           'div[@class="coupon_header scrollable"]/'
                           'div[@class="coupon_header_titles"]/'
                           'h1/@title').extract())
        if eventName:
            teams = eventName.lower().split(' v ')
            l.add_value('teams', teams)

        # Markets
        mkts = response.xpath(
            '//div[@class="single_markets" or @class="multiple_markets"]/'
            'div[starts-with(@id, "coupon")]')
        allmktdicts = []
        for mkt in mkts:
            marketName = take_first(mkt.xpath('h4/text()').extract())
            mdict = {'marketName': marketName, 'runners': []}
            runners = mkt.xpath(
                'table[not(@class="has_group_date")]/'
                'tbody/tr[not(@class="header")]/td[@class="outcome_td"]')
            for runner in runners:
                runnerName = take_first(
                    runner.xpath('span/@data-outcome_description').extract())
                price = take_first(
                    runner.xpath(
                        'span/a/span[@class="price"]/text()').extract())
                mdict['runners'].append({
                    'runnerName': runnerName,
                    'price': price
                })
            allmktdicts.append(mdict)

        # Do some Betvic-specific post-processing and formatting
        for mkt in allmktdicts:
            if 'Match Betting' in mkt['marketName']:
                mkt['marketName'] = 'Match Odds'
                for runner in mkt['runners']:
                    if teams[0] in runner['runnerName'].lower():
                        runner['runnerName'] = 'HOME'
                    elif teams[1] in runner['runnerName'].lower():
                        runner['runnerName'] = 'AWAY'
                    elif 'Draw' in runner['runnerName']:
                        runner['runnerName'] = 'DRAW'
            elif 'Correct Score - 90 Mins' in mkt['marketName']:
                mkt['marketName'] = 'Correct Score'
                for runner in mkt['runners']:
                    if teams[1] in runner['runnerName'].lower():
                        runner['reverse_tag'] = True
                    else:
                        runner['reverse_tag'] = False

        # Add markets
        l.add_value('markets', allmktdicts)

        # Load item
        return l.load_item()
Ejemplo n.º 60
0
# NYT_INT_HOME = NYInternationalHomeSpider()
# FT = FinancialTimeSpider()
# HBR = HBRSpider()
# HN = HNSpider()
# DISCOVER_MAG = DiscoverMagSpider()
# TC = TechCrunchSpider()

# config init
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()

# crawler init
crawler.crawl(CRAIG)
# crawler.crawl(MIT_TECH)
# crawler.crawl(NYT_HOME)
# crawler.crawl(NYT_INT_HOME)
# crawler.crawl(FT)
# crawler.crawl(HBR)
# crawler.crawl(HN)
# crawler.crawl(DISCOVER_MAG)
# crawler.crawl(TC)

# crawler start
crawler.start()
log.start()
log.msg('Reactor activated...')
reactor.run()
log.msg('Reactor stopped.')
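This bootstrap uses the old Crawler/reactor API; on Scrapy 1.0 and later the same run is usually started through CrawlerProcess, roughly as in this sketch (CraigSpider is a placeholder for the spider class behind the CRAIG instance above):

# sketch of the equivalent bootstrap on newer Scrapy versions
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(CraigSpider)  # add further process.crawl(...) calls for more spiders
process.start()             # starts the reactor and blocks until crawling finishes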