コード例 #1
0
ファイル: pipelines.py プロジェクト: Mustufain/news-crawler
 def process_item(self, item, spider):
     """Drop items that lack any of the required article fields.

     Args:
         item: The scraped item to validate.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item when every required field is set.

     Raises:
         DropItem: If headline, text, author or url is ``None``.
     """
     fields = ItemAdapter(item)
     required = ('headline', 'text', 'author', 'url')
     # Only ``None`` counts as missing; other falsy values are accepted.
     if any(fields.get(name) is None for name in required):
         raise DropItem(f"Missing required fields in {item}")
     return item
コード例 #2
0
 def process_item(self, item, spider):
     """Drop items without a product id or without points.

     Args:
         item: The scraped item to validate.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item when both fields are truthy.

     Raises:
         DropItem: If ``vinmonopoletProductId`` or ``points`` is missing
             or falsy.
     """
     fields = ItemAdapter(item)

     product_id = fields.get('vinmonopoletProductId')
     if not product_id:
         raise DropItem(f"Missing vinmonopoletProductId in {item}")

     points = fields.get('points')
     if not points:
         raise DropItem(f"Missing points in {item}")

     return item
コード例 #3
0
    def process_item(self, item, spider):
        """Clean the product fields and scale the monetary ones.

        Every listed field is passed through ``clean_data``; when a price
        is present, the four monetary fields are converted to ``float``
        and multiplied by ``self.vat_factor``.

        Args:
            item: The scraped product item; it is modified in place.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, after cleaning.

        Raises:
            DropItem: If the item has no truthy ``upc`` field.
        """
        product = ItemAdapter(item)

        # Guard clause: nothing is cleaned for an item without a UPC.
        if not product.get('upc'):
            raise DropItem("Missing upc in %s" % item)

        # 'description' is not cleaned here.
        for name in (
            'page_url', 'title', 'price', 'image_url', 'rating', 'upc',
            'product_type', 'price_without_tax', 'price_with_tax', 'tax',
            'availability', 'number_of_reviews',
        ):
            product[name] = clean_data(product[name])

        if product.get('price'):
            # Converting to soles.
            for name in ('price', 'price_without_tax', 'price_with_tax', 'tax'):
                product[name] = float(product[name]) * self.vat_factor

        return item
コード例 #4
0
    def process_item(self, item, spider):
        """Fetch the video's Chinese transcript and insert the video row.

        Args:
            item: The scraped video item.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item, including when the insert hit a unique
            violation and was rolled back.

        Raises:
            DropItem: If no ``zh-CN``/``zh-Hans`` transcript is found or
                transcripts are disabled.
        """
        record = ItemAdapter(item)

        try:
            transcript = yts.get_transcript(
                record.get("video_id"), languages=["zh-CN", "zh-Hans"]
            )
        except (NoTranscriptFound, TranscriptsDisabled):
            raise DropItem(f"Video has no zh subs")

        with self.conn.cursor() as cur:
            try:
                # The nine item fields go in this order, followed by the
                # JSON-encoded transcript as the tenth value.
                columns = (
                    "video_id", "url", "embed_url", "name", "description",
                    "thumbnail_url", "channel", "date_published", "genre",
                )
                row = [record.get(column) for column in columns]
                row.append(json.dumps(transcript))
                statement = f"INSERT INTO {self.table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                cur.execute(statement, tuple(row))
                self.conn.commit()
            except psycopg2.errors.UniqueViolation:
                logger.warning("duplicate video")
                self.conn.rollback()
        return item
コード例 #5
0
    def process_item(self, item, spider):
        """Buffer comment items and export a topic's buffer as CSV.

        Items that are not ``BaseCommentItem`` instances pass through
        untouched. A ``TopicCompletedItem`` triggers writing the buffered
        rows of its topic to the path built by ``prepare_path``; any other
        comment item is stored in the buffer of its topic.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The item that was passed in.
        """
        # Only handle CommentItems
        if not isinstance(item, BaseCommentItem):
            return item

        adapter = ItemAdapter(item)
        topic_id = adapter.get('topic_id', 0)

        if isinstance(item, TopicCompletedItem):
            self.logger.debug(f'exporting CommentItems (id: {topic_id})')

            path = prepare_path(base_dir=self.base_dir_path,
                                dirname_template=self.dirname_tmplt,
                                filename_template=self.filename_tmplt,
                                item=dict(topic_id=topic_id))

            # The csv module requires newline='' so that the writer controls
            # line endings itself; without it, platforms that translate
            # '\n' (e.g. Windows) produce a blank line after every row.
            with path.open('w', newline='') as wh:
                csv.writer(wh).writerows(self.comment_item_buffers[topic_id])

            self.logger.debug(f'exported CommentItems (id: {topic_id})')
            return item

        # Remove excess spaces if comment is not Ascii Art.
        # NOTE(review): the condition only fires when 'body' is falsy, which
        # looks inverted for a step that cleans the body -- confirm intent.
        if not adapter.get('is_aa') and not adapter.get('body'):
            adapter['is_aa'] = False
            remove_excess_spaces(item, 'body')

        self.comment_item_buffers[topic_id].store(item)
        return item
コード例 #6
0
ファイル: pipelines.py プロジェクト: odelanit/bigaray
 def process_item(self, item, spider):
     """Create or update the ``Product`` row for a scraped item.

     The spider name is split on ``_`` into site name, gender and type,
     which select the ``Site``. The product is looked up by site and
     product link: one match is updated, no match is created, and
     several matches are deleted and replaced by a fresh row.

     Args:
         item: The scraped product item.
         spider: The running spider; its ``name`` must contain at least
             three ``_``-separated parts.

     Returns:
         The unchanged item, also when the site does not exist.
     """
     adapter = ItemAdapter(item)
     site_name_gender_type = spider.name
     site_keys = site_name_gender_type.split('_')
     site_name = site_keys[0]
     site_gender = site_keys[1]
     site_type = site_keys[2]

     try:
         site = Site.objects.get(name=site_name,
                                 gender=site_gender,
                                 type=site_type)
     except Site.DoesNotExist:
         print("{} does not exist".format(site_name_gender_type))
         return item

     title = adapter.get('title')
     product_link = adapter.get('product_link')

     # A missing 'images' field is treated as "no images" rather than
     # crashing on len(None).
     images = adapter.get('images') or []
     image_filename = None
     hq_image_filename = None
     # Only lists of exactly one or two images are used; any other
     # length leaves both filenames unset.
     if len(images) in (1, 2):
         image_filename = images[0].get('path')
     if len(images) == 2:
         hq_image_filename = images[1].get('path')

     # Values shared by the update path and the create path.
     product_fields = dict(price=adapter.get('price'),
                           sale_price=adapter.get('sale_price'),
                           image_filename=image_filename,
                           hq_image_filename=hq_image_filename,
                           product_link=product_link)

     try:
         product = Product.objects.get(site=site,
                                       product_link=product_link)
     except Product.DoesNotExist:
         pass
     except Product.MultipleObjectsReturned:
         # Duplicates are removed and replaced by a single fresh row.
         Product.objects.filter(site=site,
                                product_link=product_link).delete()
         print("Multiple object returned and deleted.")
     else:
         # The title of an existing product is not updated.
         for field_name, value in product_fields.items():
             setattr(product, field_name, value)
         product.save()
         print("Product: {} updated.".format(title))
         return item

     Product.objects.create(title=title, site=site, **product_fields)
     print("Product: {} added.".format(title))
     return item
コード例 #7
0
 def process_item(self, item, spider):
     """Apply the VAT factor to prices that exclude VAT.

     Args:
         item: The scraped item; its price may be modified in place.
         spider: The spider that produced the item (unused).

     Returns:
         The same item.

     Raises:
         DropItem: If the item has no truthy ``price``.
     """
     fields = ItemAdapter(item)
     if not fields.get('price'):
         raise DropItem(f'Missing price in {item}')

     if fields.get('price_excludes_vat'):
         fields['price'] = fields['price'] * self.vtr_Factor
     return item
コード例 #8
0
 def process_item(self, item, spider):
     """Keep only priced items that are cheap enough and available.

     The previous version fell off the end of the function (returning
     ``None``) when a price was present but the item was too expensive
     or unavailable, so ``None`` travelled on to later pipeline stages.
     Every rejected item is now dropped explicitly.

     Args:
         item: The scraped item to check.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item when it passes all checks.

     Raises:
         DropItem: If the price is missing, or the item fails
             ``is_cheaper_then_max`` or ``is_available``.
     """
     adapter = ItemAdapter(item)
     price = adapter.get('price')
     if (price and is_cheaper_then_max(price)
             and is_available(adapter.get('availability'))):
         return item
     raise DropItem("Missing price or item is not available")
コード例 #9
0
 def process_item(self, item, spider):
     """Replace the ``area`` field with the result of ``self.getSquare``.

     The previous version *returned* the ``DropItem`` instance instead of
     raising it, so the exception object was handed to later pipeline
     stages as if it were an item.

     Args:
         item: The scraped real-estate item.
         spider: The spider that produced the item (unused).

     Returns:
         The item with its ``area`` field updated.

     Raises:
         DropItem: If the item has no truthy ``area``.
     """
     adapter = ItemAdapter(item=item)
     area = adapter.get("area")
     if not area:
         raise DropItem(
             f"Missing field in real estate at {adapter.get('source')}")
     adapter.update({"area": self.getSquare(str(area))})
     return adapter.item
コード例 #10
0
 def process_item(self, item, spider):
     """Write accepted items to ``self.file`` as JSON lines.

     An item is accepted when it has a price, passes
     ``is_cheaper_then_max`` and passes ``is_available``. The previous
     version implicitly returned ``None`` for priced items that failed
     the last two checks; they are now dropped explicitly.

     Args:
         item: The scraped item to check and export.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item after it has been written.

     Raises:
         DropItem: If the price is missing, or the item is too expensive
             or unavailable.
     """
     adapter = ItemAdapter(item)
     price = adapter.get('price')
     if not (price and is_cheaper_then_max(price)
             and is_available(adapter.get('availability'))):
         raise DropItem("Missing price or item is not available")

     self.file.write(json.dumps(adapter.asdict()) + "\n")
     return item
コード例 #11
0
 def process_item(self, item, spider):
     """Replace the ``value`` field with the result of ``self.getPrice``.

     The previous version *returned* the ``DropItem`` instance instead of
     raising it, so the exception object was handed to later pipeline
     stages as if it were an item.

     Args:
         item: The scraped real-estate item.
         spider: The spider that produced the item (unused).

     Returns:
         The item with its ``value`` field updated.

     Raises:
         DropItem: If the item has no truthy ``value``.
     """
     adapter = ItemAdapter(item=item)
     raw_value = adapter.get("value")
     if not raw_value:
         raise DropItem(
             f"Missing real estate in paper at {adapter.get('source')}")

     # Both arguments are stringified, so a missing area reaches
     # getPrice as the string "None".
     value = str(raw_value)
     area = str(adapter.get("area"))
     adapter.update({"value": self.getPrice(value, area)})
     return adapter.item
コード例 #12
0
 def process_item(self, item, spider):
     """Drop real-estate items that lack any required field.

     The previous version *returned* the ``DropItem`` instance instead of
     raising it, so incomplete items were never actually dropped.

     Args:
         item: The scraped real-estate item.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item when every required field is truthy.

     Raises:
         DropItem: If any required field is missing or falsy.
     """
     adapter = ItemAdapter(item=item)
     required = (
         'title', 'value', 'area', 'address', 'ward', 'district',
         'province', 'type', 'description', 'sellerName', 'time', 'image',
     )
     if not all(adapter.get(name) for name in required):
         raise DropItem(
             f"Missing field in real estate at {adapter.get('source')}")
     return item
コード例 #13
0
    def process_item(self, item, spider):
        """Strip the ``BY:`` prefix from the author of a titled item.

        Args:
            item: The scraped item; its author may be modified in place.
            spider: The spider that produced the item (unused).

        Returns:
            The same item.

        Raises:
            DropItem: If the item has no truthy ``title``.
        """
        fields = ItemAdapter(item)
        if not fields.get("title"):
            raise DropItem(f"Missing title in {item}")

        if fields.get("author"):
            found = self.re.match(r"BY:\u00a0 ([^>]+)", fields["author"])
            # Keep the author as it is when the prefix pattern is absent.
            if found:
                cleaned = found.group(1)
            else:
                cleaned = fields["author"]
            fields["author"] = cleaned

        return item
コード例 #14
0
 def process_item(self, item, spider):
     """Store the business unless its ``business_id`` already exists.

     Args:
         item: The scraped business item.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item, whether or not a row was inserted.
     """
     fields = ItemAdapter(item)

     existing_id = (
         self.session.query(Business.id)
         .filter_by(business_id=fields.get('business_id'))
         .scalar()
     )
     if existing_id is not None:
         return item

     # 'address' and 'schedule' are stored as JSON strings.
     business = Business(
         title=fields.get('title'),
         direct_url=fields.get('direct_url'),
         business_id=fields.get('business_id'),
         main_img_url=fields.get('main_img_url'),
         phone=fields.get('phone'),
         email=fields.get('email'),
         address=json.dumps(fields.get('address')),
         average_rating=fields.get('average_rating'),
         review_count=fields.get('review_count'),
         categories=fields.get('categories'),
         site=fields.get('site'),
         schedule=json.dumps(fields.get('schedule')),
         description=fields.get('description'),
         amenities=fields.get('amenities'),
     )
     self.session.add(business)
     self.session.flush()
     self.session.commit()
     self.session.close()
     return item
コード例 #15
0
 def process_item(self, item, spider):
     """Export cars that are undamaged and have plausible mileage.

     Args:
         item: The scraped car item.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item after it has been exported.

     Raises:
         DropItem: If ``damaged`` is ``True`` or the mileage exceeds
             1 000 000.
     """
     fields = ItemAdapter(item)

     # Damaged cars are not saved.
     if fields.get('damaged') is True:
         raise DropItem(f'Dropping - {item} - damaged was True!')

     # Cars with more than 1 000 000 km of mileage are not saved.
     if fields.get('mileage') and int(fields.get('mileage')) > 1000000:
         raise DropItem(
             f'Dropping - {item} - mileage is over 1 000 000 KMs!')

     self.exporter.export_item(item)
     self.file.write(b'\n')
     return item
コード例 #16
0
 def process_item(self, item, spider):
     """Normalise the ``time`` field of a real-estate item.

     Spaces and the ``ngàyđăng:`` label are removed, and the relative
     words ``hômnay`` / ``hômqua`` are replaced by today's / yesterday's
     date in ``dd/mm/YYYY`` format.

     The previous version *returned* the ``DropItem`` instance instead of
     raising it. The current date is also read only once now, so "today"
     and "yesterday" cannot disagree across midnight.

     Args:
         item: The scraped real-estate item.
         spider: The spider that produced the item (unused).

     Returns:
         The item with its ``time`` field normalised.

     Raises:
         DropItem: If the item has no truthy ``time``.
     """
     adapter = ItemAdapter(item=item)
     raw_time = adapter.get("time")
     if not raw_time:
         raise DropItem(
             f"Missing field in real estate at {adapter.get('source')}")

     current_day = date.today()
     today = current_day.strftime("%d/%m/%Y")
     yesterday = (current_day - timedelta(days=1)).strftime("%d/%m/%Y")

     # Spaces are stripped first, which is why the markers below are
     # written without spaces.
     normalised = (
         str(raw_time)
         .replace(" ", "")
         .replace("ngàyđăng:", "")
         .replace("hômnay", today)
         .replace("hômqua", yesterday)
     )
     adapter.update({"time": normalised})
     return adapter.item
コード例 #17
0
 def process_item(self, item, spider):
     """Persist a time-series observation and record its date.

     Args:
         item: The scraped item; must be a ``TimeseriesItem`` with truthy
             ``date`` and ``value`` fields.
         spider: The running spider; its name is the record id.

     Returns:
         The unchanged item after it has been merged and committed.

     Raises:
         DropItem: If the item is not a ``TimeseriesItem`` or lacks a
             date or value.
     """
     fields = ItemAdapter(item)
     is_valid = (isinstance(item, TimeseriesItem)
                 and fields.get('date')
                 and fields.get('value'))
     if not is_valid:
         raise DropItem(f"Invalid data {item}")

     observed_on = fields['date']
     observed_value = fields['value']
     record = models.MacroData(id=spider.name,
                               date=observed_on,
                               value=observed_value)
     self.ldc.update(observed_on)
     self.sess.merge(record)
     self.sess.commit()
     status.update_last_date(spider.name, self.ldc.get())
     return item
コード例 #18
0
ファイル: pipelines.py プロジェクト: KwonL/YOOX-crawler
    def process_item(self, item, spider):
        """Clean the product fields and write the item as a CSV row.

        The previous version built a ``DropItem`` in the ``except`` branch
        without raising it, so items that failed cleaning were still
        written to the CSV. The exception is now raised.

        Args:
            item: The scraped product item; it is modified in place.
            spider: The spider that produced the item (unused).

        Returns:
            The same item after its row has been written.

        Raises:
            DropItem: If any of brand, name, price or image cannot be
                cleaned (for example because the field is missing).
        """
        adapter = ItemAdapter(item)
        try:
            adapter["brand"] = self.clean_text(adapter.get("brand"))
            adapter["name"] = self.clean_text(adapter.get("name"))
            # Commas are removed and the second whitespace-separated token
            # is parsed as the amount (presumably "<currency> <amount>").
            adapter["price"] = int(
                float(adapter.get("price").replace(",", "").split()[1]))
            # Strip the query string from the image URL.
            adapter["image"] = adapter.get("image").split("?")[0]
        except Exception as exc:
            raise DropItem(
                f"Missing essential properties in {item}") from exc

        self.csv.writerow(adapter.asdict())

        return item
コード例 #19
0
 def process_item(self, item, spider):
     """Write the item's content to ``<tag>/<title>.txt``.

     Two defects of the previous version are fixed: the title was
     concatenated with ``'.txt'`` before being checked, so a missing
     title raised ``TypeError``; and the output file was never closed.

     Args:
         item: The scraped item with ``title``, ``content`` and an
             optional ``tag``.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item; nothing is written when it has no title.
     """
     adapter = ItemAdapter(item)
     title = adapter.get('title')
     if not title:
         return item

     # Without a specific tag, files go to novels/*.txt.
     dir_name = adapter.get('tag') or 'novels'
     os.makedirs(dir_name, exist_ok=True)
     new_path = os.path.join(dir_name, title + '.txt')

     # Binding the handle to self.file keeps the attribute available for
     # any other code that reads it, while the with-block guarantees the
     # file is closed after the write.
     with open(new_path, 'w', encoding='utf-8') as self.file:
         self.file.write(adapter.get('content') + '\n')
     return item
コード例 #20
0
    def process_item(self, item, spider):
        """Normalise the title, excerpt and image fields of an item.

        Newlines are removed from the title; newlines, tabs and spaces are
        removed from the excerpt; a missing image becomes ``''``. A missing
        title is now normalised to ``''`` like the other two fields,
        where it previously raised ``AttributeError``.

        Args:
            item: The scraped item; it is modified in place.
            spider: The spider that produced the item (unused).

        Returns:
            The same item.
        """
        adapter = ItemAdapter(item)

        title = adapter.get('title') or ''
        adapter['title'] = title.replace('\n', '')

        excerpt = adapter.get('excerpt') or ''
        adapter['excerpt'] = excerpt.replace('\n',
                                             '').replace('\t',
                                                         '').replace(' ', '')

        image = adapter.get('image')
        adapter['image'] = image if image is not None else ''
        return item
コード例 #21
0
    def process_item(self, item, spider):
        """Tokenise the title and store the non-stopword tokens as tags.

        Args:
            item: The scraped item; its ``tags`` field is set in place.
            spider: The spider that produced the item (unused).

        Returns:
            The same item.
        """
        fields = ItemAdapter(item)

        title = fields.get('title')
        # Titles from the 'baha' source go through convert(..., 'zh-cn')
        # before tokenising.
        if fields.get('source') == 'baha':
            title = convert(title, 'zh-cn')

        fields['tags'] = [
            word for word in jieba.cut(title) if word not in stopwords
        ]
        return item
コード例 #22
0
def get_template_kwargs(item: Any) -> Dict[str, Any]:
    """Collect the forum, topic and comment ids of an item.

    A falsy topic id is derived from the comment id with
    ``get_parent_id``; a falsy forum id is derived the same way from the
    topic id.

    Args:
        item: Any object accepted by ``ItemAdapter``.

    Returns:
        A dict with the keys ``forum_id``, ``topic_id`` and
        ``comment_id``.
    """
    fields = ItemAdapter(item)

    comment_id = fields.get('comment_id')
    topic_id = fields.get('topic_id') or get_parent_id(comment_id)
    forum_id = fields.get('forum_id') or get_parent_id(topic_id)

    return {
        'forum_id': forum_id,
        'topic_id': topic_id,
        'comment_id': comment_id,
    }
コード例 #23
0
 def process_item(self, item, spider):
     """Drop items whose ``ads_job_code`` has been seen before.

     Args:
         item: The scraped job-ad item.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item; items without a job code always pass.

     Raises:
         DropItem: If the job code is already in ``self.ids_seen``.
     """
     job_code = ItemAdapter(item).get('ads_job_code')
     if not job_code:
         return item

     if job_code in self.ids_seen:
         raise DropItem(f"Duplicate item found: {item!r}")
     self.ids_seen.add(job_code)
     return item
コード例 #24
0
 def process_item(self, item, spider):
     """Keep only items written by one of ``self.authors``.

     Args:
         item: The scraped item.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item when its author is wanted.

     Raises:
         DropItem: If the author is not in ``self.authors``.
     """
     if ItemAdapter(item).get("author") not in self.authors:
         raise DropItem("Not from wanted author")
     return item
コード例 #25
0
ファイル: pipelines.py プロジェクト: odelanit/bigaray
 def process_item(self, item, spider):
     """Apply a scraped HTTP status to the matching products.

     Products with the item's ``product_link`` are deleted when the
     status is 404; otherwise each one gets the new status and is saved.

     ``QuerySet.filter()`` never raises ``DoesNotExist`` (it returns an
     empty queryset), so the previous ``except`` branch was unreachable
     and the warning was never logged. The empty case is now detected
     explicitly with ``exists()``.

     Args:
         item: The scraped item with ``product_link`` and ``status``.
         spider: The spider that produced the item (unused).

     Returns:
         The unchanged item.
     """
     adapter = ItemAdapter(item)
     product_link = adapter.get('product_link')
     status = adapter.get('status')

     products = Product.objects.filter(product_link=product_link)
     if not products.exists():
         logger = logging.getLogger(__name__)
         logger.warning("Product doesn't exist: {}".format(product_link))
         return item

     if status == 404:
         products.delete()
     else:
         # Saved one by one rather than bulk-updated, so model save()
         # logic still runs for each product.
         for product in products:
             product.status = status
             product.save()
     return item
コード例 #26
0
    def process_item(self, item, spider):
        """Drop furnished properties.

        Args:
            item: The scraped property item.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item when ``furnished`` is missing or falsy.

        Raises:
            DropItem: If ``furnished`` is truthy.
        """
        is_furnished = ItemAdapter(item).get('furnished', False)
        if is_furnished:
            raise DropItem(f"Furnished property: {item!r}")
        return item
コード例 #27
0
 def process_item(self, item, spider):
     """Refine the ``type`` field, falling back to the classifier.

     The raw type goes through ``self.get_refine_type``; when that
     yields ``"UNKNOW"``, ``self.classifier`` predicts the type from the
     concatenated title and description.

     The previous version *returned* the ``DropItem`` instance instead of
     raising it, and used a local named ``type`` that shadowed the
     builtin.

     Args:
         item: The scraped item.
         spider: The spider that produced the item (unused).

     Returns:
         The item with its ``type`` field updated.

     Raises:
         DropItem: If the title or the description is missing or falsy.
     """
     adapter = ItemAdapter(item=item)
     if not (adapter.get('title') and adapter.get('description')):
         raise DropItem(
             f"Missing field in paper at {adapter.get('source')}")

     refined_type = self.get_refine_type(str(adapter.get("type")))
     if refined_type == "UNKNOW":
         text = [
             str(adapter.get('title')) +
             str(adapter.get('description'))
         ]
         # predict() receives a one-element list; its first result is used.
         refined_type = self.classifier.predict(text)[0]
     adapter.update({'type': refined_type})
     return adapter.item
コード例 #28
0
    def _exporter_for_item(self, item):
        """Return the JSON exporter for the item's title, creating it once.

        On the first item with a given title, a ``.json`` file is opened
        in the articles dump directory and a started ``JsonItemExporter``
        for it is cached in ``self.title_to_exporter``.

        Args:
            item: The scraped article item with ``title``, ``pubtime``,
                ``source`` and ``url`` fields.

        Returns:
            The exporter registered for the item's title.
        """
        fields = ItemAdapter(item)

        title = str(fields.get('title'))
        day_stamp = datetime.strftime(fields.get('pubtime'), "%Y_%m_%d")
        source = str(fields.get('source'))
        url = str(fields.get("url"))

        if title in self.title_to_exporter:
            return self.title_to_exporter[title]

        if source == "artagenda":
            # These files are named after the last URL path segment.
            stem = f"{day_stamp}_{url.split('/')[-1]}_{source}"
        else:
            # Slicing caps long titles at 55 characters and leaves shorter
            # ones intact.
            stem = f"{day_stamp}_{title[:55]}_{source}"

        filename = f"{filename_clean(stem)}.json"
        filepath = os.path.join(
            os.path.expanduser('~'),
            'PycharmProjects/art_scraper/artscraper/articles_dump',
            filename)

        print("Saved:   " + filename)
        # The close helpers run before the new file is opened (presumably
        # shutting down the previous exporter and file).
        self.close_exporters()
        self.close_files()
        handle = open(filepath, 'wb')
        self.files.append(handle)
        exporter = JsonItemExporter(handle)
        exporter.export_empty_fields = True
        exporter.start_exporting()
        self.title_to_exporter[title] = exporter
        return exporter
コード例 #29
0
    def process_item(self, item, spider):
        """Print the item and drop it when it has no product price.

        Args:
            item: The scraped product item.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item when ``product_price`` is truthy.

        Raises:
            DropItem: If ``product_price`` is missing or falsy.
        """
        fields = ItemAdapter(item)
        print('In Process Item', fields.asdict())

        if not fields.get('product_price'):
            raise DropItem("Missing price in %s" % item)
        return item
コード例 #30
0
    def process_item(self, item, spider):
        """Save the plain text of the item's HTML content to its path.

        Args:
            item: The scraped item with ``content``, ``url`` and ``path``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item after the file has been written.

        Raises:
            DropItem: If the item has no truthy ``content``.
        """
        fields = ItemAdapter(item)

        content = fields.get('content')
        url = fields.get('url')
        path = fields.get('path')

        if not content:
            raise DropItem(f"Missing content in {url}")

        self.log(f"Processing Item {fields.get('url')}", logging.INFO)
        text = BeautifulSoup(content, 'html.parser').get_text()
        # Create the parent directories before writing.
        Path(path).parent.absolute().mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding="utf-8") as out:
            out.write(text)
            self.log(f'Saved file {path}', logging.INFO)
        return item