def process_item(self, item, spider):
    """Derive the 'category' field (Men/Women/Both) from the item's 'sport'.

    Also strips the "Coaching Staff" suffix from the stored sport name.
    """
    adapter = ItemAdapter(item)
    category = adapter['sport']
    mens_sports = ['Baseball', 'Football']
    womens_sports = ['Softball', 'Field Hockey']
    if "Men's" in category:
        adapter['category'] = "Men"
    elif "Women's" in category:
        adapter['category'] = "Women"
    else:
        # BUG FIX: previously a men's-only sport first set "Men" and was then
        # unconditionally overwritten with "Women" by the second if/else.
        # Nest the checks so: both lists -> "Both", men's-only -> "Men",
        # otherwise -> "Women".
        if any(sport in category for sport in mens_sports):
            if any(sport in category for sport in womens_sports):
                adapter['category'] = "Both"
            else:
                adapter['category'] = "Men"
        else:
            adapter['category'] = "Women"
    adapter['sport'] = category.replace("Coaching Staff", "")
    return item
def _exporter_for_item(self, item, spider):
    """Return the JsonItemExporter for this spider's person, creating it on first use.

    The exporter (and its open file handle) is cached in ``self.exporter_dict``
    keyed by the spider's first name, so repeated items reuse it.
    """
    pipe_logger.debug("Getting first and last names")
    first_name = getattr(spider, "first").strip().lower()
    last_name = getattr(spider, "last").strip().lower()
    # BUG FIX: the original created a fresh exporter (re-opening and
    # truncating the output file) for *every* item; only create one when
    # none is cached yet.  The unused `record` local was removed.
    if first_name not in self.exporter_dict:
        pipe_logger.debug("Opening file")
        project_dir = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        file_path = os.path.join(
            project_dir, "scrapy_output",
            '{last}_{first}.json'.format(last=last_name, first=first_name))
        # NOTE(review): the handle stays open for the exporter's lifetime;
        # it is presumably finished/closed in close_spider — confirm.
        f = open(file_path, 'wb')
        pipe_logger.debug("Exporting file")
        exporter = JsonItemExporter(f)
        exporter.start_exporting()
        self.exporter_dict[first_name] = exporter
    return self.exporter_dict[first_name]
def close_spider(self, spider):
    """Flush all buffered items to the CSV file and close it.

    Writes a header only when the file is empty (tell() == 0).
    """
    # BUG FIX: `self.items is not []` compared identity against a brand-new
    # list and was therefore always True; use truthiness instead.
    if self.items:
        for item in self.items:
            adapter = ItemAdapter(item)
            columns = adapter.field_names()
            writer = csv.DictWriter(self.file,
                                    fieldnames=columns,
                                    restval='',
                                    extrasaction='ignore',
                                    delimiter=',',
                                    quoting=csv.QUOTE_NONNUMERIC,
                                    quotechar="\"")
            if self.file.tell() == 0:
                writer.writeheader()
            writer.writerow(adapter.asdict())
    self.file.close()
def process_item(self, item, spider):
    """Export each <table> found in the item's 'table_html' selector as a TSV file.

    Files are written under FILES_STORE/<out_dir>/table-<n>.tsv, numbered from 1.
    """
    adapter = ItemAdapter(item)
    html = adapter.get('table_html')
    if html is not None:
        # enumerate(start=1) keeps the original table-<n>.tsv numbering;
        # the manual counter and a dead `pass` statement were removed.
        for i, table in enumerate(html.css("table"), start=1):
            path = os.path.join(spider.settings['FILES_STORE'],
                                adapter['out_dir'],
                                'table-%i.tsv' % i)
            with open(path, "w") as file:
                for tr in table.css('tr'):
                    # one row per <tr>: tab-joined text of its <td><span> cells
                    line = "\t".join(
                        tr.css('td span::text').getall()) + "\n"
                    file.write(line)
            spider.logger.debug('Table Path: %s' % path)
    return item
def test_exporter_custom_serializer(self):
    """A serialize_field override increments 'age' and defers for other fields."""
    class CustomItemExporter(BaseItemExporter):
        def serialize_field(self, field, name, value):
            if name != 'age':
                return super().serialize_field(field, name, value)
            return str(int(value) + 1)

    item = self.item_class(name='John', age='22')
    adapter = ItemAdapter(item)
    exporter = CustomItemExporter()
    # declared-item path: field metadata comes from the adapter
    self.assertEqual(
        exporter.serialize_field(
            adapter.get_field_meta('name'), 'name', adapter['name']),
        'John')
    self.assertEqual(
        exporter.serialize_field(
            adapter.get_field_meta('age'), 'age', adapter['age']),
        '23')
    # plain-dict path: no field metadata available
    plain = {'name': 'John', 'age': '22'}
    self.assertEqual(exporter.serialize_field({}, 'name', plain['name']), 'John')
    self.assertEqual(exporter.serialize_field({}, 'age', plain['age']), '23')
def process_item(self, item, spider):
    """Insert the quote into MongoDB, dropping duplicates by quote content.

    Raises:
        DropItem: if a quote with the same content already exists.
    """
    quote_item = ItemAdapter(item).asdict()
    # FIX: Cursor.count() was deprecated in PyMongo 3.7 and removed in
    # PyMongo 4; count_documents() is the supported replacement.
    quote_exists = self.db[self.mongo_collection].count_documents(
        {"quote_content": quote_item["quote_content"]})
    if quote_exists > 0:
        raise DropItem("Quote already on the database!")
    else:
        logger.debug(
            "Quote added to the database: {0}".format(quote_item["quote_content"])
        )
        self.db[self.mongo_collection].insert_one(quote_item)
        return item
def _get_serialized_fields(self, item, default_value=None, include_empty=None,
                           pre=None, field_filter=None):
    """Yield (field_name, serialized_value) pairs for *item*.

    Variant of BaseItemExporter._get_serialized_fields extended with a
    prefix-based filter: when *field_filter* is truthy and *pre* is given,
    each field's prefixed key (built by the external ``pre_join`` helper)
    is tested against *field_filter* and matching fields are skipped.

    NOTE(review): ``pre_join`` is defined elsewhere — presumably joins
    *pre* and the field name into a dotted/nested key; confirm.
    """
    item = ItemAdapter(item)
    if include_empty is None:
        # fall back to the exporter-level default
        include_empty = self.export_empty_fields
    if self.fields_to_export is None:
        if include_empty:
            # all declared fields, even those without a value on this item
            field_iter = item.field_names()
        else:
            # only fields actually populated on this item
            field_iter = item.keys()
    else:
        if include_empty:
            field_iter = self.fields_to_export
        else:
            field_iter = (x for x in self.fields_to_export if x in item)
    for field_name in field_iter:
        k = None
        if field_filter:
            if pre is not None:
                k = pre_join(pre, field_name)
            # skip fields whose prefixed key is filtered out
            if k in field_filter:
                continue
        if field_name in item:
            field_meta = item.get_field_meta(field_name)
            # pass prefix/filter down so nested serialization can recurse
            value = self.serialize_field(
                field_meta,
                field_name,
                item[field_name],
                pre=k,
                field_filter=field_filter,
            )
        else:
            value = default_value
        yield field_name, value
def process_item(self, item, spider):
    """Persist the scraped article as a WeiXinCnpcNews row when it has content.

    On any failure (including missing content) the session is rolled back
    and the item is passed through unchanged.
    """
    new_item = WeiXinCnpcNews(
        title=item.get('title'),
        author=item.get('author'),
        pre_title=item.get('pre_title'),
        preview_img_link=item.get('preview_img_link'),
        pub_time=item.get('pub_time'),
        content=item.get('content'),
        crawl_time=item.get('crawl_time'),
        url=item.get('url'),
        categories=item.get('categories'),
        images_url=str(item.get('image_urls')),
        images=str(item.get('images')))
    adapter = ItemAdapter(item)
    try:
        if adapter.get('content'):
            spider.session.add(new_item)
            spider.session.commit()
        else:
            # BUG FIX: `raise (f"...")` raised a plain str, which itself fails
            # with "exceptions must derive from BaseException"; raise a real
            # exception so the rollback branch is triggered intentionally.
            raise ValueError(f"Missing content in {item}")
    except Exception:
        # narrowed from bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; keep the original best-effort rollback.
        spider.session.rollback()
    return item
def process_item(self, item, spider):
    """Download every image URL of a ship into ./ship/<Name>/<n>.jpeg.

    Relative URLs are resolved against robertsspaceindustries.com.
    """
    ship_image = ItemAdapter(item).asdict()
    folder = os.getcwd() + f'/ship/{ship_image["Name"]}'
    # exist_ok avoids the racy exists()-then-makedirs() check
    os.makedirs(folder, exist_ok=True)
    i = 0  # stays 0 when there are no URLs, so the log count is correct
    for i, url in enumerate(ship_image['ImageUrls'], start=1):
        path = f'{folder}/{i}.jpeg'
        if not url.startswith('https://'):
            url = 'https://robertsspaceindustries.com' + url
        # BUG FIX: the original issued requests.get(url) twice and threw the
        # first response away — every image was downloaded twice.
        r = requests.get(url)
        with open(path, 'wb') as code:
            code.write(r.content)
    log.info(
        f'Download ship image for {ship_image["Name"]} completed, ' +
        f'count: {i}.')
    return item
def insert_finance_info(self, item):
    """Insert one finance-info record into tbl_finance_info.

    Raises:
        DropItem: when the INSERT fails for any reason.
    """
    sql_statement = '''INSERT INTO tbl_finance_info(
        company_id, year_period, quarter_period, audited_status, code,
        eps, bvps, pe, ros, roea, roaa,
        current_assets, total_assets, liabilities, short_term_liabilities,
        owner_equity, minority_interest, net_revenue, gross_profit,
        operating_profit, profit_after_tax, net_profit)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
    adt = ItemAdapter(item)
    try:
        self.cursor.execute(
            sql_statement,
            (adt.get('companyId'), adt.get('yearPeriod'),
             adt.get('quarterPeriod'), adt.get('auditedStatus'),
             adt.get('code'), adt.get('eps'), adt.get('bvps'),
             adt.get('pe'), adt.get('ros'), adt.get('roea'),
             adt.get('roaa'), adt.get('currentAssets'),
             adt.get('totalAssets'), adt.get('liabilities'),
             adt.get('shortTermLiabilities'), adt.get('ownerEquity'),
             adt.get('minorityInterest'), adt.get('netRevenue'),
             adt.get('grossProfit'), adt.get('operatingProfit'),
             adt.get('profitAfterTax'), adt.get('netProfit')))
    except Exception as exc:
        # FIX: narrowed from a bare `except:` and chained the cause so the
        # underlying DB error is preserved in the traceback.
        raise DropItem('Error') from exc
    finally:
        # NOTE(review): commit runs even after a failed execute (preserved
        # from the original); there is nothing to commit in that case.
        self.conn.commit()
async def process_item(self, item, spider):
    """Insert the item's download URL, file name and referer into test.urls.

    Concrete insert logic — may need to be overridden per project.

    :param item: scraped item with download_url / name / refer fields
    :param spider: the running spider (unused)
    :return: the item, unchanged
    """
    ad = ItemAdapter(item)
    params = {
        "download_url": ad["download_url"],
        "file_name": ad["name"],
        "refer": ad["refer"],
    }
    sql = ("insert into test.urls (download_url,file_name,refer) "
           "values (:download_url,:file_name,:refer)")
    logger.debug("执行sql {0}".format(sql))
    await self.pool.execute(sql, params)
    return item
def __init__(self, item=None, selector=None, response=None, parent=None, **context):
    """Initialize an item loader.

    If no *selector* is given but a *response* is, a selector is built from
    the response.  A default item is created when *item* is None.  Values
    already present on the initial item are seeded into the loader's value
    store.

    NOTE(review): ``self._values`` is not defined in this method —
    presumably a parent-aware mapping (property) declared elsewhere on the
    class; confirm before relying on it.
    """
    if selector is None and response is not None:
        # build a selector lazily from the response when none was supplied
        selector = self.default_selector_class(response)
    self.selector = selector
    context.update(selector=selector, response=response)
    if item is None:
        item = self.default_item_class()
    self.context = context
    self.parent = parent
    # the item is shared into the context under the 'item' key
    self._local_item = context['item'] = item
    self._local_values = defaultdict(list)
    # values from initial item
    for field_name, value in ItemAdapter(item).items():
        self._values[field_name] += arg_to_iter(value)
def process_item(self, item, spider):
    """Accumulate per-item comment fields into the pipeline-level lists.

    Assumes that whenever 'user_name' is present, the rating/date/content
    fields are present too (all list-valued) — TODO confirm against spider.
    """
    adapter = ItemAdapter(item)
    # logging.log(logging.WARNING, adapter.get('user_name'))
    # FIX: compare to None with `is not` (PEP 8) and extend the lists with
    # += instead of rebuilding each one via concatenation.
    if adapter.get('user_name') is not None:
        self.user_names += adapter.get('user_name')
        self.comment_ratings += adapter.get('comment_rating')
        self.comment_dates += adapter.get('comment_date')
        self.comment_contents += adapter.get('comment_content')
        logging.log(logging.INFO, len(self.user_names))
        logging.log(logging.INFO, len(self.comment_ratings))
        logging.log(logging.INFO, len(self.comment_dates))
        logging.log(logging.INFO, len(self.comment_contents))
    return item
def process_item(self, item, spider):
    """Serialize TopicItems to per-topic JSON files; pass other items through."""
    if not isinstance(item, BaseTopicItem):
        # Only handle TopicItems
        return item
    adapter = ItemAdapter(item)
    topic_id = adapter.get('topic_id')
    self.logger.debug(f'exporting TopicItem (id: {topic_id})')
    target = prepare_path(
        base_dir=self.base_dir_path,
        dirname_template=self.dirname_tmplt,
        filename_template=self.filename_tmplt,
        item=item,
    )
    with target.open('w') as out:
        json.dump(adapter.asdict(), out)
    self.logger.debug(f'exported TopicItem (id: {topic_id})')
    return item
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
    """Return the fields to export as an iterable of tuples
    (name, serialized_value).

    ``fields_to_export`` may be None (export everything), a Mapping
    (item field -> output field rename), or a plain iterable of names.
    """
    adapter = ItemAdapter(item)
    if include_empty is None:
        include_empty = self.export_empty_fields
    fields = self.fields_to_export
    if fields is None:
        # no explicit field list: all declared fields, or only populated ones
        field_iter = adapter.field_names() if include_empty else adapter.keys()
    elif isinstance(fields, Mapping):
        if include_empty:
            field_iter = fields.items()
        else:
            field_iter = ((k, v) for k, v in fields.items() if k in adapter)
    elif include_empty:
        field_iter = fields
    else:
        field_iter = (name for name in fields if name in adapter)
    for entry in field_iter:
        # entry is either a bare name or an (item_field, output_field) pair
        if isinstance(entry, str):
            item_field = output_field = entry
        else:
            item_field, output_field = entry
        if item_field in adapter:
            meta = adapter.get_field_meta(item_field)
            value = self.serialize_field(meta, output_field, adapter[item_field])
        else:
            value = default_value
        yield output_field, value
def insert_vietstock_price_board_day(self, item):
    """Insert a price-board row into the table matching the item's resolution.

    Resolution 'D' -> day table, '1' -> minute table, '60' -> hour table.

    Raises:
        DropItem: for an unknown resolution or a failed INSERT.
    """
    adt = ItemAdapter(item)
    resolution_tables = {
        'D': 'tbl_price_board_day',
        '1': 'tbl_price_board_minute',
        '60': 'tbl_price_board_hour',
    }
    tbl = resolution_tables.get(adt.get('res'))
    if tbl is None:
        # BUG FIX: an unknown 'res' previously left `tbl` unbound and the
        # method crashed with NameError; drop the item explicitly instead.
        raise DropItem("Error")
    # `tbl` only ever comes from the whitelist above, so concatenating it
    # into the SQL is safe; the values themselves stay parameterized.
    sql_statement = ('INSERT INTO ' + tbl + '(code, t, o, h, l, c, v) '
                     'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    try:
        self.cursor.execute(
            sql_statement,
            (adt.get('code'), adt.get('t'), adt.get('o'), adt.get('h'),
             adt.get('l'), adt.get('c'), adt.get('v')))
    except Exception as exc:
        # narrowed from bare `except:`; keep the original DropItem signal
        raise DropItem("Error") from exc
    finally:
        self.conn.commit()
    return item
def item_completed(self, results, item, info):
    """Post-download hook: record successful results and upload the file.

    Stores the successful download results on the item, and — when the
    first downloaded file exists on disk — uploads it (license items go to
    a bucket, everything else to storage, which also yields a duration).

    NOTE(review): only ``item['files'][0]`` is processed — presumably each
    item carries exactly one file; confirm against the spider.
    """
    duration_in_seconds = 0
    # tolerate items that have no files_result_field declared
    with suppress(KeyError):
        ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
    if self.is_download_success(item):
        file_stats = item['files'][0]
        file = file_stats['path']
        url = file_stats['url']
        media_file_path = download_path + file
        if os.path.isfile(media_file_path):
            logging.info(str("***File {0} downloaded ***".format(file)))
            if type(item) is LicenseItem:
                self.upload_license_to_bucket(item, media_file_path)
            else:
                # upload returns the media duration used below
                duration_in_seconds = self.upload_file_to_storage(file, item, media_file_path, url)
    else:
        logging.info(str("***File {0} not downloaded ***".format(item["title"])))
    # license items carry no duration field
    if type(item) is not LicenseItem:
        item["duration"] = duration_in_seconds
    return item
def process_item(self, raw_item, spider):
    """Insert or update the scraped item in EduSharing.

    Builds a stable entry UUID from the response URL (falling back to the
    item hash) and delegates persistence to ``insertItem``.
    """
    item = ItemAdapter(raw_item)
    # fallback title, only used by the commented-out legacy code below
    title = "<no title>"
    if "title" in item["lom"]["general"]:
        title = str(item["lom"]["general"]["title"])
    # prefer the response URL as the UUID seed; fall back to the item hash
    entryUUID = EduSharing.buildUUID(item["response"]["url"] if "url" in item["response"] else item["hash"])
    self.insertItem(spider, entryUUID, item)
    logging.info("item " + entryUUID + " inserted/updated")

    # @TODO: We may need to handle Collections
    # if 'collection' in item:
    #     for collection in item['collection']:
    # if dbItem:
    #     entryUUID = dbItem[0]
    #     logging.info('Updating item ' + title + ' (' + entryUUID + ')')
    #     self.curr.execute("""UPDATE "references_metadata" SET last_seen = now(), last_updated = now(), hash = %s, data = %s WHERE source = %s AND source_id = %s""", (
    #         item['hash'],  # hash
    #         json,
    #         spider.name,
    #         str(item['sourceId']),
    #     ))
    # else:
    #     entryUUID = self.buildUUID(item['response']['url'])
    #     if 'uuid' in item:
    #         entryUUID = item['uuid']
    #     logging.info('Creating item ' + title + ' (' + entryUUID + ')')
    #     if self.uuidExists(entryUUID):
    #         logging.warn('Possible duplicate detected for ' + entryUUID)
    #     else:
    #         self.curr.execute("""INSERT INTO "references" VALUES (%s,true,now())""", (
    #             entryUUID,
    #         ))
    #     self.curr.execute("""INSERT INTO "references_metadata" VALUES (%s,%s,%s,%s,now(),now(),%s)""", (
    #         spider.name,  # source name
    #         str(item['sourceId']),  # source item identifier
    #         entryUUID,
    #         item['hash'],  # hash
    #         json,
    #     ))
    return raw_item
def test_exporter_custom_serializer(self):
    """serialize_field override bumps 'age' by one; other fields use the base."""
    class CustomItemExporter(BaseItemExporter):
        def serialize_field(self, field, name, value):
            if name == "age":
                return str(int(value) + 1)
            return super().serialize_field(field, name, value)

    exporter = CustomItemExporter()

    # item-class path: metadata taken from the adapter
    adapter = ItemAdapter(self.item_class(name="John", age="22"))
    for field_name, expected in (("name", "John"), ("age", "23")):
        self.assertEqual(
            exporter.serialize_field(
                adapter.get_field_meta(field_name), field_name,
                adapter[field_name]),
            expected)

    # plain-dict path: empty metadata
    raw = {"name": "John", "age": "22"}
    self.assertEqual(exporter.serialize_field({}, "name", raw["name"]), "John")
    self.assertEqual(exporter.serialize_field({}, "age", raw["age"]), "23")
def process_item(self, item, spider):
    """Derive the 'category' field (Men/Women/Both) from the item's 'sport'.

    The women's list is the subset of sports whose page covers both teams.
    """
    adapter = ItemAdapter(item)
    category = adapter['sport']
    mens_sports = [
        'Baseball', 'Basketball', 'Cross Country', 'Football', 'Golf',
        'Ice hockey', 'Swimming & Diving', 'Tennis', 'Track & Field',
        'Wrestling'
    ]
    womens_sports = ['Cross Country', 'Track & Field']
    if "Men's" in category:
        adapter['category'] = "Men"
    elif "Women's" in category:
        adapter['category'] = "Women"
    else:
        # BUG FIX: previously a men's-only sport first set "Men" and was then
        # unconditionally overwritten with "Women" by the second if/else.
        # Nest the checks: shared sports -> "Both", men's-only -> "Men",
        # otherwise -> "Women".
        if any(sport in category for sport in mens_sports):
            if any(sport in category for sport in womens_sports):
                adapter['category'] = "Both"
            else:
                adapter['category'] = "Men"
        else:
            adapter['category'] = "Women"
    return item
def default(self, o):
    """Serialize objects the stock JSONEncoder cannot handle natively."""
    if isinstance(o, set):
        return list(o)
    # datetime must be checked before date: datetime subclasses date
    if isinstance(o, datetime.datetime):
        return o.strftime(f"{self.DATE_FORMAT} {self.TIME_FORMAT}")
    if isinstance(o, datetime.date):
        return o.strftime(self.DATE_FORMAT)
    if isinstance(o, datetime.time):
        return o.strftime(self.TIME_FORMAT)
    if isinstance(o, (decimal.Decimal, defer.Deferred)):
        return str(o)
    if is_item(o):
        return ItemAdapter(o).asdict()
    if isinstance(o, Request):
        return f"<{type(o).__name__} {o.method} {o.url}>"
    if isinstance(o, Response):
        return f"<{type(o).__name__} {o.status} {o.url}>"
    return super().default(o)
def process_item(self, item, spider):
    """Normalize a housing listing item.

    Splits the raw description field ('描述1', "description 1") into
    district/estate/layout/area/type/floor fields, keeps only the numeric
    part of the unit price ('单价'), and trims the update time ('更新时间')
    to a date string.

    NOTE(review): the slice offsets below assume a fixed source format like
    "楼层:中层(共N层)" for the floor segment — confirm against the spider.
    """
    adapter = ItemAdapter(item)
    if adapter.get('描述1'):
        content1 = adapter['描述1']
        # second chunk: "<district> <estate>" separated by a space
        content1_list2 = content1[1]
        content1_list2_split = content1_list2.split(' ')
        adapter['区域'] = content1_list2_split[0]  # district
        adapter['小区'] = content1_list2_split[1]  # residential estate
        # first chunk: comma-separated "key:value" segments
        content1_list1 = content1[0]
        content1_list1_split = content1_list1.split(',')
        adapter['户型'] = content1_list1_split[0].partition(':')[-1]  # layout: text after ':'
        adapter['面积'] = content1_list1_split[1].partition(' ')[0]  # area: text before first space
        adapter['类型'] = content1_list1_split[2].partition(':')[-1]  # property type
        adapter['楼层'] = content1_list1_split[3].partition('(')[0][3:]  # floor: strip 3-char label prefix
        adapter['总层'] = content1_list1_split[3].partition('(')[-1][1:-2]  # total floors: inside the parens
        # the raw description has been fully decomposed; drop it
        adapter.pop('描述1')
    if adapter.get('单价'):
        # unit price: keep only the leading digits
        adapter['单价'] = re.findall(r'^\d+', adapter['单价'])[0]
    if adapter.get('更新时间'):
        # update time: text after ':', trimmed to the 10-char date part
        adapter['更新时间'] = adapter['更新时间'].partition(':')[-1].strip()[:10]
    return item
def default(self, o):
    """Serialize objects the stock JSONEncoder cannot handle natively."""
    if isinstance(o, set):
        return list(o)
    # order matters: datetime is a subclass of date
    if isinstance(o, datetime.datetime):
        return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
    if isinstance(o, datetime.date):
        return o.strftime(self.DATE_FORMAT)
    if isinstance(o, datetime.time):
        return o.strftime(self.TIME_FORMAT)
    if isinstance(o, (decimal.Decimal, defer.Deferred)):
        return str(o)
    if is_item(o):
        return ItemAdapter(o).asdict()
    if isinstance(o, Request):
        return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
    if isinstance(o, Response):
        return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
    return super().default(o)
def process_item(self, item, spider):
    """Sanitize the item's HTML and strip whitespace from its text fields.

    The HTML pass keeps only a small attribute whitelist, then rewrites
    image URLs to their locally stored paths.
    """
    adapter = ItemAdapter(item)
    raw_html = adapter.get('html')
    if raw_html:
        sanitizer = Cleaner(safe_attrs_only=True,
                            safe_attrs={'src', 'alt', 'href', 'title'})
        adapter['html'] = w3lib_cleaner(sanitizer.clean_html(raw_html))
    images = adapter.get('images')
    if images:
        # point <img> sources at the downloaded local copies
        for img in images:
            adapter['html'] = adapter['html'].replace(img['url'], img['path'])
    for field in ('h1', 'title', 'author'):
        text = adapter.get(field)
        if text:
            adapter[field] = w3lib_html.strip_html5_whitespace(text)
    return item
def process_item(self, item, spider):
    """Insert topics and comments into MongoDB, skipping existing ids.

    TopicItems are deduplicated by topic_id, CommentItems by comment_id.
    """
    print('Topic Pipeline')
    # FIX: Cursor.count() was removed in PyMongo 4; count_documents() is
    # the supported replacement.  The unused `adapter` local was removed.
    if isinstance(item, TopicItem):
        if self.db['lhub_topic'].count_documents(
                {'topic_id': item['topic_id']}) == 0:
            record_id = self.db['lhub_topic'].insert_one(item).inserted_id
            print("Topic %d insert %s!" % (item['topic_id'], record_id))
        else:
            print("Topic %d exist!" % item['topic_id'])
    elif isinstance(item, CommentItem):
        if self.db['lhub_comment'].count_documents(
                {'comment_id': item['comment_id']}) == 0:
            record_id = self.db['lhub_comment'].insert_one(
                item).inserted_id
            print("Comment %d insert %s!" % (item['comment_id'], record_id))
        else:
            print("Comment %d exist!" % item['comment_id'])
    return item
def process_item(self, raw_item, spider):
    """Normalize the item's license block and parse its date fields.

    Maps known non-canonical license URLs and internal license codes to
    canonical URLs, derives the 'oer' flag for permissive CC licenses, and
    converts expiration/lifecycle dates from strings to datetimes.
    """
    item = ItemAdapter(raw_item)
    # 1) rewrite known URL prefixes to their canonical license URL
    if "url" in item["license"] and not item["license"][
            "url"] in Constants.VALID_LICENSE_URLS:
        for key in Constants.LICENSE_MAPPINGS:
            if item["license"]["url"].startswith(key):
                item["license"]["url"] = Constants.LICENSE_MAPPINGS[key]
                break
    # 2) derive a URL from the internal license code when none (or an
    #    invalid one) is present
    if "internal" in item["license"] and (
            "url" not in item["license"]
            or item["license"]["url"] not in Constants.VALID_LICENSE_URLS):
        for key in Constants.LICENSE_MAPPINGS_INTERNAL:
            if item["license"]["internal"].casefold() == key.casefold():
                # use the first entry
                item["license"][
                    "url"] = Constants.LICENSE_MAPPINGS_INTERNAL[key][0]
                break
    # 3) permissive CC licenses imply OER for everyone
    if "url" in item["license"] and "oer" not in item["license"]:
        if (item["license"]["url"] == Constants.LICENSE_CC_BY_40
                or item["license"]["url"] == Constants.LICENSE_CC_BY_30
                or item["license"]["url"] == Constants.LICENSE_CC_BY_SA_30
                or item["license"]["url"] == Constants.LICENSE_CC_BY_SA_40
                or item["license"]["url"] == Constants.LICENSE_CC_ZERO_10):
            item["license"]["oer"] = OerType.ALL
    # 4) fall back to substring checks on the internal code for OER
    if "internal" in item["license"] and "oer" not in item["license"]:
        internal = item["license"]["internal"].lower()
        if "cc-by-sa" in internal or "cc-0" in internal or "pdm" in internal:
            item["license"]["oer"] = OerType.ALL
    # 5) parse string dates into datetime objects
    if "expirationDate" in item["license"]:
        item["license"]["expirationDate"] = dateparser.parse(
            item["license"]["expirationDate"])
    if "lifecycle" in item["lom"]:
        for contribute in item["lom"]["lifecycle"]:
            if "date" in contribute:
                contribute["date"] = dateparser.parse(contribute["date"])
    return raw_item
def process_item(self, item, spider):
    """Normalize name and price (Brazilian number format), merge optional
    cents, store the item in Mongo, and drop items missing name or price."""
    adapter = ItemAdapter(item)
    if 'name' not in adapter.keys() or 'price' not in adapter.keys():
        raise DropItem(f"{item} without name or price")
    adapter['name'] = [unidecode(adapter['name'][0])]
    adapter['price'] = [unidecode(adapter['price'][0])]
    # "1.234,56" -> "1234.56"
    normalized = adapter['price'][0].replace('.', '').replace(',', '.')
    price = float(normalized)
    if 'cents' in adapter.keys():
        price += float(adapter['cents'][0]) / 100
    adapter['price'] = price
    self.col.insert_one(adapter.asdict())
    print("Exporter:", adapter["name"], "| ", adapter["price"], "|",
          adapter['store'])
    return item
def process_item(self, item, spider):
    """Deduplicate AnimeItems by title, fill missing fields, and insert into the DB.

    Non-AnimeItems (and None) pass through untouched.
    """
    if item is None or not isinstance(item, AnimeItem):
        return item
    logging.critical(
        f"---Anime Pipeline--- \n\r Anime: {item['title']}")
    if item['title'] in self.animes_seen:
        raise DropItem("Repeated Animes found: %s" % item)
    self.animes_seen.add(item['title'])
    # default the optional fields before persisting
    if "season" not in item:
        item["season"] = "None"
    if "aired_end" not in item:
        item["aired_end"] = None
    self.insert_db(ItemAdapter(item).asdict())
    return item
def process_item(self, item, spider):
    """Split the item's address into ward/district/province and keep only
    papers located in Hà Nội.

    Raises:
        DropItem: when the address has fewer than three parts or the
            province is not Hà Nội.
    """
    adapter = ItemAdapter(item=item)
    address = [
        part.strip() for part in str(adapter.get('address')).split(',')
    ]
    if len(address) >= 3:
        # last three comma-separated parts are ward / district / province
        adapter.update({
            'ward': self.extractLocationName(address[-3]),
            'district': self.extractLocationName(address[-2]),
            'province': self.extractLocationName(address[-1])
        })
        if str(adapter.get('province')) == "Hà Nội":
            return adapter.item
    # BUG FIX: DropItem was previously *returned*, which passed the
    # exception object down the pipeline as if it were an item; Scrapy
    # requires it to be raised to actually drop the item.
    raise DropItem(
        f"Missing field in paper at {adapter.get('source')}")
def process_item(self, item, spider):
    """Assemble _id, judicature and participants for a sentence, then persist it."""
    def pop_magistrates(field, rol):
        # Remove *field* from the item and return its names as role dicts.
        names = item.get(field)
        if not names:
            return []
        del item[field]
        return [{"name": name, "rol": rol} for name in names]

    item["_id"] = f'{item["source"]}_{item["sentence_id"]}'
    # judicature: concatenate magistrate roles in their original order
    judicature = []
    for field, rol in (
            ("magistrate", 'PONENTE'),
            ("magistrate_av", 'ACLARACIÓN DE VOTO'),
            ("magistrate_apv", 'ACLARACIÓN PARCIAL DE VOTO'),
            ("magistrate_sv", 'SALVAMENTO DE VOTO'),
            ("magistrate_spv", 'SALVAMENTO PARCIAL DE VOTO')):
        judicature.extend(pop_magistrates(field, rol))
    item["judicature"] = judicature
    # Participants: plaintiff/defendant move into a role list
    participants = []
    for role_field in ("plaintiff", "defendant"):
        name = item.get(role_field)
        if name:
            participants.append({"name": name, "role": role_field.upper()})
            del item[role_field]
    if participants:
        item["participants"] = participants
    self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
    return item