def process_item(self, item, spider):
    """Clean a scraped ``HotelItem`` in place and hand it to ``LocationEs``.

    Items that are not ``HotelItem`` instances are returned untouched and
    nothing is saved for them. For hotel items every field is rewritten
    according to its key:

    * ``phone`` -- the first entry is searched with
      ``helper.SEL_RE_PHONE_NUMBER``; the first match (or ``u''`` when the
      value is missing/empty or nothing matches) is passed to
      ``helper.rev_telephone``.
    * ``page_breadcrumbs`` -- the last entry is dropped; a missing or
      empty value becomes ``[]``.
    * ``place`` -- passed through ``self.clean_place``.
    * any other key -- passed through ``helper.clean_lf``.

    The cleaned item is then converted with ``self.save_item_to_es`` and
    the result is given to ``LocationEs.save``.

    Args:
        item: The scraped item; mutated in place when it is a
            ``HotelItem``.
        spider: Not used by this method.

    Returns:
        The same ``item`` object that was passed in.
    """
    if not isinstance(item, HotelItem):
        return item

    # Iterate over a snapshot: fields are reassigned inside the loop, and
    # ``items()`` (unlike ``iteritems()``) exists on both Python 2 and 3.
    for key, value in list(item.items()):
        if key == 'phone':
            # A missing phone (None or empty) must not raise; fall back to
            # an empty match list so the normaliser receives u''.
            # NOTE(review): indexing value[0] assumes the field holds a
            # sequence of strings -- confirm against the item loader.
            matches = helper.SEL_RE_PHONE_NUMBER.findall(value[0]) if value else []
            item[key] = helper.rev_telephone(matches[0] if matches else u'')
        elif key == 'page_breadcrumbs':
            # Drop the trailing breadcrumb entry.
            item[key] = value[:-1] if value else []
        elif key == 'place':
            item[key] = self.clean_place(value)
        else:
            item[key] = helper.clean_lf(value)

    LocationEs.save(self.save_item_to_es(item))
    return item
def save_item_to_es(self, item):
    """Build the flat dictionary that is stored for a cleaned hotel item.

    The result contains:

    * ``name_low`` -- ``item['name']`` lower-cased and stripped.
    * ``rating`` / ``popularity`` -- the item values as floats, with a
      falsy value treated as ``0``.
    * ``page_url`` -- the item URL lower-cased.
    * ``page_breadcrumbs``, ``phone``, ``place`` -- copied as they are.
    * ``area1`` .. ``area5`` -- the first five breadcrumb entries,
      stripped, or ``u''`` where the breadcrumb list is too short.
      ``area2`` is replaced by the first stripped match of
      ``helper.CLEAN_STATE`` when that pattern finds anything in it.
    * ``region`` -- ``item['region']`` stripped.
    * ``id`` -- ``LocationEs.get_hash`` of the lower-cased page URL.

    Args:
        item: Mapping-like item providing the keys read above.

    Returns:
        A new ``dict`` with the keys listed above.
    """
    doc = {}
    doc['name_low'] = item['name'].lower().strip()
    doc['rating'] = float(item['rating'] or 0)
    doc['popularity'] = float(item['popularity'] or 0)
    doc['page_url'] = item['page_url'].lower()

    crumbs = item['page_breadcrumbs']
    doc['page_breadcrumbs'] = crumbs
    doc['phone'] = item['phone']

    def crumb_at(position):
        # Stripped breadcrumb at ``position``, or u'' when the list is
        # too short to have one.
        if len(crumbs) > position:
            return crumbs[position].strip()
        return u''

    doc['area1'] = crumb_at(0)

    # Prefer the first pattern match over the raw second breadcrumb.
    second_level = crumb_at(1)
    state_matches = helper.CLEAN_STATE.findall(second_level)
    if state_matches:
        second_level = state_matches[0].strip()
    doc['area2'] = second_level

    for area_key, position in (('area3', 2), ('area4', 3), ('area5', 4)):
        doc[area_key] = crumb_at(position)

    doc['region'] = item['region'].strip()
    doc['place'] = item['place']
    doc['id'] = LocationEs.get_hash(doc['page_url'])
    return doc