def delete(db, docid):
    try:
        db.delete(db[docid])
        logging.info("Deleted doc %s" % docid)
    except Exception:
        logging.warning("Cannot delete doc %s in %s" % (docid, db.name))
def copy_couchdb_to_couchbase(fromDb, toDb, batch_size=10000):
    batch = {}
    cnt = 0
    for doc in couch_util.get_pager(fromDb):
        # Drop the CouchDB revision; Couchbase manages its own CAS values.
        del doc['_rev']
        batch[doc['_id']] = doc
        if len(batch) > batch_size:
            try:
                toDb.upsert_multi(batch)
                cnt += len(batch)
            except TemporaryFailError:
                logging.warning(
                    "Connection timeout. Breaking the batch into single upserts")
                for key, value in batch.items():
                    toDb.upsert(key, value)
                    cnt += 1
            batch = {}
            logging.info("Copied %s docs" % cnt)
    # Flush whatever is left after the pager is exhausted.
    if len(batch) > 0:
        try:
            toDb.upsert_multi(batch)
            cnt += len(batch)
        except Exception:
            logging.warning(
                "Connection timeout. Breaking the batch into single upserts")
            for key, value in batch.items():
                toDb.upsert(key, value)
                cnt += 1
    logging.info("Copied %s docs" % cnt)
    print "Done"
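# Illustrative usage sketch (not part of the original code). It assumes a
# local CouchDB "products" database and a Couchbase bucket of the same name;
# the connection strings and names are placeholders.
def _example_copy_products():
    import couchdb
    from couchbase.bucket import Bucket

    src_db = couchdb.Server("http://localhost:5984")["products"]
    dst_bucket = Bucket("couchbase://localhost/products")
    copy_couchdb_to_couchbase(src_db, dst_bucket, batch_size=5000)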
def createOrMergeBatch(db, doc_batch):
    """ Create new docs or merge with existing ones, in batch.

    Input is a list of couchdb.Document objects.
    """
    assert type(doc_batch) == list, "Bad input %s" % type(doc_batch)
    # break down doc_batch if doc_batch is too large
    try:
        responses = db.update(doc_batch)
    except Exception:
        logging.warning(
            "Error with doc batch of size %s. Breaking it down" %
            len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))
    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if not success:
            assert type(rev_or_exc) == ResourceConflict
            if docid == doc["_id"]:
                continue  # same doc, updated twice.
            logging.info("Merging doc %s with %s" % (doc["_id"], docid))
            newDoc = db[docid]
            if mergeDoc(newDoc, doc):
                db[docid] = newDoc
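# Illustrative sketch (not part of the original code): conflicting writes are
# resolved through the module's mergeDoc() helper, so callers only hand over
# plain couchdb.Document objects. The document ids and fields are made up.
def _example_merge_batch(db):
    import couchdb
    docs = [couchdb.Document(_id="product:1", price=100),
            couchdb.Document(_id="product:2", price=200)]
    createOrMergeBatch(db, docs)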
def deleteDocsByIds(db, docids):
    oks = db.remove_multi(docids, quiet=True)
    key_not_found = 0
    for docid in docids:
        # rc 0xD is the libcouchbase "key not found" error code.
        if oks[docid].rc == 0xD:
            key_not_found += 1
            logging.warning("Not found key %s to delete" % docid)
    logging.info("Deleted %d docs" % (len(docids) - key_not_found))
def getDb(server, dbname, new=False):
    """ Return a db given a server and db name.

    If new is True, delete the old db and create a new one.
    """
    if type(server) == str:
        logging.warning("getDb() with server string is deprecated, please "
                        "pass a Server object instead")
        server = couchdb.Server(server)
    if new:
        try:
            server.delete(dbname)
        except Exception:
            logging.error('Database %s not found!' % dbname)
        db = server.create(dbname)
    else:
        db = server[dbname]
    return db
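# Illustrative sketch (not part of the original code): calling getDb() with an
# existing Server object and recreating the database from scratch. The server
# URL and database name are placeholders.
def _example_get_db():
    import couchdb
    server = couchdb.Server("http://localhost:5984")
    return getDb(server, "products", new=True)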
def createOrUpdateBatch(db, doc_batch):
    """ createOrUpdate in batch.

    Input is a list of couchdb.Document objects.
    """
    assert type(doc_batch) == list, "Bad input %s" % type(doc_batch)
    # break down doc_batch if doc_batch is too large
    try:
        responses = db.update(doc_batch)
    except Exception:
        logging.warning(
            "Error with doc batch of size %s. Breaking it down" %
            len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))
    failed_docs = []
    failed_keys = []
    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if not success:
            assert type(rev_or_exc) == ResourceConflict
            logging.warning(" --- try updating %r" % docid)
            failed_keys.append(docid)
            failed_docs.append(doc)
    existing_docs = getDocsByIds(db, failed_keys)
    for existing_doc, failed_doc in zip(existing_docs, failed_docs):
        if existing_doc["_id"] != failed_doc["_id"]:
            logging.warning("mismatch docid %s != %s" %
                            (existing_doc["_id"], failed_doc["_id"]))
            continue
        # Copy _rev so that we can update a new version.
        failed_doc["_rev"] = existing_doc["_rev"]
    responses = db.update(failed_docs)
    num_failed = 0
    for (success, docid, exc) in responses:
        if not success:
            logging.error('Cannot update %r %r' % (docid, exc))
            num_failed += 1
    if num_failed:
        logging.error("%d out of %d updates failed" %
                      (num_failed, len(responses)))
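# Note (illustrative, not from the original code): couchdb's Database.update()
# returns one (success, docid, rev_or_exc) tuple per input document, which is
# what the conflict-retry loop above relies on. A minimal sketch:
def _example_update_response(db):
    doc = {"_id": "product:1", "price": 100}
    for success, docid, rev_or_exc in db.update([doc]):
        if not success:
            logging.warning("update of %s failed: %r" % (docid, rev_or_exc))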
def delete(db, docid):
    try:
        db.remove(docid)
    except NotFoundError:
        logging.warning("Not found key %s to delete" % docid)
def createOrUpdate(db, docid, doc):
    try:
        return db.upsert(docid, doc)
    except KeyExistsError:
        logging.warning("Document %s is currently locked", docid)
        return False
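# Illustrative sketch (not part of the original code): the function above
# returns False when upsert() raises KeyExistsError, which this module treats
# as the document being locked, so callers can retry later. The bucket, key
# and value below are placeholders.
def _example_upsert(bucket):
    ok = createOrUpdate(bucket, "product:1", {"price": 100})
    if ok is False:
        logging.info("product:1 is locked, will retry later")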
def main(_): print "Generate template for spider %s" % config.get("spider_name") assert flags.get("output_spider_py") or flags.get("output_dir") # assert flags.get("output_storage_py") output_spider_py = flags.get("output_spider_py") output_storage_py = flags.get("output_storage_py") spider_name = flags.get("spider_name") # spider_id = getMd5(spider_name) db = getSpider(spider_name) tpl_spider = template.Template(open(flags.get("spider_template")).read()) tpl_storage_spider = template.Template( open(flags.get("storage_spider_template")).read()) if not output_spider_py: # get output_spider_py from output_dir/spider_name.py instead. output_spider_py = os.path.join(flags.get("output_dir"), getFileNameFromSpiderName(spider_name)) over_write = flags.get("over_write", 'false') name_module = getFileNameFromSpiderName(spider_name).replace('.py', '') if True: # spec = simplejson.load(open("specs.json")) spec = dict(db) print "Spider spec :" spec["name_module"] = name_module # copy the xpath fields into "xpath" if "xpath" not in spec: spec["xpath"] = {} for field in ('name', 'price', 'category', 'description', 'images', 'canonical', 'base_url', 'brand', 'in_stock', 'guarantee', 'promotion'): if field in spec: spec["xpath"][field] = spec[field] spec['hashtag_all_rule'] = "#" spec['hashtag_no_rule'] = "" if 'allowed_domain' not in spec: spec["allowed_domain"] = getDomain(spec["start_url"]) if 'spider_class' not in spec: spec['spider_class'] = getClassNameFromDomain( spec['allowed_domain']) if 'item_url_pattern' not in spec: spec['item_url_pattern'] = '' if 'follow_link_pattern' not in spec: spec['follow_link_pattern'] = '' if 'all_links_pattern' not in spec: spec['all_links_pattern'] = '' if 'all_links_pattern' in spec and spec['all_links_pattern'] != '': spec['hashtag_no_rule'] = '#' spec['hashtag_all_rule'] = '' if 'type' not in spec or spec['type'] == "": spec['type'] = 'crawl' spec['detail_module'] = SPIDER_TYPE[spec['type']]['detail_module'] spec['detail_class'] = SPIDER_TYPE[spec['type']]['detail_class'] pprint.pprint(spec, indent=4) output_spider = tpl_spider.render(template.Context({'spider': spec})) output_storage = tpl_storage_spider.render( template.Context({'spider': spec})) open(output_storage_py, "w").write(output_storage) msg = "Scraper written to %s" % output_storage_py if over_write == 'true' and os.path.exists( output_spider_py) or not os.path.exists(output_spider_py): open(output_spider_py, "w").write(output_spider) msg += " and %s" % output_spider_py else: logging.warning("Spider file exists: %s", output_spider_py) logging.info(msg)
def check_item(self, item):
    if item is not None:
        if ('canonical' in item and item['canonical'] is not None
                and len(item['canonical']) > 0):
            prop_canonical = item['canonical'][0]
            if prop_canonical != item['origin_url']:
                logging.warning("=======> Item duplicate: " + item['url'] +
                                ", re-updating it")
                item['origin_url'] = prop_canonical
                item['url'] = self.add_tracking_code(prop_canonical)
        if ('name' in item and item['name'] is not None
                and len(item['name']) > 0):
            for name in item['name']:
                if name in BLACK_PRODUCT_NAME:
                    logging.warning(
                        "=======> Item expired because name is in black "
                        "product names: " + name + " at " + item['url'])
                    return None
                for black_name_content in BLACK_PRODUCT_NAME_CONTENT:
                    if black_name_content.lower() in name.lower():
                        logging.warning(
                            "=======> Item expired because name contains "
                            "black product name content: " + name + " at " +
                            item['url'])
                        return None
        else:
            logging.warning(
                "=======> Item expired because of invalid name: " +
                item['url'])
            return None
        for black_url in BLACK_PRODUCT_URL:
            if item['origin_url'] == black_url:
                logging.warning(
                    "=======> Item expired because url is in black product "
                    "links: " + item['url'])
                return None
        if 'images' not in item and 'price' not in item:
            logging.warning(
                "=======> Item expired because of invalid images and price: "
                + item['url'])
            return None
    return item