Beispiel #1
0
def delete(db, docid):
    """Delete the document stored under ``docid`` from ``db``, best effort.

    The document is fetched with ``db[docid]`` and handed to ``db.delete``.
    Any ordinary failure while doing so is logged as a warning and
    swallowed, so callers never see an exception for a failed delete.

    Args:
        db: database handle supporting ``db[docid]``, ``db.delete(doc)`` and
            a ``name`` attribute (used only in the warning message).
        docid: id of the document to delete.

    Returns:
        None, whether or not the delete succeeded.
    """
    try:
        db.delete(db[docid])
    except Exception:
        # Catch ``Exception`` rather than everything: KeyboardInterrupt and
        # SystemExit must still be able to stop the program.
        logging.warning("Can not delete doc %s in %s", docid, db.name)
    else:
        # Lazy logging arguments keep a formatting problem (e.g. a tuple
        # docid) from being misreported as a failed delete.
        logging.info("Doc %s is deleted!", docid)
Beispiel #2
0
def _upsert_batch_with_fallback(toDb, batch):
    """Write ``batch`` (a dict of id -> doc) to ``toDb``; return its size.

    The whole batch is sent with ``upsert_multi`` first.  If that raises
    ``TemporaryFailError`` the documents are written one at a time with
    ``upsert`` instead; an error from a single ``upsert`` propagates.
    """
    try:
        toDb.upsert_multi(batch)
    except TemporaryFailError:
        logging.warning(
            "Connection timeout. Try to break and update batch")
        for key, value in batch.items():
            toDb.upsert(key, value)
    return len(batch)


def copy_couchdb_to_couchbase(fromDb, toDb, batch_size=10000):
    """Copy every document paged out of ``fromDb`` into ``toDb``.

    Documents come from ``couch_util.get_pager(fromDb)``.  Each one has its
    ``_rev`` field removed (presumably the CouchDB revision, which has no
    meaning in the target store -- confirm) and is keyed by its ``_id``.
    Documents are written in batches of ``batch_size``; progress is logged
    after every batch and ``Done`` is printed at the end.

    Args:
        fromDb: source database, passed as-is to ``couch_util.get_pager``.
        toDb: target bucket supporting ``upsert_multi(dict)`` and
            ``upsert(key, value)``.
        batch_size: number of documents to accumulate before flushing.

    Raises:
        KeyError: if a document has no ``_rev`` or ``_id`` field.
    """
    batch = {}
    cnt = 0
    for doc in couch_util.get_pager(fromDb):
        del doc['_rev']
        batch[doc['_id']] = doc
        # ``>=`` so that a flushed batch holds exactly ``batch_size`` docs.
        if len(batch) >= batch_size:
            cnt += _upsert_batch_with_fallback(toDb, batch)
            batch = {}
            logging.info("Copied %s docs", cnt)
    if batch:
        # Flush the final, partially filled batch the same way as the others.
        cnt += _upsert_batch_with_fallback(toDb, batch)
        logging.info("Copied %s docs", cnt)
    print("Done")
Beispiel #3
0
def createOrMergeBatch(db, doc_batch):
    """Create the documents in ``doc_batch``, merging on conflict.

    The batch is sent with a single ``db.update`` call.  If that call fails
    the documents are re-sent one at a time.  Each response is a
    ``(success, docid, rev_or_exc)`` tuple paired positionally with the
    input document.  For a failed response whose ``docid`` differs from the
    document's own ``_id``, the stored document ``db[docid]`` is loaded,
    merged with ``mergeDoc`` and written back when ``mergeDoc`` returns a
    truthy value (presumably "something changed" -- confirm in ``mergeDoc``).

    Args:
        db: database supporting ``update(list)``, ``db[docid]`` and item
            assignment.
        doc_batch: list of couchdb.Document objects.

    Raises:
        AssertionError: if ``doc_batch`` is not a list, or a failed response
            carries something other than a ``ResourceConflict``.
    """
    assert isinstance(doc_batch, list), "Bad input %s" % type(doc_batch)

    try:
        responses = db.update(doc_batch)
    except Exception:
        # The batch may simply be too large: retry one document at a time.
        # Only ``Exception`` is caught so Ctrl-C / SystemExit are honoured
        # instead of triggering the retry loop.
        logging.warning(
            "Error with doc batch of size %s. Try to break it down",
            len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))

    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if success:
            continue
        assert isinstance(rev_or_exc, ResourceConflict)
        if docid == doc["_id"]:
            continue  # same doc, updated twice.
        logging.info("Merging doc %s with %s", doc["_id"], docid)
        newDoc = db[docid]
        if mergeDoc(newDoc, doc):
            db[docid] = newDoc
Beispiel #4
0
def deleteDocsByIds(db, docids):
    """Remove all ``docids`` from ``db`` in one ``remove_multi`` call.

    ``remove_multi`` is invoked with ``quiet=True`` and its result is
    indexed by document id.  Every id whose result code ``rc`` equals
    ``0xD`` is treated as "key not found" and logged as a warning; the
    number of remaining ids is then logged as the deleted count.
    """
    outcome = db.remove_multi(docids, quiet=True)
    missing = 0
    for key in docids:
        if outcome[key].rc != 0xD:
            continue
        missing += 1
        logging.warning("Not found key %s to delete" % key)
    deleted = len(docids) - missing
    logging.info("Deleted %d docs" % deleted)
Beispiel #5
0
def getDb(server, dbname, new=False):
    """Return the database ``dbname`` from ``server``.

    Args:
        server: server object supporting ``server[dbname]``,
            ``server.delete(dbname)`` and ``server.create(dbname)``.  A
            string is still accepted but deprecated; it is wrapped in
            ``couchdb.Server``.
        dbname: name of the database.
        new: when true, any existing database of that name is deleted and a
            fresh one is created and returned.

    Returns:
        The existing database, or the newly created one when ``new`` is true.
    """
    if isinstance(server, str):
        logging.warning("getDb() with server string is deprecated, please " +
                        "pass a  Server object instead")
        server = couchdb.Server(server)
    if not new:
        return server[dbname]
    try:
        server.delete(dbname)
    except Exception:
        # Best effort: a failed delete is assumed to mean there was nothing
        # to delete.  Only ``Exception`` is caught so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        logging.error('Database %s not found!', dbname)
    return server.create(dbname)
Beispiel #6
0
def createOrUpdateBatch(db, doc_batch):
    """Create the documents in ``doc_batch``, updating those that conflict.

    The batch is sent with a single ``db.update`` call; if that call fails
    the documents are re-sent one at a time.  Every document whose response
    reports failure is collected, the stored versions are fetched with
    ``getDocsByIds``, their ``_rev`` is copied onto the failed documents and
    those are sent again.  Updates that still fail are logged as errors.

    ``getDocsByIds`` is assumed to return documents in the same order as the
    requested keys -- TODO confirm; a mismatching ``_id`` is logged and that
    document is re-sent without a refreshed ``_rev``.

    Args:
        db: database supporting ``update(list)``.
        doc_batch: list of couchdb.Document objects.

    Raises:
        AssertionError: if ``doc_batch`` is not a list, or a failed response
            carries something other than a ``ResourceConflict``.
    """
    assert isinstance(doc_batch, list), "Bad input %s" % type(doc_batch)
    try:
        responses = db.update(doc_batch)
    except Exception:
        # The batch may simply be too large: retry one document at a time.
        # Only ``Exception`` is caught so Ctrl-C / SystemExit are honoured.
        logging.warning(
            "Error with doc batch of size %s. Try to break it down",
            len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))

    failed_docs = []
    failed_keys = []
    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if not success:
            assert isinstance(rev_or_exc, ResourceConflict)
            logging.warning("  ---  try updating %s", repr(docid))
            failed_keys.append(docid)
            failed_docs.append(doc)

    if not failed_docs:
        # Nothing conflicted: skip the lookup and the empty second update.
        return

    existing_docs = getDocsByIds(db, failed_keys)
    for existing_doc, failed_doc in zip(existing_docs, failed_docs):
        if existing_doc["_id"] != failed_doc["_id"]:
            logging.warning("mismatch docid %s != %s",
                            existing_doc["_id"], failed_doc["_id"])
            continue
        # Copy _rev so that we can update a new version.
        failed_doc["_rev"] = existing_doc["_rev"]

    responses = db.update(failed_docs)
    num_failed = 0
    for (success, docid, exc) in responses:
        if not success:
            logging.error('Can not update %s %s', repr(docid), repr(exc))
            num_failed += 1
    if num_failed:
        logging.error("%d out of %d updates failed",
                      num_failed, len(responses))
Beispiel #7
0
def delete(db, docid):
    """Remove the key ``docid`` from ``db``.

    A ``NotFoundError`` raised by ``db.remove`` is logged as a warning
    instead of being propagated; any other exception is left untouched.
    """
    try:
        db.remove(docid)
    except NotFoundError:
        message = "Not found key %s to delete" % docid
        logging.warning(message)
Beispiel #8
0
def createOrUpdate(db, docid, doc):
    """Upsert ``doc`` under ``docid`` and return whatever ``db.upsert`` returns.

    When ``db.upsert`` raises ``KeyExistsError`` a warning is logged and
    ``False`` is returned instead.
    """
    try:
        outcome = db.upsert(docid, doc)
    except KeyExistsError:
        logging.warning("Locking currently %s", docid)
        outcome = False
    return outcome
Beispiel #9
0
def main(_):
    """Render the spider and storage-spider source files for one spider.

    Reads its settings from ``flags``: ``spider_name``, ``spider_template``,
    ``storage_spider_template``, ``output_storage_py``, ``over_write`` and
    either ``output_spider_py`` or ``output_dir``.  The spider record
    returned by ``getSpider`` is copied into a ``spec`` dict, missing keys
    are filled with defaults, and both templates are rendered with
    ``{'spider': spec}``.

    The storage file is always written.  The spider file is written only
    when it does not exist yet or ``over_write`` is the string ``'true'``.

    Args:
        _: unused (presumably the argv handed over by the app runner).

    Raises:
        AssertionError: if neither ``output_spider_py`` nor ``output_dir``
            is set.
    """
    # NOTE(review): the banner reads the name from ``config`` while the rest
    # of the function uses ``flags`` -- confirm both hold the same value.
    print("Generate template for spider %s" % config.get("spider_name"))
    assert flags.get("output_spider_py") or flags.get("output_dir")
    output_spider_py = flags.get("output_spider_py")
    # NOTE(review): ``output_storage_py`` is not validated but is opened
    # unconditionally below; an unset flag fails at that point.
    output_storage_py = flags.get("output_storage_py")
    spider_name = flags.get("spider_name")
    db = getSpider(spider_name)

    # Read the templates through context managers so the handles are closed.
    with open(flags.get("spider_template")) as template_file:
        tpl_spider = template.Template(template_file.read())
    with open(flags.get("storage_spider_template")) as template_file:
        tpl_storage_spider = template.Template(template_file.read())

    if not output_spider_py:
        # get output_spider_py from output_dir/spider_name.py instead.
        output_spider_py = os.path.join(flags.get("output_dir"),
                                        getFileNameFromSpiderName(spider_name))
    over_write = flags.get("over_write", 'false')
    name_module = getFileNameFromSpiderName(spider_name).replace('.py', '')

    spec = dict(db)
    print("Spider spec :")
    spec["name_module"] = name_module
    # copy the xpath fields into "xpath"
    if "xpath" not in spec:
        spec["xpath"] = {}
        for field in ('name', 'price', 'category', 'description', 'images',
                      'canonical', 'base_url', 'brand', 'in_stock',
                      'guarantee', 'promotion'):
            if field in spec:
                spec["xpath"][field] = spec[field]

    if 'allowed_domain' not in spec:
        spec["allowed_domain"] = getDomain(spec["start_url"])
    if 'spider_class' not in spec:
        spec['spider_class'] = getClassNameFromDomain(spec['allowed_domain'])
    spec.setdefault('item_url_pattern', '')
    spec.setdefault('follow_link_pattern', '')
    spec.setdefault('all_links_pattern', '')

    # Exactly one of the two hashtag values is "#", chosen by whether an
    # all-links pattern is configured.
    if spec['all_links_pattern'] != '':
        spec['hashtag_no_rule'] = '#'
        spec['hashtag_all_rule'] = ''
    else:
        spec['hashtag_all_rule'] = "#"
        spec['hashtag_no_rule'] = ""

    if 'type' not in spec or spec['type'] == "":
        spec['type'] = 'crawl'
    spec['detail_module'] = SPIDER_TYPE[spec['type']]['detail_module']
    spec['detail_class'] = SPIDER_TYPE[spec['type']]['detail_class']

    pprint.pprint(spec, indent=4)
    output_spider = tpl_spider.render(template.Context({'spider': spec}))
    output_storage = tpl_storage_spider.render(
        template.Context({'spider': spec}))

    with open(output_storage_py, "w") as storage_file:
        storage_file.write(output_storage)
    msg = "Scraper written to %s" % output_storage_py
    # An existing spider file is only replaced when over_write is 'true'.
    if over_write == 'true' or not os.path.exists(output_spider_py):
        with open(output_spider_py, "w") as spider_file:
            spider_file.write(output_spider)
        msg += " and %s" % output_spider_py
    else:
        logging.warning("Spider file exists: %s", output_spider_py)
    logging.info(msg)
Beispiel #10
0
 def check_item(self, item):
     """Validate a scraped item; return it, or None when it is rejected.

     Steps, in order:
       * a None item is returned unchanged (as None);
       * when a non-empty 'canonical' list is present and its first entry
         differs from 'origin_url', 'origin_url' is replaced by it and 'url'
         is rebuilt with ``self.add_tracking_code``;
       * the item is rejected when 'name' is missing, None or empty, when
         any name is in BLACK_PRODUCT_NAME, or when any name contains
         (case-insensitively) an entry of BLACK_PRODUCT_NAME_CONTENT;
       * the item is rejected when 'origin_url' equals an entry of
         BLACK_PRODUCT_URL;
       * the item is rejected when it has neither 'images' nor 'price'.

     Every rejection is logged as a warning.  The item is modified in
     place, even when it ends up being rejected by a later check.
     """
     if item is None:
         return None

     canonicals = item['canonical'] if 'canonical' in item else None
     if canonicals is not None and len(canonicals) > 0:
         canonical_url = canonicals[0]
         if canonical_url != item['origin_url']:
             logging.warning("=======> Item duplicate: " + item['url'] +
                             ", we re-update it")
             item['origin_url'] = canonical_url
             item['url'] = self.add_tracking_code(canonical_url)

     names = item['name'] if 'name' in item else None
     if names is None or len(names) == 0:
         logging.warning(
             "=======> Item expired because invalid name: " +
             item['url'])
         return None

     for name in names:
         if name in BLACK_PRODUCT_NAME:
             logging.warning(
                 "=======> Item expired because name in black product name: "
                 + name + " at " + item['url'])
             return None
         if any(fragment.lower() in name.lower()
                for fragment in BLACK_PRODUCT_NAME_CONTENT):
             logging.warning(
                 "=======> Item expired because name contain black product name content: "
                 + name + " at " + item['url'])
             return None

     if any(item['origin_url'] == blocked for blocked in BLACK_PRODUCT_URL):
         logging.warning(
             "=======> Item expired because name in black product link: "
             + item['url'])
         return None

     if not ('images' in item or 'price' in item):
         logging.warning(
             "=======> Item expired because invalid images and price: "
             + item['url'])
         return None

     return item