Code Example #1
File: extension.py Project: dovanduy/choinho
 def push_log(self, spider_name, stats):
     self.connect_kafka()
     stats['start_time'] = time.mktime(stats['start_time'].timetuple())
     stats['finish_time'] = time.mktime(stats['finish_time'].timetuple())
     stats['spider'] = spider_name
     try:
         #logging.info(stats)
         self.kafka.send_messages(KAFKA_LOG_STATS_TOPIC,
                                  *[json.dumps(stats)])
     except FailedPayloadsError as e:
         logging.error(e)
         logging.info(stats)
     del stats['spider']
     for key in list(stats.keys()):  # copy the keys so entries can be deleted while iterating
         if '.' in key:
             del stats[key]
     stat_spider_old = self.collection_stats.find_one(
         {"spider": spider_name})
     if stat_spider_old:
         stat_spider_new = stat_spider_old
     else:
         stat_spider_new = {'spider': spider_name, 'stats': []}
     if len(stat_spider_new['stats']) < 7:
         stat_spider_new['stats'].append(stats)
     else:
         stat_spider_new['stats'].pop(0)
         stat_spider_new['stats'].append(stats)
     stat_spider_new['last_history'] = stats
     self.collection_stats.update({"spider": spider_name},
                                  {"$set": stat_spider_new},
                                  upsert=True,
                                  multi=False)
Code Example #2
File: couchbase_util.py Project: dovanduy/choinho
def createDb(name,
             user="",
             passwd="",
             ram=100,
             replica=0,
             server="http://localhost:8091/pools/default/buckets"):
    """ Create a new bucket by using system curl command
    """
    # curl -X POST -u username:password -d name=newbucket -d ramQuotaMB=100 -d authType=none
    # -d replicaNumber=1 -d proxyPort=11216 http://localhost:8091/pools/default/buckets
    command = "curl -X POST -u %s:%s -d name=%s -d ramQuotaMB=%s -d authType=sasl " \
                    "-d replicaNumber=%s %s" \
                    % (user, passwd, name, ram, replica, server)
    import commands
    _, output = commands.getstatusoutput(command)
    lines = output.split("\n")
    if len(lines) < 4:
        logging.info("Create new bucket: %s" % name)
        return True
    response = json.loads(lines[3])
    if 'errors' in response:
        logging.error(response)
        return False
    else:
        logging.info("Create new bucket: %s" % name)
        return True
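A minimal invocation sketch for createDb (not part of the original project); the bucket name, credentials, and sizing below are hypothetical and assume a Couchbase node on localhost:8091:

# Hypothetical values; adjust user/passwd/ram/replica to the target cluster.
if createDb("products", user="Administrator", passwd="password", ram=256, replica=1):
    logging.info("Bucket 'products' is ready")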
Code Example #3
def generateSpider(spider_name, over_write):
    logging.info('generateSpider')
    commands = []
    commands.append(config.PYTHON)
    commands.append('generator/runner.py')
    commands.append('generator/generate.py')
    commands.append('--mongo_server')
    commands.append(config.MONGO_SERVER)
    commands.append('--spider_name')
    commands.append(spider_name)
    commands.append('--spider_template')
    commands.append('generator/spider_template.django.py')
    commands.append('--output_spider_py')
    commands.append('scraper/spiders/' +
                    getFileNameFromSpiderName(spider_name))
    commands.append('--over_write')
    commands.append(str(over_write))
    commands.append('--storage_spider_template')
    commands.append('generator/storage_spider_template.django.py')
    commands.append('--output_storage_py')
    commands.append('scraper/storage_spiders/' +
                    getFileNameFromSpiderName(spider_name))
    logging.info(" ".join(commands))
    msg = execute(commands)
    return msg
Code Example #4
File: couchbase_util.py Project: dovanduy/choinho
def copy_couchdb_to_couchbase(fromDb, toDb, batch_size=10000):
    batch = {}
    cnt = 0
    for doc in couch_util.get_pager(fromDb):
        del doc['_rev']
        batch[doc['_id']] = doc
        if len(batch) > batch_size:
            try:
                toDb.upsert_multi(batch)
                cnt += len(batch)
            except TemporaryFailError:
                logging.warning(
                    "Connection timeout. Try to break and update batch")
                for key, value in batch.items():
                    toDb.upsert(key, value)
                    cnt += 1
            batch = {}
            logging.info("Copied %s docs" % cnt)
    if len(batch) > 0:
        try:
            toDb.upsert_multi(batch)
            cnt += len(batch)
        except Exception:
            logging.warning(
                "Connection timeout. Try to break and update batch")
            for key, value in batch.items():
                toDb.upsert(key, value)
                cnt += 1
        logging.info("Copied %s docs" % cnt)
    print "Done"
Code Example #5
File: extension.py Project: dovanduy/choinho
 def close_all_connect(self):
     #         if self.collection_spider:
     #             self.collection_spider.close()
     if self.kafka:
         self.kafka.stop()
         self.kafka.client.close()
     logging.info("Closed all connection.")
Code Example #6
File: couch_util.py Project: dovanduy/choinho
def delete(db, docid):
    try:
        db.delete(db[docid])
        logging.info("Doc %s is deleted!" % docid)
    except Exception:
        logging.warning("Can not delete doc %s in %s" % (docid, db.name))
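A short usage sketch for delete, assuming the python-couchdb client; the server URL, database name, and document id are placeholders:

import couchdb

db = couchdb.Server("http://localhost:5984")["products"]
delete(db, "some-doc-id")  # logs success, or a warning if the doc cannot be deleted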
Code Example #7
File: couch_util.py Project: dovanduy/choinho
def createOrMergeBatch(db, doc_batch):
    """ create new or merge with existing in batch.
    
    Input is a list of couchdb.Document objects.
    """
    assert type(doc_batch) == list, "Bad input %s" % type(doc_batch)

    # break down doc_batch if doc_batch too large
    try:
        responses = db.update(doc_batch)
    except Exception:
        logging.warning(
            "Error with doc batch of size %s. Try to break it down" %
            len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))
    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if not success:
            assert type(rev_or_exc) == ResourceConflict
            if docid == doc["_id"]: continue  #same doc, updated twice.
            logging.info("Merging doc %s with %s" % (doc["_id"], docid))
            newDoc = db[docid]
            if mergeDoc(newDoc, doc):
                db[docid] = newDoc
Code Example #8
File: couchbase_util.py Project: dovanduy/choinho
def deleteDocsByIds(db, docids):
    oks = db.remove_multi(docids, quiet=True)
    key_not_found = 0
    for docid in docids:
        if oks[docid].rc == 0xD:  # 0xD == LCB_KEY_ENOENT (key not found)
            key_not_found += 1
            logging.warning("Not found key %s to delete" % docid)
    logging.info("Deleted %d docs" % (len(docids) - key_not_found))
Code Example #9
def checkJsonGreater(url, xpath, expectValue):
    logging.info("checkJsonGreater %s with %s" % (expectValue, url))
    value = xtractFromUrl(url, xpath)
    if value is None: return False
    try:
        return int(value) > expectValue
    except (TypeError, ValueError):
        logging.warning('%s is not int', value)
        return False
Code Example #10
def checkContentContain(url, expectContent):
    logging.info("checkContentContain %s  with %s" % (expectContent, url))
    try:
        response = requests.get(url)
        content = response.text.lower()
        return content.find(expectContent.lower()) >= 0
    except Exception:
        logging.error('Cannot load content from %s', url)
        return False
Code Example #11
 def get_requests(self, url):
     try:
         req = requests.get(url,
                            headers=HDR,
                            allow_redirects=False,
                            timeout=60)
         return req
     except Exception as e:
         logging.exception(e)
         logging.info("Get content from %s null", url)
         return None
Code Example #12
def get_requests(url):
    HDR = {
        'User-Agent':
        'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'
    }
    try:
        req = requests.get(url, headers=HDR, allow_redirects=False, timeout=20)
        return req
    except Exception as e:
        logging.exception(e)
        logging.info("Get content from %s null", url)
        return None
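A brief usage sketch; the URL is a placeholder and the status-code check is an assumption about how callers treat the returned response:

req = get_requests("http://example.com/product/123")
if req is not None and req.status_code == 200:
    html = req.text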
Code Example #13
def tailLog(spider_name, number_lines, files=None):
    logging.info('tailLog')
    if files:
        logFile = config.LOG_DIR + "/" + spider_name + '_from_' + files + '.log'
    else:
        logFile = config.LOG_DIR + "/logs_crawl/" + spider_name + '.log'
    commands = []
    commands.append('tail')
    commands.append('-' + str(number_lines))
    commands.append(logFile)
    msg = execute(commands)
    return msg
Code Example #14
    def parse_item(self, response, url=None):
        origin_url = None
        if url is None:
            if self.from_url_file is None:
                self.parse(response)
            hxs = Selector(response)
            expired = self.setExpiredItemsBaseOnStatus(response.url,
                                                       response.status)
            origin_url = response.url
            url = self.add_tracking_code(response.url)
        else:
            origin_url = self.remove_tracking_code(url)
            req = self.get_requests(origin_url)
            if req:
                expired = self.setExpiredItemsBaseOnStatus(
                    origin_url, req.status_code)
                if not expired:
                    hxs = Selector(text=req.content)
                else:
                    return None
            else:
                return None
        if not expired:
            logging.info('======================>EXTRACT %s', origin_url)
            item = Product()
            item['source'] = self.name
            item['origin_url'] = origin_url
            item['url'] = self.add_tracking_code(origin_url)
            item['timestamp'] = time.time()

            for prop, xpath in self.xpath_dict.items():
                if xpath.strip():
                    extracted = hxs.xpath(xpath).extract()
                    try:
                        item[prop] = cleanText(extracted)
                    except KeyError:  # field not declared on the Product item
                        continue
                    if not extracted:
                        del item[prop]
                    if prop == "description":
                        item["description"] = hxs.xpath(
                            xpath + "/node()[not(self::script)]").extract()
                    if prop == "property":
                        item['property'] = hxs.xpath(
                            xpath + "/node()[not(self::script)]").extract()
            item = test_data.process_data(item)
            item = self.check_item(item)
            if item is not None and self.isValid(item):
                return item
            else:
                return None
        elif url is not None:
            return None
Code Example #15
def delete(spider_name, _type):
    if _type == "dir":
        commands = 'rm -rf ' + config.LOG_DIR + 'crawls/' + spider_name
    elif _type == "py":
        commands = 'rm -rf ' + config.CRAWL_DIR + 'scraper/spiders/' + getFileNameFromSpiderName(
            spider_name)
    elif _type == "all":
        delete_spider_mongo(spider_name)
        commands = 'rm -rf ' + config.LOG_DIR + 'crawls/' + spider_name
        commands += ' && rm -rf ' + config.CRAWL_DIR + 'scraper/spiders/' + getFileNameFromSpiderName(
            spider_name)
    logging.info(" ".join(commands))
    msg = os.system(commands)
    return msg
Code Example #16
File: couch_util.py Project: dovanduy/choinho
def deleteDocsByIds(db, docids):
    docBatch = []
    for doc in getDocsByIds(db, docids):
        if doc:
            doc["_deleted"] = True
            docBatch.append(doc)
    responses = db.update(docBatch)

    deleteCount = 0
    for (success, docid, _) in responses:
        if not success:
            logging.info("Failed to delete doc %s", docid)
        else:
            deleteCount += 1
    logging.info("Deleted %d docs", deleteCount)
Code Example #17
 def parse_item_and_links(self, response):
     item = self.parse_item(response)
     if item:
         logging.info("Link item: %s", response.url)
         yield item
     else:
         logging.info("Not link item: %s", response.url)
     for rule in self.rules:
         if not rule.link_extractor:
             continue
         links = rule.link_extractor.extract_links(response)
         for link in links:
             if link.url.startswith("http"):
                 yield self.request(link.url,
                                    callback=self.parse_item_and_links)
Code Example #18
File: couchbase_util.py Project: dovanduy/choinho
def copy_couchbase_to_kafka(fromDb,
                            kafka_producer,
                            kafka_topic,
                            batch_size=1000):
    batch = []
    for doc in get_pager(fromDb):
        batch.append(doc)
        if len(batch) > batch_size:
            kafka_producer.send_messages(kafka_topic,
                                         *[json.dumps(msg) for msg in batch])
            batch = []
    if len(batch) > 0:
        kafka_producer.send_messages(kafka_topic,
                                     *[json.dumps(msg) for msg in batch])
    logging.info("Saving Couchbase Bucket %s -> Kafka %s. Done!" %
                 (fromDb.bucket, kafka_topic))
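A hedged wiring sketch for copy_couchbase_to_kafka, assuming the legacy kafka-python SimpleProducer interface that send_messages implies and the Couchbase SDK 2.x Bucket; the broker address, bucket, and topic names are placeholders:

from kafka import KafkaClient, SimpleProducer
from couchbase.bucket import Bucket

client = KafkaClient("localhost:9092")
producer = SimpleProducer(client)
source_bucket = Bucket("couchbase://localhost/products")
copy_couchbase_to_kafka(source_bucket, producer, "product-docs", batch_size=500)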
Code Example #19
    def test_process(self):
        try:
            res = HttpRequest(self.status, self.http_method, self.url,
                              self.params).http_request()
        except Exception as e:
            test_result = 'Error'
            logging.exception(e)
            print("{0},Error msg:{1}".format(test_result, e))
            raise e

        else:
            Actual_Result = res.json()
            # self.getlogging.get_logging("INFO", "response body: {}".format(ActualResult))
            # print(ActualResult)
        try:
            for key in self.ExpectedResult.keys():  # iterate over the keys of the expected result
                # print(key)
                # check that the corresponding expected and actual values are equal
                self.assertTrue(self.ExpectedResult[key] == Actual_Result[key])
            test_result = 'Pass'
            logging.info(test_result)
        # if self.ExpectedResult["reason"] ==ActualResult['reason'] and
        # self.ExpectedResult['error_code']==ActualResult['error_code']:
        except Exception as e:
            test_result = 'Fail'
            # log the failure reason: for the same key, the expected and actual values differ
            logging.info(
                "{0},ExpectedResult is not equal to ActualResult, wrong msg({1}):[{2}]!=[{3}]"
                .format(test_result, key, self.ExpectedResult[key],
                        Actual_Result[key]))
            print(
                "Test Fail,ExpectedResult is not equal to ActualResult, wrong msg({0}):[{1}]!=[{2}]"
                .format(key, self.ExpectedResult[key], Actual_Result[key]))
            raise e
        finally:
            # self.getlogging.get_logging("INFO","test fail")
            # test_result = 'Fail'
            # print("Test Fail")
            # print("test_result:{}".format(test_result))
            # Write the result back to the Excel file; the actual result is JSON, so it is
            # serialized to str (ensure_ascii=False keeps non-ASCII output readable).
            ManageExcel().write_back(
                self.sheet_name, self.case_id + 1, test_result,
                json.dumps(Actual_Result, ensure_ascii=False))
Code Example #20
def getUrls(source, sincedays, beforedays, missing, start=0, limit=1000):
    logging.info("Get data...")
    url_query = source_api
    if source is not None:
        url_query += "&source=" + source
    if sincedays is not None:
        url_query += "&sincedays=" + str(sincedays)
    if beforedays is not None:
        url_query += "&beforedays=" + str(beforedays)
    if missing is not None:
        url_query += "&missing=" + missing
    urls = []
    while True:
        # build the paged URL locally so url_query is not mutated on every iteration
        page_url = url_query + '&start=' + str(start) + '&limit=' + str(limit)
        urls.extend(json.loads(requests.get(page_url).text)['hits'])
        start += limit
        if start >= DEFAULT_LIMIT:
            break
    logging.info("Done!")
    return urls
Code Example #21
File: util_crawler.py Project: dovanduy/choinho
def getAllDocIdsBySource(es_server, source, limit=10000):
    base_url = es_server + "?q=source:%s&_source=false" % source
    start = 0
    docids = []
    cnt = 0
    while True:
        url = base_url + "&from=%d&size=%d" % (start, limit)
        data = json.loads(requests.get(url).text)
        if 'hits' in data:
            for doc in data['hits']['hits']:
                docids.append(doc['_id'])
                cnt += 1
            logging.info("Get %d docids", cnt)
            if len(data['hits']['hits']) < limit:
                break
        else:
            break
        start += limit
    return docids
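A short usage sketch for getAllDocIdsBySource; the Elasticsearch search endpoint and source name are placeholders:

es_search = "http://localhost:9200/products/_search"
docids = getAllDocIdsBySource(es_search, "example_shop", limit=5000)
logging.info("Collected %d docids", len(docids))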
Code Example #22
def cache_images(image_urls):
    images_not_cached, images_cached = download_images(image_urls)
    cnt = 0
    images = {}
    images['base_url'] = config.BASE_URL
    images['images'] = []
    images_id = {}
    for image in images_not_cached:
        image_cached = {}
        width, height = image['image'].size
        if width < MIN_WIDTH or height < MIN_HEIGHT:
            continue
        try:
            image_full, path_full = create_path_image(image['image'],
                                                      image['name'])
        except:
            continue
        save_to_s3(image_full[1], path_full)
        images_id[image['name']] = image['url']
        image_cached['image_id'] = image['name']
        image_cached['thumbs_type'] = []
        for thum_id, size in IMAGES_THUMBS.iteritems():
            thum_img, path = create_path_image(image['image'], image['name'],
                                               thum_id)
            save_to_s3(thum_img[1], path)
            image_cached['thumbs_type'].append(thum_id)

        if image_cached:
            images['images'].append(image_cached)
        if images_id:
            r.mset(images_id)
        cnt += 1

    logging.info("Cached %s images" % (cnt))

    for image_id in images_cached:
        image_cached = {}
        image_cached['image_id'] = image_id
        image_cached['thumbs_type'] = ['big', 'normal', 'small']
        images['images'].append(image_cached)
    return images
Code Example #23
def startSpider(spider_name, crawler_server):
    if spider_name is not None:
        if spider_name in spiders_running:
            logging.error("Spider name \"%s\" running", spider_name)
            spider = collection.find_one({'doc.spider': spider_name})
            logging.info("Info spider: %s", spider['crawler_status'])
            return crawler_server
        try:
            requests.get('http://' + crawler_server['name_server'] +
                         '.localhost:' + SERVER_PORT +
                         '/crawler/startcrawl?spider=' + spider_name)
            crawler_server = set_free_thread(crawler_server)
            logging.info("Start spider \"%s\" successful at \"%s\"!",
                         spider_name, crawler_server['name_server'])
        except (Timeout, ConnectionError):
            crawler_server['status'] = False
            logging.error("Start spider \"%s\" failed at \"%s\"!", spider_name,
                          crawler_server['name_server'])
    else:
        logging.error("Spider is None!")
    return crawler_server
Code Example #24
File: couch_util.py Project: dovanduy/choinho
def mergeDoc(existing_doc, new_doc):
    """ existing_doc is merged with new_doc. 
    
    Returns true/false if existing_doc is modified.
    """
    records = existing_doc.setdefault("records", [])
    if 'records' not in new_doc:
        return False
    isModified = False
    for new_record in new_doc['records']:
        if new_record not in records:
            records.append(new_record)
            isModified = True
    logging.info("# merged records %d " % len(records))

    # Merge images.
    images = existing_doc.setdefault("images", [])
    for new_image in new_doc.get('images', []):  # guard against docs without images
        if new_image not in images:
            images.append(new_image)
            isModified = True
    logging.info("# merged images %d " % len(images))

    # Merge sources.
    sources = existing_doc.setdefault("sources", [])
    for new_source in new_doc.get('source', []):  # guard against docs without a source list
        if new_source not in sources:
            sources.append(new_source)
            isModified = True
    logging.info("# merged sources %d " % len(sources))

    return isModified
Code Example #25
def buildListCrawlSpiders():
    validSpiders = getAllValidSpiders()
    list_spiders = []
    now = time.time()
    spider = {}
    logging.info("=====>Please wait, we're building spider list...")
    try:
        for spi in validSpiders:
            try:
                spider['name'] = spi['doc']['spider']
                crawled_time = (float(now) -
                                spi['crawler_status']['last_stop_time']) / 3600
                spider['score'] = calculateSpiderScore(
                    spi['crawler_status'].get('priority', 1), crawled_time,
                    spi['crawler_status'].get('items', 1000),
                    spi['crawler_status']['status'])
                list_spiders.append(spider)
                spider = {}
            except KeyError:
                continue
    except (NetworkTimeout, ServerSelectionTimeoutError):
        sleep(60)
    return list_spiders  # startServiceMaster expects the assembled list back
Code Example #26
def startCrawl(spider_name):
    logging.info('startCrawl')
    logFile = config.LOG_DIR + 'logs_crawl/' + spider_name + '.log'
    jobdir_crawl = config.LOG_DIR + 'crawls/' + spider_name
    #kill process
    commands = 'ps -ef | grep python | grep ' + spider_name + '  | grep -v grep | awk \'{print $2}\' | xargs kill -2'
    logging.info(" ".join(commands))
    os.system(commands)
    commands = 'scrapy crawl ' + spider_name + ' -s JOBDIR=' + jobdir_crawl + ' &> ' + logFile + ' &'
    logging.info(" ".join(commands))
    os.system(commands)

    return '/crawler/taillog?spider=' + spider_name
Code Example #27
File: util_crawler.py Project: dovanduy/choinho
def process_source_docs(source, command_type, batch_size=1000):
    docids = getAllDocIdsBySource(ES_SERVER, source)
    batch = []
    cnt = 0
    for docid in docids:
        batch.append(docid)
        cnt += 1
        if batch_size <= len(batch):
            response = send_command(COMMAND_API, command_type, batch)
            if response.status_code != 200:
                raise Exception("Error when execute command %s", command_type)
            logging.info("Sent to %s %d docs", command_type, cnt)
            batch = []
            time.sleep(5)
    if len(batch) > 0:
        send_command(COMMAND_API, command_type, batch)
        logging.info("Sent to %s %d docs", command_type, cnt)
    logging.info("Done %s %d docs from source %s", command_type, cnt, source)
Code Example #28
def startServiceMaster():
    logging.info("Start master-service!")
    list_crawler_servers = getCrawlerServers()
    list_spiders = buildListCrawlSpiders()
    count_start = 0
    if list_spiders is not None and len(list_spiders) > 0:
        for spider in list_spiders:
            crawler_server, list_crawler_servers = chooseCrawlerServer(
                list_crawler_servers)
            if crawler_server is not None:
                crawler_server = startSpider(spider['name'], crawler_server)
                list_crawler_servers = updateCrawlerServers(
                    crawler_server, list_crawler_servers)
                count_start += 1
            else:
                logging.info("=======> All crawler servers were full!")
                break
        logging.info("Started %d spiders." % (count_start))
    else:
        logging.error("No result from MongoDb!")
Code Example #29
File: extension.py Project: dovanduy/choinho
 def remove_dir_crawl(self, spider_name):
     commands = 'rm -rf ' + config.LOG_DIR + 'crawls/' + spider_name
     logging.info(" ".join(commands))
     os.system(commands)
Code Example #30
 def check(self, value):
     self.cnt = self.cnt + 1
     if self.cnt % value == 0:
         logging.info('Processed %s', self.cnt)
Code Example #31
    def __pruning_bfs(self, lambd, max_neighbors, queue_size):
        """
        剪枝广度优先搜素
        :param lambd:
        :param max_neighbors:
        :param queue_size:
        :return:
        """

        # holds the paths found by the search
        results = []
        # start node
        start = (self.start + self.sep + self.start, 0)
        # end node
        stop = (self.stop + self.sep + self.stop, 0)

        queue = Queue.Queue(queue_size)
        # enqueue the start node as a single-element path
        queue.put([start])

        while not queue.empty():

            # dequeue the next partial path
            phrase = queue.get()

            # last node of the current phrase
            node = phrase[len(phrase) - 1]

            if stop == node:
                # reached the end node
                if len(phrase) >= 8:
                    # only keep sentences of at least 8 words
                    results.append(phrase)
                continue

            # render the current phrase as a string
            str_phrase = ''
            for nodeflag, num in phrase:
                str_phrase += nodeflag.split(self.sep)[0] + ' '

            logging.info('results size[%d] queue size[%d] phrase[%s]', len(results), queue.qsize(), str_phrase)

            # adjacent successor nodes of the current node
            pos_neighbors = self.graph.neighbors(node)
            # combined score of each successor (path score plus language-model score)
            neighbor_weight = {}
            # process each successor node in turn
            for pos_neighbor in pos_neighbors:
                # weight of the edge between the two nodes
                edge_weight = self.graph.get_edge_data(node, pos_neighbor)['weight']
                if edge_weight == 0:
                    continue

                # language-model (fluency) score of the sentence extended with this node
                fluency_weight = self.grammar_scorer.cal_fluency(str_phrase + pos_neighbor[0].split(self.sep)[0])

                # combined score: inverse edge weight plus length-normalized fluency
                general_score = 1 / edge_weight + lambd * fluency_weight / (len(re.split(r'\s+', str_phrase)) + 1)

                logging.info("lambd[%f] general score[%f] edge weight[%f] fluency weight[%f]", lambd, general_score, edge_weight, fluency_weight)

                # record the combined score of this successor
                neighbor_weight[pos_neighbor] = general_score

            # sort successors by combined score, descending (a heap could speed this up)
            sort_neighbor_weight = sorted(neighbor_weight.iteritems(), key=lambda item: item[1], reverse=True)

            # 选择指定数目的结点如队列
            for i in range(min(max_neighbors, len(sort_neighbor_weight))):

                if queue.full():
                    break

                # 综合得分最高的max_neighbors个邻接后继结点如队列
                new_phrase = phrase + [sort_neighbor_weight[i][0]]
                queue.put(new_phrase)

        return results