Example #1
0
    def __init__(self, profile):
        """Store the credentials from ``profile`` and start the first token.

        ``profile`` must provide ``"secret_access_key"`` and ``"access_id"``
        (a missing key raises ``KeyError``).  A non-empty ``"bucket"`` entry
        is optional; without it the bucket name falls back to ``"cmdp"``.

        Side effects: the credentials are also written to the shared
        ``config`` object as ``ACCESS_KEY`` / ``SECRET_KEY``, and
        ``self.init_tokent()`` is called once the expiry has been set.
        """
        self.secret_access_key = profile["secret_access_key"]
        self.access_id = profile["access_id"]

        # config is shared state; both values go through safestr() first.
        config.ACCESS_KEY = safestr(self.access_id)
        config.SECRET_KEY = safestr(self.secret_access_key)

        if "bucket" in profile and profile["bucket"]:
            self.bucket = profile["bucket"]
        else:
            self.bucket = "cmdp"

        # NOTE(review): presumably now() is a timestamp in seconds, which
        # would make this a one-hour lifetime -- confirm against now().
        self.token_expires = 3600 + now()
        self.init_tokent()
Example #2
0
    def __init__(self, profile):
        """Initialise the instance from a credentials ``profile``.

        Reads ``profile["secret_access_key"]`` and ``profile["access_id"]``,
        mirrors both (via ``safestr``) onto the shared ``config`` object,
        picks the bucket (``profile["bucket"]`` when present and non-empty,
        otherwise ``"cmdp"``), sets ``token_expires`` and finally calls
        ``self.init_tokent()``.
        """
        self.secret_access_key = profile["secret_access_key"]
        self.access_id = profile["access_id"]

        # Publish the credentials on the shared config object.
        config.ACCESS_KEY = safestr(self.access_id)
        config.SECRET_KEY = safestr(self.secret_access_key)

        # An absent or empty bucket entry selects the default bucket name.
        has_bucket = "bucket" in profile and profile["bucket"]
        self.bucket = profile["bucket"] if has_bucket else "cmdp"

        # NOTE(review): presumably now() returns seconds, i.e. the token is
        # considered valid for one hour -- confirm against now().
        self.token_expires = 3600 + now()
        self.init_tokent()
Example #3
0
def process_kaice(db, insert_data, data):
    """Prepare a "kaice" record for insertion, or reject it.

    Args:
        db: object whose ``get_one(where=...)`` looks up an existing record.
        insert_data: mapping that is about to be stored; it is modified in
            place (``insert_time`` is always set, ``test_date`` may be
            rewritten).
        data: parsed item; ``data["kaice_type"]`` is expected to expose a
            ``.value`` attribute when it is truthy.

    Returns:
        ``insert_data`` when the record should be stored, otherwise ``None``
        (a record with the same guid already exists, the item is not a web
        game, or it has no usable ``game_name``).
    """
    insert_data["insert_time"] = time.strftime("%Y-%m-%d %H:%M:%S")

    # Skip items that are already stored under the same guid.
    if db.get_one(where={"guid": insert_data["guid"]}):
        return None

    # Only entries whose type is the web-game label are kept.
    kaice_type = data["kaice_type"]
    if not (kaice_type and safestr(kaice_type.value) == "网页游戏"):
        return None

    if "test_date" in insert_data:
        # Replace the relative "today" marker with the current date.
        insert_data["test_date"] = safestr(insert_data["test_date"]).replace(
            "今日", time.strftime("%Y-%m-%d"))

    # A missing name may arrive as None, "" or the literal string "None".
    if insert_data["game_name"] in (None, "None", ""):
        return None

    return insert_data
Example #4
0
 def specialFilter(self):
     """Remove every match of each rule in ``self.filters`` from ``self.content``.

     In each rule the placeholder ``(*)`` is turned into the regex ``(.+)?``
     before the rule is compiled case-insensitively; matches are replaced
     with the empty string.  ``self.content`` is updated in place and
     nothing is returned.
     """
     if len(self.filters) == 0:
         return

     for raw_rule in self.filters:
         pattern = raw_rule.replace('(*)', '(.+)?')
         # Convert the pattern to the same string type as the content.
         convert = safeunicode if isinstance(self.content, unicode) else safestr
         pattern = convert(pattern)
         matcher = re.compile(pattern, re.I)
         self.content = matcher.sub("", self.content)
Example #5
0
    def getItemGUID(self, data):
        """Return the hex MD5 digest that identifies ``data``.

        ``self.guid_rule`` decides what is hashed:

        * a list of field ids -- every id is resolved through
          ``get_field_from_cache``; the matching entries of ``data`` are
          converted with ``safestr`` and concatenated in rule order.  An
          entry may be a plain string or an object carrying its text in
          ``.value``; unknown fields and empty entries are skipped.
        * a single string -- ``data[guid_rule]`` is hashed as it is.
        * anything else -- the digest of the empty string is returned.
        """
        guid_rule = self.guid_rule
        s = ""

        if isinstance(guid_rule, list):
            parts = []
            for field_id in guid_rule:
                field = get_field_from_cache(field_id)
                if not field:
                    continue

                field_name = field["name"]
                if not field_name:
                    continue

                item = data[field_name]
                if not item:
                    continue

                # Plain strings are tested first: the original condition
                # required a value to be both unicode AND str, which is never
                # true, so string fields never reached the hash.  Testing
                # them first also keeps `"value" in item` from being a
                # substring search on a string.
                if isinstance(item, (str, unicode)):
                    parts.append(safestr(item))
                elif "value" in item and item.value:
                    parts.append(safestr(item.value))
            s = "".join(parts)

        elif isinstance(guid_rule, (str, unicode)):
            # NOTE(review): unlike the list branch this value is hashed
            # without safestr() -- confirm whether that is intended.
            s = data[guid_rule]

        return md5(s).hexdigest()
Example #6
0
def process_kaifu(db, insert_data, data):
    """Prepare a "kaifu" record for insertion, or reject it.

    ``insert_data`` is modified in place: ``insert_time`` is set to the
    current local time and ``test_date`` is normalised to
    ``str(datetime.datetime(...))`` when one of the formats in ``date_rule``
    parses it; otherwise ``test_date`` is left as it was.

    Args:
        db: object whose ``get_one(where=...)`` looks up an existing record.
        insert_data: mapping about to be stored; must contain ``guid``,
            ``test_date`` and ``game_name``.
        data: unused here; kept for signature compatibility.

    Returns:
        ``insert_data``, or ``None`` when a record with the same guid already
        exists or ``game_name`` is missing (``None``, ``""`` or ``"None"``).
    """
    insert_data["insert_time"] = time.strftime("%Y-%m-%d %H:%M:%S")

    # Skip items that are already stored under the same guid.
    if db.get_one(where={"guid": insert_data["guid"]}):
        return None

    today = datetime.datetime.today()

    # Replace the relative "today" marker with today's month and day.
    today_string = "%.2d月%.2d日" % (today.month, today.day)
    test_date = safestr(insert_data['test_date']).replace("今日", today_string)

    for rule in date_rule:
        try:
            parsed = time.strptime(safestr(test_date), rule)
            year = parsed[0]
            # strptime reports 1900 when the format carries no year.
            if year == 1900:
                year = today.year
            # NOTE(review): only the hour is kept; minutes and seconds of
            # the parsed time are dropped -- confirm this is intended.
            normalized = datetime.datetime(year, parsed[1], parsed[2], parsed[3])
        except (ValueError, TypeError):
            # This format does not fit (or gives an impossible date); try
            # the next one.  Narrower than the former bare except, so
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

        insert_data["test_date"] = str(normalized)
        break

    # A missing name may arrive as None, "" or the literal string "None".
    if insert_data["game_name"] in (None, "None", ""):
        return None

    return insert_data
Example #7
0
    def fetchListPages(self, listtype="html"):
        """Fetch every list URL and hand the page to the matching parser.

        The URLs come from ``self.listRule.getListUrls()``.  Each one is
        fetched with the charset and timeout stored in ``self.seed``; pages
        whose ``Fetch`` reports not ready are skipped.

        Args:
            listtype: ``"html"`` dispatches to ``self.parseListPage``,
                ``"json"`` to ``self.parseJsonPage``.  With any other value
                the page is still fetched and read but not parsed.

        Progress is reported on standard output.
        """
        # Single-argument print(...) produces the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Start to fetch and parse List")

        for url in self.listRule.getListUrls():
            # The two spaces after the first colon reproduce the separator
            # that the former comma-separated print statement emitted.
            print("Fetching list page:  %s charset: %s timeout: %s" % (
                url,
                safestr(self.seed["charset"]),
                safestr(self.seed["timeout"]),
            ))

            f = Fetch(url, charset=self.seed["charset"], timeout=self.seed["timeout"])
            if not f.isReady():
                continue

            doc = f.read()
            if listtype == "html":
                self.parseListPage(f, doc, url)
            elif listtype == "json":
                self.parseJsonPage(f, doc, url)

        print("List has finished parsing. It has %s docs." % ansicolor.red(len(self)))
Example #8
0
def getElementData(obj, rule, images=None, fetch_all=0):
    """
    Extract a value from ``obj`` according to ``rule``.

    ``obj`` may be an existing ``pq`` object or raw HTML (which is wrapped
    with ``pq`` first).  Image links found while evaluating the rule are
    appended to ``images`` when a list is supplied.

    A rule is written in one of two modes:

    1. DOM selector -- dot-separated, the first segment selects elements
       and the remaining segments are actions applied to the selection:
        1.1 the selector resembles jQuery, e.g. the URL of a link
            >> a.attr("href")
        1.2 the text content of a tag
            >> div[id="content"].text()
        1.3 the content of one element of the selection
            >> li.eq(1).text()    # text of the 2nd matched li
    2. Regex -- mark the wanted part with ``[arg]``; the rest may be
       padded with ``(*)``.  A rule containing ``[arg]`` is always treated
       as a regex, even when it contains dots (for example inside a URL).

    With ``fetch_all=1`` an ``attr`` action returns a list holding the
    attribute of every selected element instead of a single value.

    Returns the extracted value (text, attribute value, list of attribute
    values, or the element picked by a final ``eq``), or ``None`` when the
    rule yields nothing.
    """
    if not isinstance(obj, pq):
        obj = pq(obj)

    segments = rule.split(".")

    # Dots only mean "selector.action" when the rule is not a regex.
    if len(segments) > 1 and "[arg]" not in rule:
        return _extract_by_selector(obj, segments, images, fetch_all)

    # Everything else is matched as a regex against the whole rule.  The
    # former check `rule.find('[arg]')` was falsy exactly when the rule
    # started with "[arg]", and regex rules containing a dot reached
    # neither branch and always produced None.
    return _extract_by_regex(obj, rule)


def _extract_by_selector(obj, segments, images, fetch_all):
    """Apply a DOM rule: ``segments[0]`` selects, the rest are actions."""
    # The first segment is always the DOM selector; parentheses are dropped.
    selector = segments[0].replace("(", "").replace(")", "")
    actions = segments[1:]
    selected = obj.find(selector)

    for action_text in actions:
        m = attrParrent.match(action_text)
        if not m:
            continue

        action, v = m.groups()
        if v:
            v = v.encode("utf-8")
            # Strip the quotes around the argument.
            v = v.strip("'").strip('"')

        if action == "attr" and hasattr(selected, "attr") and v:
            if fetch_all == 1:
                values = []
                for index in range(len(selected)):
                    vv = selected.eq(index).attr(v)
                    if vv:
                        values.append(vv)
                        # images defaults to None; only collect when the
                        # caller actually passed a list.
                        if images is not None and is_image(vv):
                            images.append(vv)
                return values

            value = selected.attr(v)
            if selected and selected[0].tag == "img" and v == "src" and images is not None:
                images.append(value)
            return value

        elif action == "eq" and hasattr(selected, "eq"):
            if len(actions) > 1:
                # More actions follow: narrow the selection and continue.
                selected = selected.eq(int(v))
                # "eq(1) a" -- anything after the first space is a selector
                # for descendants of the picked element.
                descendant = action_text.split(" ")[1:]
                if descendant:
                    selected = selected.find(" ".join(descendant))
            else:
                return selected.eq(int(v))

        elif action == "text" and hasattr(selected, "text"):
            return safeunicode(selected.text()).strip()

        elif action == "html" and hasattr(selected, "html"):
            return safeunicode(selected.html()).strip()

    return None


def _extract_by_regex(obj, rule):
    """Search the HTML of ``obj``, then its text, and return group 1."""
    content = obj.html()
    content_text = obj.text()

    pattern = rule.replace('[arg]', '(.+)?').replace('(*)', '.+?')

    # Convert the pattern to the same string type as the content.
    if isinstance(content, unicode):
        pattern = safeunicode(pattern)
    else:
        pattern = safestr(pattern)

    parrent = re.compile(pattern, re.MULTILINE | re.UNICODE)
    try:
        for haystack in (content, content_text):
            if haystack is None:
                continue
            result = parrent.search(haystack)
            if result is not None:
                return safeunicode(result.group(1)).strip()
    except (IndexError, TypeError, AttributeError, ValueError):
        # Best effort: a pattern without a capture group, an unsearchable
        # value or a failed conversion yields "no result", not a crash.
        return None

    return None