Exemple #1
0
 def add_good(self, goods_name, goods_content='', goods_num=1):
     """
     Append one goods entry to ``self.goods``.

     :param goods_name: name of the goods item
     :param goods_content: remark attached to the goods item
     :param goods_num: quantity of the goods item
     :return: None
     """
     item = Storage()
     # Attributes are set in the same order as before: num, name, content.
     for field, value in (('goods_num', goods_num),
                          ('goods_name', goods_name),
                          ('goods_content', goods_content)):
         setattr(item, field, value)
     self.goods.append(item)
Exemple #2
0
 def get_sessionid(self):
     """
     Log in to the marathon user service and return the session cookie dict.

     POSTs the account below to ``/marathon/user/login`` on
     ``self.host``:``self.port`` using ``self.timeout``.

     :return: ``{'sessionid': ...}`` read from the JSON response on HTTP 200,
         otherwise ``{"respcd": <HTTP status code>}``
     """
     # NOTE(review): credentials are hard-coded here and sent as URL query
     # parameters (``params=``) -- confirm this is what the service expects.
     credentials = Storage()
     credentials.username = 15201287981
     credentials.password = "******"
     login_url = 'http://{host}:{port}/marathon/user/login'.format(host=self.host, port=self.port)
     response = requests.post(login_url, params=credentials, timeout=self.timeout)
     if response.status_code == 200:
         payload = json.loads(response.text)
         return {'sessionid': payload['data']['sessionid']}
     return {"respcd": response.status_code}
Exemple #3
0
    def accepts(
        self,
        vars,
        session=None,
        formname='default',
        keepvalues=False,
        onvalidation=None,
        ):
        """
        Validate submitted variables against this form and report acceptance.

        :param vars: mapping of submitted variables; copied into
            ``self.request_vars``
        :param session: optional session mapping. When it is truthy, the value
            stored under ``'_formkey[<formname>]'`` must equal the submitted
            ``_formkey``. Whenever it is not None, a fresh uuid4 key is stored
            in it (and in ``self.formkey``) before returning.
        :param formname: name that the submitted ``_formname`` must equal
        :param keepvalues: when false, an accepted form is traversed a second
            time with ``False``
        :param onvalidation: optional callable invoked with the form once
            ``_traverse`` reports success
        :return: the final status; False when the key or name check failed or
            when ``self.errors`` is non-empty
        """
        self.errors.clear()
        self.request_vars = Storage()
        self.request_vars.update(vars)
        self.session = session
        self.formname = formname
        self.keepvalues = keepvalues

        # Start in accepting mode (status=True) and leave it when the
        # submitted form key or form name does not match this form.
        status = True
        if self.session and self.session.get('_formkey[%s]'
                 % self.formname, None) != self.request_vars._formkey:
            status = False
        if self.formname != self.request_vars._formname:
            status = False
        status = self._traverse(status)
        if status and onvalidation:
            onvalidation(self)
        if self.errors:
            status = False
        # Identity test against None: an empty but present session still
        # receives a fresh form key.
        if session is not None:
            self.formkey = session['_formkey[%s]' % formname] = \
                str(uuid.uuid4())
        if status and not keepvalues:
            # NOTE(review): presumably resets the stored field values after a
            # successful submission -- confirm against _traverse.
            self._traverse(False)
        return status
Exemple #4
0
    def load_tests(self):
        """
        Import every plugoo module in the configured test directory and
        register it in ``self.tests``.

        Each ``*.py`` file in ``self.config.main.testdir`` is imported by its
        module name (the file name without ``.py``). The directory is inserted
        at the front of ``sys.path`` if it is not already there. For each
        module a ``Storage`` with ``name``, ``desc`` and ``module`` is stored
        under the module name. When the module lacks ``__plugoo__`` or
        ``__desc__`` a warning is logged and the module name with an empty
        description is used instead.

        An import error from a plugin module is not caught here and
        propagates to the caller.
        """
        testdir = self.config.main.testdir
        pluginfiles = [fname[:-3] for fname in os.listdir(testdir)
                       if fname.endswith(".py")]
        for test_name in pluginfiles:
            test = Storage()
            if testdir not in sys.path:
                sys.path.insert(0, testdir)
            module = __import__(test_name)
            try:
                test.name = module.__plugoo__
                test.desc = module.__desc__
                test.module = module
            except Exception as e:
                # Soft failure: fall back to the file-derived name.
                self.logger.warning("Soft fail %s", e)
                test.name = test_name
                test.desc = ""
                test.module = module

            try:
                self.tests[test_name] = test
            except Exception as e:
                # The original message referenced an undefined ``name``; the
                # module name being registered is ``test_name``.
                print("Failed to load the test %s %s" % (test_name, e))
Exemple #5
0
	def __init__(self, environ, render=None):
		"""Parse the given WSGI environment to construct the request.

		Fills ``self.request`` (method, protocol, remote_ip, path, query,
		fullpath, version, headers, body, host, arguments) and
		``self.response`` (status, headers), then calls ``self.initialize()``.

		:param environ: WSGI environ mapping; ``REQUEST_METHOD`` and
			``wsgi.url_scheme`` are required, ``SERVER_NAME`` is required when
			``HTTP_HOST`` is absent or empty
		:param render: object stored as ``self._render``
		"""
		self.request = Storage()
		self.response = Storage()

		self._render = render
		self.request.method = environ["REQUEST_METHOD"]
		self.request.protocol = environ["wsgi.url_scheme"]
		self.request.remote_ip = environ.get("REMOTE_ADDR", "")
		# PATH_INFO may be missing from the environ; default to "" so the
		# fullpath concatenation below cannot fail on None.
		self.request.path = environ.get("PATH_INFO", "")
		query_string = environ.get('QUERY_STRING')
		if query_string:
			self.request.query = '?' + query_string
		else:
			self.request.query = ''
		self.request.fullpath = self.request.path + self.request.query
		self.request.version = "HTTP/1.1"
		self.request.headers = {}
		self.request.headers["Content-Type"] = environ.get("CONTENT_TYPE", "")
		self.request.headers["Content-Length"] = environ.get("CONTENT_LENGTH", "")
		# HTTP_FOO_BAR environ keys become "FOO-BAR" header names.
		for key in environ:
			if key.startswith("HTTP_"):
				self.request.headers[key[5:].replace("_", "-")] = environ[key]
		if self.request.headers.get("Content-Length"):
			self.request.body = environ["wsgi.input"].read(
				int(self.request.headers["Content-Length"]))
		else:
			self.request.body = ""
		if environ.get("HTTP_HOST"):
			self.request.host = environ["HTTP_HOST"]
		else:
			self.request.host = environ["SERVER_NAME"]

		self.response.status = "200 OK"
		# No default Content-Type header is set. Original author's note: if
		# Content-Type is set then Content-Length must be set as well (with
		# the length set, the type may be omitted); without Content-Length,
		# "[Errno 32] Broken pipe" errors tend to occur.
		self.response.headers = []
		self.request.arguments = self.rawinput()
		self.initialize()
Exemple #6
0
    def query(self, order_ids):
        """
        Query delivery orders.

        :param order_ids: list of order ids; any non-list value is rejected
        :return: the decoded JSON response on HTTP 200, otherwise
            ``{"respcd": <HTTP status code>}``
        :raises Exception: ``'parameters error'`` when ``order_ids`` is not a
            list
        """
        if not isinstance(order_ids, list):
            raise Exception('parameters error')

        params = Storage()
        params.orderid = json.dumps(order_ids)
        params.app_code = self.app_code
        # The signature is computed over the fields set so far.
        params.sign = self._gen_sign(params)

        query_url = 'http://{host}:{port}/marathon/disorder/query'.format(host=self.host, port=self.port)
        response = requests.get(query_url, params=params, timeout=self.timeout)
        if response.status_code != 200:
            return {"respcd": response.status_code}
        result = json.loads(response.text)
        # The logger is imported only on the success path, as before.
        from ..qflogger import log
        log.info("delivery query, url: %s, param: %s, ret: %s" % (query_url, params, result))
        return result
Exemple #7
0
 def __init__(self, *components, **attributes):
     """
     Build the helper from its child components and tag attributes.

     :param components: child components; a single list or tuple argument is
         unpacked and used as the component list
     :param attributes: keyword attributes stored as ``self.attributes``
     :raises SyntaxError: when ``self.tag`` ends with ``'/'`` and components
         are given
     """
     if self.tag[-1:] == '/' and components:
         # Call form of raise: valid on both Python 2 and Python 3.
         raise SyntaxError('<%s> tags cannot have components' % self.tag)
     if len(components) == 1 and isinstance(components[0], (list,
             tuple)):
         self.components = list(components[0])
     else:
         self.components = list(components)
     self.attributes = attributes
     self._fixup()
     # converts special attributes in components attributes
     self._postprocessing()
     self.vars = Storage()
     self.errors = Storage()
     self.latest = Storage()
Exemple #8
0
    def update(self, order_id, **kwargs):
        """
        Modify an existing delivery order.

        :param order_id: id of the order to modify
        :param kwargs: fields to change; validated with
            ``self._check_params_valid`` against ``self.update_param_fields``.
            Entries whose value is falsy are skipped and not sent.
        :return: the decoded JSON response on HTTP 200, otherwise
            ``{"respcd": <HTTP status code>}``
        :raises Exception: ``'parameters error'`` when validation fails
        """
        if not self._check_params_valid(self.update_param_fields, kwargs):
            raise Exception('parameters error')

        data = Storage()
        data.orderid = order_id
        # .items() works on Python 2 and 3; iteritems() exists only on
        # Python 2.
        for key, value in kwargs.items():
            if value:
                data[key] = value
        data.app_code = self.app_code
        # The signature is computed over the fields set so far.
        data.sign = self._gen_sign(data)

        url = 'http://{host}:{port}/marathon/disorder/edit'.format(host=self.host, port=self.port)
        resp = requests.post(url, data=data, timeout=self.timeout)
        if resp.status_code != 200:
            return {"respcd": resp.status_code}
        return json.loads(resp.text)
def ssdut_news_list(page_raw):
    ''' parse the news_list page,
    get a list of news, the same squence as the page,

    result.soup
          .page_no
          .news_list
          .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text=u"下一页"
    if r:
        '''not the last page'''
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        ''' the last page'''
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile("\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append(
            {
                "link": a[0].get("href").encode("utf-8"),
                "title": a[0].text.encode("utf-8"),
                "source": a[1].text.encode("utf-8"),
                "source_link": a[1].get("href").encode("utf-8"),
                "date_str": date_str,
                "date": datetime.date(
                    *[int(n) for n in date_str.split("-")]),
                "no": counter,
            })
        counter += 1
        #logging.debug("source = %s, source_link = %s" %
        #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # tital news num
    # 共\d+ t条记录
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result
Exemple #10
0
 def forward(self, incoming):
     """
     Attach a fresh ``Storage`` as ``incoming.conn`` and store in its
     ``init_h`` the result of ``self.initLinearLayer`` applied to
     ``incoming.hidden.h_n``.
     """
     conn = Storage()
     incoming.conn = conn
     conn.init_h = self.initLinearLayer(incoming.hidden.h_n)
Exemple #11
0
class FORM(DIV):

    """
    A FORM is a container for INPUT, TEXTAREA, SELECT and other helpers.

    example:

    >>> form = FORM(INPUT(_name="test", requires=IS_NOT_EMPTY()))
    >>> form.xml()
    '<form action="" enctype="multipart/form-data" method="post"><input name="test" /></form>'

    NOTE(review): the default ``_enctype`` is currently commented out in
    ``_postprocessing`` and ``xml()`` appends a hidden-fields DIV, so the
    output shown above is probably stale -- confirm before relying on it.

    form has one important method:

        >>> form.accepts(request.vars, session)

    if form is accepted (and all validators pass) form.vars contains the
    accepted vars, otherwise form.errors contains the errors.
    in case of errors the form is modified to present the errors to the user.
    """

    tag = 'form'

    def __init__(self, *components, **attributes):
        """
        Build the form from its child components and tag attributes.

        :param components: child components; a single list or tuple argument
            is unpacked and used as the component list
        :param attributes: keyword attributes stored as ``self.attributes``
        :raises SyntaxError: when ``self.tag`` ends with ``'/'`` and
            components are given
        """
        if self.tag[-1:] == '/' and components:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise SyntaxError('<%s> tags cannot have components' % self.tag)
        if len(components) == 1 and isinstance(components[0], (list,
                tuple)):
            self.components = list(components[0])
        else:
            self.components = list(components)
        self.attributes = attributes
        self._fixup()
        # converts special attributes in components attributes
        self._postprocessing()
        self.vars = Storage()
        self.errors = Storage()
        self.latest = Storage()

    def accepts(
        self,
        vars,
        session=None,
        formname='default',
        keepvalues=False,
        onvalidation=None,
        ):
        """
        Validate submitted variables against this form and report acceptance.

        :param vars: mapping of submitted variables; copied into
            ``self.request_vars``
        :param session: optional session mapping. When it is truthy, the value
            stored under ``'_formkey[<formname>]'`` must equal the submitted
            ``_formkey``. Whenever it is not None, a fresh uuid4 key is stored
            in it (and in ``self.formkey``) before returning.
        :param formname: name that the submitted ``_formname`` must equal
        :param keepvalues: when false, an accepted form is traversed a second
            time with ``False``
        :param onvalidation: optional callable invoked with the form once
            ``_traverse`` reports success
        :return: the final status; False when the key or name check failed or
            when ``self.errors`` is non-empty
        """
        self.errors.clear()
        self.request_vars = Storage()
        self.request_vars.update(vars)
        self.session = session
        self.formname = formname
        self.keepvalues = keepvalues

        # if this tag is a form and we are in accepting mode (status=True)
        # check formname and formkey

        status = True
        if self.session and self.session.get('_formkey[%s]'
                 % self.formname, None) != self.request_vars._formkey:
            status = False
        if self.formname != self.request_vars._formname:
            status = False
        status = self._traverse(status)
        if status and onvalidation:
            onvalidation(self)
        if self.errors:
            status = False
        # Identity test against None: an empty but present session still
        # receives a fresh form key.
        if session is not None:
            self.formkey = session['_formkey[%s]' % formname] = \
                str(uuid.uuid4())
        if status and not keepvalues:
            self._traverse(False)
        return status

    def _postprocessing(self):
        """Default ``_action`` to '' and ``_method`` to 'post' when unset."""
        if '_action' not in self.attributes:
            self['_action'] = ''
        if '_method' not in self.attributes:
            self['_method'] = 'post'
        # A default '_enctype' of 'multipart/form-data' is intentionally not
        # applied here (it was disabled in the original code).

    def hidden_fields(self):
        """
        Return a DIV (class "hidden") holding hidden INPUTs for every entry
        of the ``hidden`` attribute, plus ``_formkey`` and ``_formname``
        when those are set on the form.
        """
        c = []
        if 'hidden' in self.attributes:
            for (key, value) in self.attributes.get('hidden',
                    {}).items():
                c.append(INPUT(_type='hidden', _name=key, _value=value))
        if hasattr(self, 'formkey') and self.formkey:
            c.append(INPUT(_type='hidden', _name='_formkey',
                     _value=self.formkey))
        if hasattr(self, 'formname') and self.formname:
            c.append(INPUT(_type='hidden', _name='_formname',
                     _value=self.formname))
        return DIV(c, _class="hidden")

    def xml(self):
        """
        Serialize a copy of this form with the hidden fields appended.

        The hidden fields are appended to a new FORM built from the same
        components and attributes rather than to this instance.
        """
        newform = FORM(*self.components, **self.attributes)
        newform.append(self.hidden_fields())
        return DIV.xml(newform)
Exemple #12
0
# coding:utf-8

if __name__ == '__main__':
    import argparse
    import time

    from utils import Storage

    parser = argparse.ArgumentParser(description='A seq2seq model')
    args = Storage()

    parser.add_argument(
        '--name',
        type=str,
        default=None,
        help=
        'The name of your model, used for tensorboard, etc. Default: runXXXXXX_XXXXXX (initialized by current time)'
    )
    parser.add_argument(
        '--restore',
        type=str,
        default=None,
        help=
        'Checkpoints name to load. "last" for last checkpoints, "best" for best checkpoints on dev. Attention: "last" and "best" wiil cause unexpected behaviour when run 2 models in the same dir at the same time. Default: None (don\'t load anything)'
    )
    parser.add_argument('--mode',
                        type=str,
                        default="train",
                        help='"train" or "test". Default: train')
    parser.add_argument('--dataset',
                        type=str,
Exemple #13
0
class HTTPRequest(object):
	"""Request/response pair built from a WSGI environ, with per-verb handler stubs.

	``self.request`` and ``self.response`` are ``Storage`` objects filled in
	by ``__init__``. ``self.response.headers`` is kept as a list of
	``(name, value)`` tuples by every method of this class.
	"""

	def __init__(self, environ, render=None):
		"""Parse the given WSGI environment to construct the request.

		Fills ``self.request`` (method, protocol, remote_ip, path, query,
		fullpath, version, headers, body, host, arguments) and
		``self.response`` (status, headers), then calls ``self.initialize()``.

		:param environ: WSGI environ mapping; ``REQUEST_METHOD`` and
			``wsgi.url_scheme`` are required, ``SERVER_NAME`` is required when
			``HTTP_HOST`` is absent or empty
		:param render: object whose attributes are used as templates by
			``render()``
		"""
		self.request = Storage()
		self.response = Storage()

		self._render = render
		self.request.method = environ["REQUEST_METHOD"]
		self.request.protocol = environ["wsgi.url_scheme"]
		self.request.remote_ip = environ.get("REMOTE_ADDR", "")
		# PATH_INFO may be missing from the environ; default to "" so the
		# fullpath concatenation below cannot fail on None.
		self.request.path = environ.get("PATH_INFO", "")
		query_string = environ.get('QUERY_STRING')
		if query_string:
			self.request.query = '?' + query_string
		else:
			self.request.query = ''
		self.request.fullpath = self.request.path + self.request.query
		self.request.version = "HTTP/1.1"
		self.request.headers = {}
		self.request.headers["Content-Type"] = environ.get("CONTENT_TYPE", "")
		self.request.headers["Content-Length"] = environ.get("CONTENT_LENGTH", "")
		# HTTP_FOO_BAR environ keys become "FOO-BAR" header names.
		for key in environ:
			if key.startswith("HTTP_"):
				self.request.headers[key[5:].replace("_", "-")] = environ[key]
		if self.request.headers.get("Content-Length"):
			self.request.body = environ["wsgi.input"].read(
				int(self.request.headers["Content-Length"]))
		else:
			self.request.body = ""
		if environ.get("HTTP_HOST"):
			self.request.host = environ["HTTP_HOST"]
		else:
			self.request.host = environ["SERVER_NAME"]

		self.response.status = "200 OK"
		# No default Content-Type header is set. Original author's note: if
		# Content-Type is set then Content-Length must be set as well (with
		# the length set, the type may be omitted); without Content-Length,
		# "[Errno 32] Broken pipe" errors tend to occur.
		self.response.headers = []
		self.request.arguments = self.rawinput()
		self.initialize()

	def initialize(self):
		"""Hook called at the end of ``__init__``; does nothing by default."""
		pass

	def supports_http_1_1(self):
		"""Returns True if this request supports HTTP/1.1 semantics"""
		return self.request.version == "HTTP/1.1"

	def rawinput(self, method="both"):
		"""Return the request arguments as a dict of value lists.

		:param method: ``"url"`` for query-string arguments only, ``"body"``
			for body arguments only, anything else for both merged (values of
			a key present in both are concatenated, URL values first)
		"""
		method = method.lower()
		urlparams = urlparse.parse_qs(self.request.query[1:])
		if method == "url":
			return urlparams
		bodyparams = self._parse_body_arguments()
		if method == "body":
			return bodyparams or {}
		if bodyparams:
			for k, v in bodyparams.items():
				if k in urlparams:
					urlparams[k] += v
				else:
					urlparams[k] = v
		return urlparams

	def _parse_body_arguments(self):
		"""Parse the request body according to its Content-Type header.

		For ``application/x-www-form-urlencoded`` the parsed arguments are
		returned. For ``multipart/form-data`` and ``application/octet-stream``
		the body is trimmed in place to the payload between the first
		``Content-Type`` line and the last line of the body,
		``self.request.size`` is set, and None is returned. Any other content
		type returns None.

		NOTE(review): the header is compared for exact equality, so values
		carrying parameters (``; charset=...``, ``; boundary=...``) match no
		branch -- confirm whether that is intended.
		"""
		content_type = self.request.headers['Content-Type']
		if content_type == "application/x-www-form-urlencoded":
			return urlparse.parse_qs(self.request.body)
		if content_type in ("multipart/form-data", "application/octet-stream"):
			body = self.request.body
			# Strip the redundant leading part: skip past the first
			# 'Content-Type' line of the payload, if there is one.
			startIndex = 0
			tmpIndex = body.find('Content-Type')
			if tmpIndex != -1:
				startIndex = body.find('\n', tmpIndex)+3

			# Strip the redundant trailing part: find the last newline at or
			# before index len(body)-2. The previous manual backwards scan
			# ran off the start of the string (IndexError or negative-index
			# wraparound) when no such newline existed.
			endIndex = body.rfind('\n', 0, len(body)-1)
			if endIndex == -1:
				# Nothing that looks like a trailing delimiter line: leave
				# the body untouched.
				return None
			# Keep only the real content.
			self.request.body = body[startIndex:endIndex-1]
			self.request.size = endIndex-startIndex-1
		return None

	def HEAD(self, *args, **kwargs):
		"""Default HEAD handler: returns ``NoMethod(self)``."""
		return NoMethod(self)

	def GET(self, *args, **kwargs):
		"""Default GET handler: returns ``NoMethod(self)``."""
		return NoMethod(self)

	def POST(self, *args, **kwargs):
		"""Default POST handler: returns ``NoMethod(self)``."""
		return NoMethod(self)

	def DELETE(self, *args, **kwargs):
		"""Default DELETE handler: returns ``NoMethod(self)``."""
		return NoMethod(self)

	def PATCH(self, *args, **kwargs):
		"""Default PATCH handler: returns ``NoMethod(self)``."""
		return NoMethod(self)

	def PUT(self, *args, **kwargs):
		"""Default PUT handler: returns ``NoMethod(self)``."""
		return NoMethod(self)

	def OPTIONS(self, *args, **kwargs):
		"""Default OPTIONS handler: returns ``NoMethod(self)``."""
		return NoMethod(self)

	def set_header(self, name, value):
		"""Set a response header, replacing the first existing one of that name."""
		self.clear_header(name)
		self.response.headers.append((name, value))

	def get_header(self, name):
		"""Return the value of the first response header called *name*, or None."""
		for x in self.response.headers:
			if x[0] == name:
				return x[1]
		return None

	def clear_header(self, name):
		"""Remove the first response header called *name*, if present."""
		for x in self.response.headers:
			if x[0] == name:
				self.response.headers.remove(x)
				break

	def cookies(self):
		"""Return the request cookies as a dict of name -> unquoted value.

		The result is parsed from the ``COOKIE`` request header on first use
		and cached on ``self.request._cookies``.
		"""
		if not hasattr(self.request, "_cookies"):
			if "COOKIE" in self.request.headers:
				c = self.request.headers['COOKIE']
				if '"' in c:
					# Quoted values: let the cookie parser handle them.
					cookie = Cookie.SimpleCookie()
					try:
						cookie.load(c)
						self.request._cookies = dict((k, urllib.unquote(v.value)) for k, v in cookie.items())
					except Exception:
						self.request._cookies = {}
				else:
					self.request._cookies = {}
					for key_value in c.split(';'):
						key_value = key_value.split('=', 1)
						if len(key_value) == 2:
							key, value = key_value
							self.request._cookies[key.strip()] = urllib.unquote(value.strip())
			else:
				self.request._cookies = {}
		return self.request._cookies

	def get_cookie(self, name, value=None):
		"""Return the request cookie *name*, or *value* when it is absent."""
		return self.cookies().get(name, value)

	def set_cookie(self, name, value, expires='', domain=None,
			secure=False, httponly=False, path="/"):
		"""Append a ``Set-Cookie`` response header.

		:param name: cookie name
		:param value: cookie value; sent URL-quoted
		:param expires: lifetime as an integer (or integer-like) value handed
			to the cookie morsel; negative values are clamped to -1000000000.
			``''`` or None (the default) sets no expiry attribute. The old
			code called ``int('')`` on the default and always raised
			ValueError.
		:param domain: optional cookie domain
		:param secure: when true, adds the ``secure`` attribute
		:param httponly: when true, appends ``; httponly``
		:param path: cookie path
		"""
		morsel = Cookie.Morsel()
		morsel.set(name, value, urllib.quote(value))
		if expires not in ('', None):
			expires = int(expires)
			if expires < 0:
				expires = -1000000000
			morsel['expires'] = expires
		morsel['path'] = path
		if domain:
			morsel['domain'] = domain
		if secure:
			morsel['secure'] = secure
		value = morsel.OutputString()
		if httponly:
			value += '; httponly'
		self.response.headers.append(("Set-Cookie", value))

	def clear_cookie(self, name, path="/", domain=None):
		"""Expire the cookie *name* by re-sending it empty with a past expiry."""
		self.set_cookie(name, value="", expires=-1, path=path, domain=domain)

	def render(self, filename, **kw):
		"""Render the template *filename* of ``self._render`` with ``kw``.

		Replaces all response headers with a single HTML Content-Type header.
		"""
		self.response.headers = [("Content-Type","text/html; charset=UTF-8")]
		return getattr(self._render, filename)(**kw)

	def redirect(self, url):
		"""Turn the response into a ``303 See Other`` redirect to *url*.

		*url* is joined onto the request path; for URLs starting with '/' the
		request host is prepended. The response headers are replaced by a
		Content-Type and a Location header.
		"""
		newloc = urlparse.urljoin(self.request.path, url)
		if url.startswith('/'):
			# NOTE(review): no scheme is prepended, so Location is
			# "host/path" -- confirm whether a caller adds the scheme.
			newloc = self.request.host + newloc

		self.response.status = "303 See Other"
		# Keep the same list-of-tuples shape used everywhere else in this
		# class; a dict here broke set_header/get_header/clear_header.
		self.response.headers = [('Content-Type', 'text/html'), ('Location', newloc)]

	def cleanup(self):
		"""Clear both the request and the response storage."""
		self.request.clear()
		self.response.clear()
Exemple #14
0
    def create(self, order_id, regionid, customer_address, customer_name, customer_mobile,
               mchnt_address, mchnt_name, mchnt_mobile, order_time, limit_time, order_type=1):
        """
        Create a delivery order.

        The goods previously collected in ``self.goods`` are sent JSON-encoded
        as ``goods_info``.

        :param order_id: order id
        :param order_type: order type, 1: normal order, 2: internal-test
            order; optional (defaults to 1)
        :param regionid: business-district id
        :param customer_address: customer address
        :param customer_name: customer name
        :param customer_mobile: customer mobile number
        :param mchnt_address: merchant address
        :param mchnt_name: merchant name
        :param mchnt_mobile: merchant phone number
        :param order_time: time the order was placed
        :param limit_time: delivery deadline
        :return: the decoded JSON response on HTTP 200, otherwise
            ``{"respcd": <HTTP status code>}``
        """
        # Field order is the same as the original attribute assignments.
        fields = (
            ('orderid', order_id),
            ('type', order_type),
            ('regionid', regionid),
            ('customer_address', customer_address),
            ('customer_name', customer_name),
            ('customer_mobile', customer_mobile),
            ('mchnt_address', mchnt_address),
            ('mchnt_name', mchnt_name),
            ('mchnt_mobile', mchnt_mobile),
            ('order_time', order_time),
            ('limit_time', limit_time),
            ('goods_info', json.dumps(self.goods)),
            ('app_code', self.app_code),
        )
        payload = Storage()
        for field, value in fields:
            setattr(payload, field, value)
        # The signature is computed over every field set above.
        payload.sign = self._gen_sign(payload)

        create_url = 'http://{host}:{port}/marathon/disorder/create'.format(host=self.host, port=self.port)
        response = requests.post(create_url, data=payload, timeout=self.timeout)
        if response.status_code == 200:
            return json.loads(response.text)
        return {"respcd": response.status_code}
Exemple #15
0
# coding:utf-8

if __name__ == '__main__':
    import argparse
    import time

    from utils import Storage

    parser = argparse.ArgumentParser(description='A hred model')
    args = Storage()

    parser.add_argument(
        '--name',
        type=str,
        default='hred',
        help=
        'The name of your model, used for variable scope and tensorboard, etc. Default: runXXXXXX_XXXXXX (initialized by current time)'
    )
    parser.add_argument(
        '--restore',
        type=str,
        default='last',
        help=
        'Checkpoints name to load. "last" for last checkpoints, "best" for best checkpoints on dev. Attention: "last" and "best" wiil cause unexpected behaviour when run 2 models in the same dir at the same time. Default: None (don\'t load anything)'
    )
    parser.add_argument('--mode',
                        type=str,
                        default="train",
                        help='"train" or "test". Default: train')
    parser.add_argument('--dataset',
                        type=str,
Exemple #16
0
def parseli(soup, raw=False):
    """Parse LinkedIn: Scrapes, scrubs, and returns a dictionary of
    key LinkedIn data

    :param soup: parsed profile page (an object offering ``findAll``)
    :param raw: when true, return the profile as a JSON string instead of
        the ``Storage`` object
    :return: the populated profile, or its ``json.dumps`` form when ``raw``

    The section parsers below each take the profile, fill in what they
    find on the page and return it; they are chained at the end of this
    function.

    # TODO: Extend profile to include interests + Viewers
    """
    profile = Storage({
            "id": '',
            "avatar": '',
            "url": '',
            "name": {},
            "location": {},
            "headline": '',
            "industry": '',
            "viewers": [],
            "employment": [],
            "education": [],
            "connections": '',
            "summary": '',
            })

    def meta(profile):
        """gets metadata like unique_id, and profile url"""
        jstxt = str(soup.findAll('script'))

        def get_id(x):
            """Extract the member id from the script text *x*.

            Tries, in order: the text after ``newTrkInfo = '`` up to the
            next comma, the text after ``user_id: `` up to the next comma,
            the ``id`` attribute of the first ``div.masthead``. Returns ''
            when none is found.
            """
            try:
                start_id = x.index("newTrkInfo = '") + 14
                end_id = x.index(',', start_id)
                return x[start_id:end_id]
            except ValueError:
                # marker (or the comma after it) not present; try the next one
                pass
            try:
                marker = "user_id: "
                # Skip past the marker itself; the old code returned the
                # "user_id: " prefix as part of the id.
                start_id = x.index(marker) + len(marker)
                end_id = x.index(',', start_id)
                return x[start_id:end_id]
            except ValueError:
                pass
            member_id = soup.findAll('div', {'class': 'masthead'})
            if member_id:
                return member_id[0]['id']
            return ''

        liid = get_id(jstxt)

        def get_url():
            """Return the canonical link's href, else a URL built from the id."""
            canonical_url = soup.findAll('link', {'rel': 'canonical'})
            return canonical_url[0]['href'] if canonical_url \
                else PROFILE_URL.format(liid)

        profile.id = liid
        profile.url = get_url()
        return profile

    def header(profile):
        """Parses the profile-header section

        +------------------------------------------+
        | +-------+  given_name family_name
        | |       |  title [at institution]
        | |  pic  |  locality [(area)] | Industry
        | |       |
        | +-------+
        """
        header_sec = soup.findAll('div', {'class': 'profile-header'})

        if header_sec:
            header_sec = header_sec[0]

            avatar = header_sec.findAll('div', {'id': 'profile-picture'})
            if avatar:
                profile.avatar = avatar[0].findAll('img')[0]['src']
            demographic = soup.findAll('dl', {"class": 'demographic-info adr'})
            name = header_sec.findAll('span', {"class": "full-name"})
            headline = header_sec.findAll("p", {"class": "headline-title title"})

            # Generally headline is of the form: "Title at Institution"
            if headline:
                profile.headline = headline[0].text
                if not profile.employment:
                    if ' at ' in profile.headline:
                        try:
                            title, institution = profile.headline.split(' at ')
                            profile["employment"].append({"institution": institution, "title": title})
                        except ValueError:
                            # more than one ' at ' in the headline: it cannot
                            # be split unambiguously, so it is skipped
                            pass

            if name:
                given_name = name[0].findAll('span', {'class': 'given-name'})
                family_name = name[0].findAll('span', {'class': 'family-name'})

                profile.name.update({
                        'given-name': given_name[0].text if given_name else '',
                        'family-name': family_name[0].text if family_name else ''
                        })

            # Fetch industry, location + area from header section
            if demographic:
                demos = demographic[0].findAll('dd')
                if demos:
                    if len(demos) == 2:
                        industry = demos[1].text
                        profile.industry = industry
                    try:
                        location, area = demos[0].text.replace(")", "").split("(")
                    except ValueError:
                        # not of the form "locality (area)": keep the whole
                        # text as the locality
                        location, area = demos[0].text, ""
                    profile.location = {"locality": location, "area": area}

        return profile

    def overview(profile):
        """Parses the "Overview" section: The overview is used as a
        last resort to fill in any missing information which could not
        be obtained by the 'experience' (employment) and 'education'
        sections. The quality of information it provides is inferior
        to the aforementioned.

        given_name family_name's Overview
        ---------------------------------
                Current  title at institution <0 or n>
                   Past  title at institution <0 or n>
              Education  institution <0 or n>
        """
        overview_sec = soup.findAll('dl', {'id': 'overview'})
        if overview_sec:
            if not profile.employment:
                career_selectors = [
                    overview_sec[0].findAll('div', {'class': 'summary-current'}),
                    overview_sec[0].findAll('div', {'class': 'summary-past'}),
                    overview_sec[0].findAll('div', {'class': 'past'})
                    ]
                # prune any selector whose result is not exactly a list
                # NOTE(review): `type(x) is list` is False for list
                # subclasses; if findAll returns one, every selector is
                # dropped here and this fallback never runs -- confirm.
                career_lsts = filter(lambda x: type(x) is list, career_selectors)

                # if career_lsts contains any non empty lists
                if any(career_lsts):
                    # reduce on list concat
                    careers = reduce(add, [lst[0] for lst in career_lsts])
                    for career in careers:
                        title, institution = str(career)[4:-5]\
                            .replace("\n", "").split('<span class="at">at </span>')
                        profile["employment"].append({"institution": institution, "title": title})

            if not profile.education:
                edu_subsec = overview_sec[0].findAll('dd', {'class': 'summary-education'})
                if edu_subsec:
                    edus = edu_subsec[0].findAll('li')
                    for edu in edus:
                        profile['education'].append({'summary': edu.text})
        return profile

    def employment(profile):
        """Parses the "Experience" section

        Notes:
        either dtstatus or dtend is present (exactly one of them)
        dtstamp signified 'Present' employee
        dtstamp is resolved to a binary value (1/0) for profile.current

        given_name family_name's Experience
        -----------------------------------
        # employers <1 to n>
        title
        institution
        dtstart - [dtstamp|dtend] | location
        """
        jobs = soup.findAll('div', {'id': 'profile-experience'})

        # If profile "Experience Section" exists
        if jobs:
            jobs = jobs[0]
            careers = jobs.findAll('div', {'class': EMPLOY_SEC_CLS.format("first", "current")}) + \
                jobs.findAll('div', {'class': EMPLOY_SEC_CLS.format('', 'current')}) + \
                jobs.findAll('div', {'class': EMPLOY_SEC_CLS.format('', 'past')})

            for career in careers:
                title = career.findAll("span", {'class': 'title'})
                institution = career.findAll("span", {'class': 'org summary'})
                location = career.findAll("span", {'class': 'location'})
                description = career.findAll("p", {'class': ' description past-position'})
                dtstart = career.findAll('abbr', {'class': "dtstart"})
                dtstamp = career.findAll('abbr', {'class': "dtstamp"})
                dtend = career.findAll('abbr', {'class': "dtend"})
                job = {"title": title[0].text if title else '',
                       "institution": institution[0].text if institution else '',
                       "current": 1 if dtstamp else 0,
                       "location": location[0].text if location else '',
                       "description": description[0].text if description else '',
                       "date": {
                        "start": dtstart[0]['title'] if dtstart else '',
                        "end": dtend[0]['title'] if dtend else ''
                        }
                       }
                profile["employment"].append(job)
        return profile

    def education(profile):
        """Parses the "Education" section"""
        section_edu = soup.findAll('div', {'id': 'profile-education'})
        if section_edu:
            section_edu = section_edu[0]
            edus = section_edu.findAll("div", {"class": EDU_SEC_CLS.format(' first')}) + \
                section_edu.findAll("div", {"class": EDU_SEC_CLS.format('')})
            for school in edus:
                institution = school.findAll("h3")
                degree = school.findAll('span', {'class': 'degree'})
                major = school.findAll('span', {'class': 'major'})
                dtstart = school.findAll('abbr', {'class': "dtstart"})
                dtend = school.findAll('abbr', {'class': "dtend"})
                edu = {"institution": institution[0].text if institution else '',
                       "degree": degree[0].text if degree else '',
                       "major": major[0].text if major else '',
                       "dtstart": dtstart[0]['title'] if dtstart else '',
                       "dtend": dtend[0]['title'] if dtend else ''
                       }
                profile["education"].append(edu)
        return profile

    def conns(profile):
        """User's network size"""
        cs = soup.findAll('dd', {'class': 'overview-connections'})
        if cs:
            profile['connections'] = cs[0].findAll('strong')[0].text
        return profile

    def summary(profile):
        """Stores the text of the profile-summary paragraph, if present"""
        summary_sec = soup.findAll('div', {'id': 'profile-summary'})
        if summary_sec:
            summary_sec = summary_sec[0]
            summary_content = summary_sec.findAll('p', {"class": " description summary"})
            if summary_content:
                profile.summary = summary_content[0].text
        return profile

    def similar(profile):
        """Returns a list of similar profile urls, if they exist"""
        try:
            ppl = soup.findAll('div', {'id': 'extra'})[0].findAll('a')
            profile['similar'] = list(set([a['href'] for a in ppl]))
        except (IndexError, KeyError):
            # no 'extra' section, or a link without an href: leave the
            # profile without a 'similar' entry
            pass
        return profile

    def techtags(profile):
        """Adds tech tags if they exist"""
        tags = soup.findAll('ol', {'id': 'skills-list'})
        if tags:
            profile['skills'] = [li.text for li in tags[0].findAll('li')]
        return profile

    def interests(profile):
        """Estimate interests based on groups / affiliations"""
        groups = soup.findAll('dd', {'id': 'pubgroups'})
        if groups:
            interests = [i.text for i in groups[0].findAll('li')]
            profile['interests'] = interests
        return profile

    # Order matters: overview and header only fill in employment/education
    # when the dedicated sections (run first) found nothing.
    profile = summary(similar(interests(techtags(conns(header(overview(
                        education(employment(meta(profile))))))))))
    return json.dumps(profile) if raw else profile
def ssdut_news_parse(raw):
    ''' parse the raw page src,

    store all result in a Storage object.
    all strings are unicode

    result.soup
        BeautifulSoup object
    result.raw
        raw page src
    result.hash
        sha1 hash of the page
    result.title
        title
    result.source
        来源
    result.date_str - date in string
    result.date - date object
    result.body
        html src of the news body
    result.clean_body
        unescaped src of the news body,
    result.publisher
        发表人
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(
        ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body)  unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (could be find at the bottom of page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")
    #logging.debug("publisher string = %r " % s)

    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher: like this: index.php/News/8692.html
    result.publisher = name.rstrip().lstrip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')


    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    elif isinstance(hash_src, unicode):
        pass
    else:
        pass
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([result.title, result.source,
                                  result.clean_body, result.publisher,
                                  result.sha1])
    return result
Exemple #18
0
def main(args, load_exclude_set, restoreCallback):
    """Set up logging, data and word vectors, then train or test a Seq2seq model.

    :param args: run configuration; read for debug, cuda, dataset, datapath,
        bert_vocab, wvclass, wvpath, embedding_size, cache, cache_dir and mode.
    :param load_exclude_set: stored on the volatile storage passed to the model.
    :param restoreCallback: stored on the volatile storage passed to the model.
    :raises ValueError: when args.mode is neither "train" nor "test" (raised
        after the model has been constructed).
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')

    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))

    cuda_init(0, args.cuda)

    # Run-time objects that travel with the model next to the plain arguments.
    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    # The dataloader class is looked up by name with a 'BERT' prefix.
    data_class = BERTLanguageProcessingBase.load_class('BERT' + args.dataset)
    data_arg = Storage()
    data_arg.file_id = args.datapath
    data_arg.bert_vocab = args.bert_vocab

    # Fall back to Glove when the requested word-vector class is not found.
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    def load_dataset(data_arg, wvpath, embedding_size):
        """Build the dataloader and load word vectors for its vocabulary."""
        wv = wordvec_class(wvpath)
        dm = data_class(**data_arg)
        return dm, wv.load(embedding_size, dm.vocab_list)

    loader_args = (data_arg, args.wvpath, args.embedding_size)
    if args.cache:
        cache_name = data_class.__name__ + "_" + wordvec_class.__name__
        dm, wordvec = try_cache(load_dataset, loader_args, args.cache_dir,
                                cache_name)
    else:
        dm, wordvec = load_dataset(*loader_args)

    volatile.wordvec = wordvec
    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = Seq2seq(param)
    if args.mode not in ("train", "test"):
        raise ValueError("Unknown mode")
    if args.mode == "train":
        model.train_process()
    else:
        model.test_process()
Exemple #19
0
import os
import logging
from models import Config
from utils import Storage

__all__ = ['config', 'settings']

logging.info('module config reloaded')

# Derived, in-memory settings computed from the stored configuration below.
settings = Storage()

# Look up the Config entry keyed 'default'; when the lookup yields nothing,
# create one and call put() on it.
config = Config.get_by_key_name('default')
if not config:
	config = Config(key_name='default')
	config.put()

# app_root is either empty or a single leading '/' followed by the configured
# root with surrounding slashes and spaces stripped.
settings.app_root = ('/' + config.app_root.strip('/ ')) if config.app_root else ''

# The home page is always the application root plus a trailing slash.
settings.home_page = settings.app_root + '/'

    def forward(self,
                inp,
                wLinearLayerCallback,
                h_init=None,
                mode='max',
                input_callback=None,
                no_unk=True,
                top_k=10):
        """Run the attention decoder step by step, mixing teacher forcing and sampling.

        At every step the cell input is either the embedding of the token
        decoded at the previous step (with probability ``inp.sampling_proba``)
        or the ground-truth embedding ``inp.embedding[i]`` (teacher forcing).

        Args:
            inp: storage read for batch_size, post, post_length, embedding,
                resp_length, sampling_proba, embLayer and dm.go_id, plus an
                optional "init_h" entry used as the initial hidden state.
                ``inp.embedding`` is stepped over along its first axis and
                its second axis is taken as the batch size.
                ``inp.embedding`` is overwritten when input_callback is given.
            wLinearLayerCallback: called as ``wLinearLayerCallback(gru_h)``
                with the hidden state concatenated with the attention
                context; its result is the per-step score over the vocabulary.
            h_init: not read by this implementation; the initial state comes
                from ``inp.get("init_h", None)``.
            mode: decoding strategy for the next input token: "max" (argmax),
                "gumbel" / "sample" (gumbel_max over all allowed tokens) or
                "samplek" (gumbel_max restricted to the top_k scores).
            input_callback: optional ``input_callback(embedding)``; applied
                once to ``inp.embedding`` up front and to every sampled
                input embedding.
            no_unk: when true, token ids below ``inp.dm.go_id`` are excluded
                from decoding; otherwise all ids are allowed.
            top_k: number of candidates kept in "samplek" mode.

        Returns:
            A Storage whose ``w_pro`` holds the per-step outputs of
            wLinearLayerCallback stacked along dimension 0.

        Raises:
            AttributeError: if ``mode`` is not one of the recognized values.
        """
        _, h_now, context = self.init_forward_all(inp.batch_size,
                                                  inp.post,
                                                  inp.post_length,
                                                  h_init=inp.get(
                                                      "init_h", None))

        gen = Storage()
        gen.w_pro = []
        batch_size = inp.embedding.shape[1]
        seqlen = inp.embedding.shape[0]
        length = inp.resp_length - 1
        start_id = inp.dm.go_id if no_unk else 0

        # Decoding starts from the embedding of the go token for every sample.
        first_emb = inp.embLayer(LongTensor([inp.dm.go_id
                                             ])).repeat(inp.batch_size, 1)
        next_emb = first_emb

        if input_callback:
            inp.embedding = input_callback(inp.embedding)

        for i in range(seqlen):
            proba = random()

            # Sampling
            if proba < inp.sampling_proba:
                now = next_emb
                if input_callback:
                    now = input_callback(now)
            # Teacher Forcing
            else:
                now = inp.embedding[i]

            # Optionally feed the previous attention context into the cell.
            # torch.cat takes `dim`; the former `last_dim` keyword raised
            # TypeError whenever gru_input_attn was enabled.
            if self.gru_input_attn:
                cell_input = torch.cat([now, context], dim=-1)
            else:
                cell_input = now
            h_raw = self.cell_forward(cell_input, h_now)

            # Zero the hidden state of samples whose length is already reached.
            step_mask = Tensor(
                (length > np.ones(batch_size) * i).astype(float)).unsqueeze(-1)
            h_now = h_raw * step_mask

            # Dot-product attention of the new state over the encoded post.
            query = self.attn_query(h_now)
            attn_weight = maskedSoftmax(
                (query.unsqueeze(0) * inp.post).sum(-1), inp.post_length)
            context = (attn_weight.unsqueeze(-1) * inp.post).sum(0)

            gru_h = torch.cat([h_now, context], dim=-1)

            w = wLinearLayerCallback(gru_h)
            gen.w_pro.append(w)

            # Decoding
            if mode == "max":
                w = torch.argmax(w[:, start_id:], dim=1) + start_id
                next_emb = inp.embLayer(w)
            elif mode == "gumbel" or mode == "sample":
                w_onehot = gumbel_max(w[:, start_id:])
                w = torch.argmax(w_onehot, dim=1) + start_id
                next_emb = torch.sum(
                    torch.unsqueeze(w_onehot, -1) *
                    inp.embLayer.weight[start_id:], 1)
            elif mode == "samplek":
                _, index = w[:,
                             start_id:].topk(top_k,
                                             dim=-1,
                                             largest=True,
                                             sorted=True)  # batch_size, top_k
                mask = torch.zeros_like(w[:,
                                          start_id:]).scatter_(-1, index, 1.0)
                w_onehot = gumbel_max_with_mask(w[:, start_id:], mask)
                w = torch.argmax(w_onehot, dim=1) + start_id
                next_emb = torch.sum(
                    torch.unsqueeze(w_onehot, -1) *
                    inp.embLayer.weight[start_id:], 1)
            else:
                raise AttributeError(
                    "The given mode {} is not recognized.".format(mode))

        gen.w_pro = torch.stack(gen.w_pro, dim=0)

        return gen
Exemple #21
0
        self.seq = [("alternate_ip", IPField)]

        BaseMessage.__init__(self, *args, **kwargs)


# Lookup table from a two-character message-type code to the message class
# registered for that code (presumably BaseMessage subclasses -- confirm).
# The codes are grouped by their second byte: 0x01-0x07, 0x10-0x15, 0x20,
# 0x40-0x41 and 0xa0.
MessageTypes = Storage(
    {
        "\x00\x01": ShortResponse,
        "\x00\x02": LoginRequest,
        "\x00\x03": LoginReply,
        "\x00\x04": AlternateServerMessage,
        "\x00\x05": Logout,
        "\x00\x06": KeepAlive,
        "\x00\x07": KeepAliveAck,
        "\x00\x10": ClientInvite,
        "\x00\x11": ServerRejectInvite,
        "\x00\x12": ServerForwardInvite,
        "\x00\x13": ClientInviteAck,
        "\x00\x14": ServerForwardRing,
        "\x00\x15": ClientAnswer,
        "\x00\x20": ClientRTP,
        "\x00\x40": HangupRequest,
        "\x00\x41": HangupRequestAck,
        "\x00\xa0": ServerOverloaded,
    }
)


def keyof(_v):
    for k, v in MessageTypes.iteritems():
        if _v == v or isinstance(_v, v):
Exemple #22
0
def run(*argv):
    """Parse command-line style arguments and launch the hred model.

    The parsed options are copied into a Storage together with a set of
    fixed hyper-parameters, the ``random`` module is seeded with 0, and the
    result is handed to ``main.main``.

    :param argv: argument strings, as they would appear after the program
        name on a command line.
    """
    import argparse
    import time

    from utils import Storage

    parser = argparse.ArgumentParser(description='A hred model')
    args = Storage()

    # Every help text reports its default through argparse's %(default)s so
    # the documentation cannot drift from the actual default value.
    parser.add_argument(
        '--name',
        type=str,
        default='hred',
        help=
        'The name of your model, used for variable scope and tensorboard, etc. Default: %(default)s (an empty name falls back to runXXXXXX_XXXXXX, initialized by current time)'
    )
    parser.add_argument(
        '--restore',
        type=str,
        default='best',
        help=
        'Checkpoints name to load. "last" for last checkpoints, "best" for best checkpoints on dev. Attention: "last" and "best" will cause unexpected behaviour when run 2 models in the same dir at the same time. Default: %(default)s'
    )
    parser.add_argument('--mode',
                        type=str,
                        default="train",
                        help='"train" or "test". Default: %(default)s')
    parser.add_argument('--dataset',
                        type=str,
                        default='MyMemHRED',
                        help='Dataloader class. Default: %(default)s')
    parser.add_argument('--datapath',
                        type=str,
                        default='../data/film',
                        help='Directory for data set. Default: %(default)s')
    parser.add_argument('--epoch',
                        type=int,
                        default=20,
                        help="Epoch for training. Default: %(default)s")
    parser.add_argument(
        '--wvclass',
        type=str,
        default='TencentChinese',
        help=
        "Wordvector class, none for not using pretrained wordvec. Default: %(default)s"
    )
    parser.add_argument(
        '--wvpath',
        type=str,
        default="wordvector/chinese",
        help="Directory for pretrained wordvector. Default: %(default)s")

    parser.add_argument(
        '--out_dir',
        type=str,
        default="./output/film",
        help='Output directory for test output. Default: %(default)s')
    parser.add_argument(
        '--log_dir',
        type=str,
        default="./tensorboard/film",
        help='Log directory for tensorboard. Default: %(default)s')
    parser.add_argument(
        '--model_dir',
        type=str,
        default="./model/film",
        help='Checkpoints directory for model. Default: %(default)s')
    parser.add_argument(
        '--cache_dir',
        type=str,
        default="./cache/film",
        help='Checkpoints directory for cache. Default: %(default)s')
    parser.add_argument('--cpu', action="store_true", help='Use cpu.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Enter debug mode (using ptvsd).')
    parser.add_argument(
        '--cache',
        action='store_true',
        help=
        'Use cache for speeding up load data and wordvec. (It may cause problems when you switch dataset.)'
    )
    cargs = parser.parse_args(argv)

    # Editing following arguments to bypass command line.
    # An empty --name falls back to a timestamp-based run name.
    args.name = cargs.name or time.strftime("run%Y%m%d_%H%M%S",
                                            time.localtime())
    args.restore = cargs.restore
    args.mode = cargs.mode
    args.dataset = cargs.dataset
    args.datapath = cargs.datapath
    # The command-line flag is --epoch, but it is stored as args.epochs.
    args.epochs = cargs.epoch
    args.wvclass = cargs.wvclass
    args.wvpath = cargs.wvpath
    args.out_dir = cargs.out_dir
    args.log_dir = cargs.log_dir
    args.model_dir = cargs.model_dir
    args.cache_dir = cargs.cache_dir
    args.debug = cargs.debug
    args.cache = cargs.cache
    args.cuda = not cargs.cpu

    # Fixed hyper-parameters that are not exposed on the command line.
    args.softmax_samples = 512
    args.embedding_size = 200
    args.eh_size = 200
    args.ch_size = 200
    args.dh_size = 200
    args.lr = 1e-3
    args.lr_decay = 0.99
    args.batch_size = 32
    args.grad_clip = 5.0
    args.show_sample = [0]
    args.max_sent_length = 50
    args.checkpoint_steps = 100
    args.checkpoint_max_to_keep = 5

    # Seed the random module with a fixed value before starting the run.
    import random
    random.seed(0)

    from main import main

    main(args)