def add_good(self, goods_name, goods_content='', goods_num=1):
    """
    Add a goods item to the order.
    :param goods_num: quantity of the goods
    :param goods_name: name of the goods
    :param goods_content: remarks for the goods
    :return:
    """
    data = Storage()
    data.goods_num = goods_num
    data.goods_name = goods_name
    data.goods_content = goods_content
    self.goods.append(data)
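# All of the snippets below rely on a dict-like ``Storage`` container that also
# allows attribute access. A minimal sketch of the assumed behaviour (the real
# class may differ; e.g. web2py/web.py-style Storage returns None for missing
# attributes instead of raising):
class Storage(dict):
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            return None

    def __setattr__(self, key, value):
        self[key] = value

    def __delattr__(self, key):
        try:
            del self[key]
        except KeyError:
            raise AttributeError(key)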
def get_sessionid(self):
    """
    Obtain a sessionid by logging in.
    :return:
    """
    data = Storage()
    data.username = 15201287981
    data.password = "******"
    url = 'http://{host}:{port}/marathon/user/login'.format(host=self.host, port=self.port)
    resp = requests.post(url, params=data, timeout=self.timeout)
    if resp.status_code != 200:
        return {"respcd": resp.status_code}
    cookies = {'sessionid': json.loads(resp.text)['data']['sessionid']}
    return cookies
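# Assumed usage sketch, not from the source: the cookies dict returned by
# get_sessionid() can be passed straight to requests for later authenticated
# calls. `client` and `list_url` are hypothetical names; `requests` is assumed
# to be imported as in the snippet above.
def fetch_with_session(client, list_url):
    cookies = client.get_sessionid()
    if "respcd" in cookies:  # login failed, non-200 status was returned
        return cookies
    return requests.get(list_url, cookies=cookies, timeout=client.timeout)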
def accepts(
    self,
    vars,
    session=None,
    formname='default',
    keepvalues=False,
    onvalidation=None,
):
    self.errors.clear()
    self.request_vars = Storage()
    self.request_vars.update(vars)
    self.session = session
    self.formname = formname
    self.keepvalues = keepvalues

    # if this tag is a form and we are in accepting mode (status=True)
    # check formname and formkey
    status = True
    if self.session and self.session.get('_formkey[%s]' % self.formname,
                                         None) != self.request_vars._formkey:
        status = False
    if self.formname != self.request_vars._formname:
        status = False
    status = self._traverse(status)
    if status and onvalidation:
        onvalidation(self)
    if self.errors:
        status = False
    if session is not None:
        self.formkey = session['_formkey[%s]' % formname] = \
            str(uuid.uuid4())
    if status and not keepvalues:
        self._traverse(False)
    return status
def load_tests(self):
    """
    Iterate through the plugoos inside the folder specified by the
    config file and instantiate them.
    """
    pluginfiles = [fname[:-3] for fname in os.listdir(self.config.main.testdir)
                   if fname.endswith(".py")]
    for fname in pluginfiles:
        test = Storage()
        test_name = fname
        if not self.config.main.testdir in sys.path:
            sys.path.insert(0, self.config.main.testdir)
        #print "Fname: %s\n__import__(%s)" % (fname, fname)
        #print sys.path
        module = __import__(fname)
        try:
            test.name = module.__plugoo__
            test.desc = module.__desc__
            test.module = module
        except Exception, e:
            self.logger.warning("Soft fail %s", e)
            test.name = test_name
            test.desc = ""
            test.module = module
        try:
            self.tests[test_name] = test
        except Exception, e:
            print "Failed to load the test %s %s" % (test_name, e)
def __init__(self, environ, render=None):
    """Parses the given WSGI environment to construct the request."""
    self.request = Storage()
    self.response = Storage()
    self._render = render
    self.request.method = environ["REQUEST_METHOD"]
    self.request.protocol = environ["wsgi.url_scheme"]
    self.request.remote_ip = environ.get("REMOTE_ADDR", "")
    self.request.path = environ.get("PATH_INFO")
    if environ.get('QUERY_STRING'):
        self.request.query = '?' + environ.get('QUERY_STRING', '')
    else:
        self.request.query = ''
    self.request.fullpath = self.request.path + self.request.query
    self.request.version = "HTTP/1.1"
    self.request.headers = {}
    self.request.headers["Content-Type"] = environ.get("CONTENT_TYPE", "")
    self.request.headers["Content-Length"] = environ.get("CONTENT_LENGTH", "")
    for key in environ:
        if key.startswith("HTTP_"):
            self.request.headers[key[5:].replace("_", "-")] = environ[key]
    if self.request.headers.get("Content-Length"):
        self.request.body = environ["wsgi.input"].read(
            int(self.request.headers["Content-Length"]))
    else:
        self.request.body = ""
    if environ.get("HTTP_HOST"):
        self.request.host = environ["HTTP_HOST"]
    else:
        self.request.host = environ["SERVER_NAME"]
    self.response.status = "200 OK"
    #self.response.headers = {"Content-Type":"text/html; charset=UTF-8"}
    # If Content-Type is set, Content-Length must be set as well; with
    # Content-Length set, Content-Type may be omitted.
    # Omitting Content-Length easily leads to [Errno 32] Broken pipe.
    self.response.headers = []
    #self.response.headers = [("Content-Type","text/html; charset=UTF-8")]
    self.request.arguments = self.rawinput()
    self.initialize()
def query(self, order_ids):
    """
    Query delivery orders.
    :param order_ids: a list of order ids
    :return:
    """
    if not isinstance(order_ids, list):
        raise Exception('parameters error')
    data = Storage()
    data.orderid = json.dumps(order_ids)
    data.app_code = self.app_code
    data.sign = self._gen_sign(data)
    url = 'http://{host}:{port}/marathon/disorder/query'.format(host=self.host, port=self.port)
    resp = requests.get(url, params=data, timeout=self.timeout)
    if resp.status_code != 200:
        return {"respcd": resp.status_code}
    ret = json.loads(resp.text)
    from ..qflogger import log
    log.info("delivery query, url: %s, param: %s, ret: %s" % (url, data, ret))
    return ret
def __init__(self, *components, **attributes):
    if self.tag[-1:] == '/' and components:
        raise SyntaxError, '<%s> tags cannot have components' % self.tag
    if len(components) == 1 and isinstance(components[0], (list, tuple)):
        self.components = list(components[0])
    else:
        self.components = list(components)
    self.attributes = attributes
    self._fixup()
    # converts special attributes in components attributes
    self._postprocessing()
    self.vars = Storage()
    self.errors = Storage()
    self.latest = Storage()
def update(self, order_id, **kwargs):
    """
    Update a delivery order.
    :param order_id: order id
    :param kwargs: fields to update
    :return:
    """
    if not self._check_params_valid(self.update_param_fields, kwargs):
        raise Exception('parameters error')
    data = Storage()
    data.orderid = order_id
    for key, value in kwargs.iteritems():
        if value:
            data[key] = value
    data.app_code = self.app_code
    data.sign = self._gen_sign(data)
    url = 'http://{host}:{port}/marathon/disorder/edit'.format(host=self.host, port=self.port)
    resp = requests.post(url, data=data, timeout=self.timeout)
    if resp.status_code != 200:
        return {"respcd": resp.status_code}
    ret = json.loads(resp.text)
    return ret
def ssdut_news_list(page_raw):
    '''Parse the news_list page and return a list of news in the same
    sequence as the page.

    result.soup
          .page_no
          .news_list
          .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text=u"下一页" ("next page")
    if r:
        # not the last page
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        # the last page
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')  # u"上一页" ("previous page")
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile("\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append(
            {
                "link": a[0].get("href").encode("utf-8"),
                "title": a[0].text.encode("utf-8"),
                "source": a[1].text.encode("utf-8"),
                "source_link": a[1].get("href").encode("utf-8"),
                "date_str": date_str,
                "date": datetime.date(*[int(n) for n in date_str.split("-")]),
                "no": counter,
            })
        counter += 1
    #logging.debug("source = %s, source_link = %s" %
    #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # total record count, shown on the page as "共\d+ 条记录" ("\d+ records in total")
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result
def forward(self, incoming):
    incoming.conn = conn = Storage()
    conn.init_h = self.initLinearLayer(incoming.hidden.h_n)
class FORM(DIV):
    """
    example:

    >>> form = FORM(INPUT(_name="test", requires=IS_NOT_EMPTY()))
    >>> form.xml()
    '<form action="" enctype="multipart/form-data" method="post"><input name="test" /></form>'

    a FORM is a container for INPUT, TEXTAREA, SELECT and other helpers

    form has one important method:

    >>> form.accepts(request.vars, session)

    if the form is accepted (and all validators pass) form.vars contains the
    accepted vars, otherwise form.errors contains the errors.
    in case of errors the form is modified to present the errors to the user.
    """

    tag = 'form'

    def __init__(self, *components, **attributes):
        if self.tag[-1:] == '/' and components:
            raise SyntaxError, '<%s> tags cannot have components' % self.tag
        if len(components) == 1 and isinstance(components[0], (list, tuple)):
            self.components = list(components[0])
        else:
            self.components = list(components)
        self.attributes = attributes
        self._fixup()
        # converts special attributes in components attributes
        self._postprocessing()
        self.vars = Storage()
        self.errors = Storage()
        self.latest = Storage()

    def accepts(
        self,
        vars,
        session=None,
        formname='default',
        keepvalues=False,
        onvalidation=None,
    ):
        self.errors.clear()
        self.request_vars = Storage()
        self.request_vars.update(vars)
        self.session = session
        self.formname = formname
        self.keepvalues = keepvalues

        # if this tag is a form and we are in accepting mode (status=True)
        # check formname and formkey
        status = True
        if self.session and self.session.get('_formkey[%s]' % self.formname,
                                             None) != self.request_vars._formkey:
            status = False
        if self.formname != self.request_vars._formname:
            status = False
        status = self._traverse(status)
        if status and onvalidation:
            onvalidation(self)
        if self.errors:
            status = False
        if session is not None:
            self.formkey = session['_formkey[%s]' % formname] = \
                str(uuid.uuid4())
        if status and not keepvalues:
            self._traverse(False)
        return status

    def _postprocessing(self):
        if not '_action' in self.attributes:
            self['_action'] = ''
        if not '_method' in self.attributes:
            self['_method'] = 'post'
        #if not '_enctype' in self.attributes:
        #    self['_enctype'] = 'multipart/form-data'

    def hidden_fields(self):
        c = []
        if 'hidden' in self.attributes:
            for (key, value) in self.attributes.get('hidden', {}).items():
                c.append(INPUT(_type='hidden', _name=key, _value=value))
        if hasattr(self, 'formkey') and self.formkey:
            c.append(INPUT(_type='hidden', _name='_formkey',
                           _value=self.formkey))
        if hasattr(self, 'formname') and self.formname:
            c.append(INPUT(_type='hidden', _name='_formname',
                           _value=self.formname))
        return DIV(c, _class="hidden")

    def xml(self):
        newform = FORM(*self.components, **self.attributes)
        newform.append(self.hidden_fields())
        return DIV.xml(newform)
# coding:utf-8

if __name__ == '__main__':
    import argparse
    import time

    from utils import Storage

    parser = argparse.ArgumentParser(description='A seq2seq model')
    args = Storage()

    parser.add_argument(
        '--name',
        type=str,
        default=None,
        help=
        'The name of your model, used for tensorboard, etc. Default: runXXXXXX_XXXXXX (initialized by current time)'
    )
    parser.add_argument(
        '--restore',
        type=str,
        default=None,
        help=
        'Checkpoints name to load. "last" for last checkpoints, "best" for best checkpoints on dev. Attention: "last" and "best" will cause unexpected behaviour when running 2 models in the same dir at the same time. Default: None (don\'t load anything)'
    )
    parser.add_argument('--mode',
                        type=str,
                        default="train",
                        help='"train" or "test". Default: train')
    parser.add_argument('--dataset',
                        type=str,
class HTTPRequest(object):

    def __init__(self, environ, render=None):
        """Parses the given WSGI environment to construct the request."""
        self.request = Storage()
        self.response = Storage()
        self._render = render
        self.request.method = environ["REQUEST_METHOD"]
        self.request.protocol = environ["wsgi.url_scheme"]
        self.request.remote_ip = environ.get("REMOTE_ADDR", "")
        self.request.path = environ.get("PATH_INFO")
        if environ.get('QUERY_STRING'):
            self.request.query = '?' + environ.get('QUERY_STRING', '')
        else:
            self.request.query = ''
        self.request.fullpath = self.request.path + self.request.query
        self.request.version = "HTTP/1.1"
        self.request.headers = {}
        self.request.headers["Content-Type"] = environ.get("CONTENT_TYPE", "")
        self.request.headers["Content-Length"] = environ.get("CONTENT_LENGTH", "")
        for key in environ:
            if key.startswith("HTTP_"):
                self.request.headers[key[5:].replace("_", "-")] = environ[key]
        if self.request.headers.get("Content-Length"):
            self.request.body = environ["wsgi.input"].read(
                int(self.request.headers["Content-Length"]))
        else:
            self.request.body = ""
        if environ.get("HTTP_HOST"):
            self.request.host = environ["HTTP_HOST"]
        else:
            self.request.host = environ["SERVER_NAME"]
        self.response.status = "200 OK"
        #self.response.headers = {"Content-Type":"text/html; charset=UTF-8"}
        # If Content-Type is set, Content-Length must be set as well; with
        # Content-Length set, Content-Type may be omitted.
        # Omitting Content-Length easily leads to [Errno 32] Broken pipe.
        self.response.headers = []
        #self.response.headers = [("Content-Type","text/html; charset=UTF-8")]
        self.request.arguments = self.rawinput()
        self.initialize()

    def initialize(self):
        pass

    def supports_http_1_1(self):
        """Returns True if this request supports HTTP/1.1 semantics"""
        return self.request.version == "HTTP/1.1"

    def rawinput(self, method="both"):
        method = method.lower()
        urlparams = urlparse.parse_qs(self.request.query[1:])
        if method == "url":
            return urlparams
        bodyparams = self._parse_body_arguments()
        if method == "body":
            return bodyparams or {}
        if bodyparams:
            for k, v in bodyparams.items():
                if k in urlparams:
                    urlparams[k] += v
                else:
                    urlparams[k] = v
        return urlparams

    def _parse_body_arguments(self):
        if self.request.headers['Content-Type'] == "application/x-www-form-urlencoded":
            return urlparse.parse_qs(self.request.body)
        elif self.request.headers['Content-Type'] in ("multipart/form-data",
                                                      "application/octet-stream"):
            # strip the redundant characters at the beginning and the end
            tmpIndex = self.request.body.find('Content-Type')
            startIndex = 0
            if tmpIndex != -1:
                startIndex = self.request.body.find('\n', tmpIndex) + 3
            endIndex = len(self.request.body) - 2
            while True:
                if self.request.body[endIndex] == '\n':
                    break
                else:
                    endIndex -= 1
            # slice out the real content
            self.request.body = self.request.body[startIndex:endIndex-1]
            self.request.size = endIndex - startIndex - 1

    def HEAD(self, *args, **kwargs):
        return NoMethod(self)

    def GET(self, *args, **kwargs):
        return NoMethod(self)

    def POST(self, *args, **kwargs):
        return NoMethod(self)

    def DELETE(self, *args, **kwargs):
        return NoMethod(self)

    def PATCH(self, *args, **kwargs):
        return NoMethod(self)

    def PUT(self, *args, **kwargs):
        return NoMethod(self)

    def OPTIONS(self, *args, **kwargs):
        return NoMethod(self)

    def set_header(self, name, value):
        for x in self.response.headers:
            if x[0] == name:
                self.response.headers.remove(x)
                break
        self.response.headers.append((name, value))

    def get_header(self, name):
        for x in self.response.headers:
            if x[0] == name:
                return x[1]
        return None

    def clear_header(self, name):
        for x in self.response.headers:
            if x[0] == name:
                self.response.headers.remove(x)
                break

    def cookies(self):
        """A dictionary of Cookie.Morsel objects."""
        if not hasattr(self.request, "_cookies"):
            if "COOKIE" in self.request.headers:
                c = self.request.headers['COOKIE']
                if '"' in c:
                    cookie = Cookie.SimpleCookie()
                    try:
                        cookie.load(c)
                        self.request._cookies = dict(
                            (k, urllib.unquote(v.value))
                            for k, v in cookie.iteritems())
                    except Exception:
                        self.request._cookies = {}
                else:
                    self.request._cookies = {}
                    for key_value in c.split(';'):
                        key_value = key_value.split('=', 1)
                        if len(key_value) == 2:
                            key, value = key_value
                            self.request._cookies[key.strip()] = urllib.unquote(value.strip())
            else:
                self.request._cookies = {}
        return self.request._cookies

    def get_cookie(self, name, value=None):
        return self.cookies().get(name, value)

    def set_cookie(self, name, value, expires='', domain=None,
                   secure=False, httponly=False, path="/"):
        morsel = Cookie.Morsel()
        morsel.set(name, value, urllib.quote(value))
        if expires < 0:
            expires = -1000000000
        morsel['expires'] = int(expires)
        morsel['path'] = path
        if domain:
            morsel['domain'] = domain
        if secure:
            morsel['secure'] = secure
        value = morsel.OutputString()
        if httponly:
            value += '; httponly'
        self.response.headers.append(("Set-Cookie", value))

    def clear_cookie(self, name, path="/", domain=None):
        self.set_cookie(name, value="", path=path, domain=domain)

    def render(self, filename, **kw):
        self.response.headers = [("Content-Type", "text/html; charset=UTF-8")]
        return getattr(self._render, filename)(**kw)

    def redirect(self, url):
        newloc = urlparse.urljoin(self.request.path, url)
        if url.startswith('/'):
            newloc = self.request.host + newloc
        self.response.status = "303 See Other"
        self.response.headers = {'Content-Type': 'text/html',
                                 'Location': newloc}

    def cleanup(self):
        self.request.clear()
        self.response.clear()
def create(self, order_id, regionid, customer_address, customer_name,
           customer_mobile, mchnt_address, mchnt_name, mchnt_mobile,
           order_time, limit_time, order_type=1):
    """
    Create a delivery order.
    :param order_id: order id
    :param order_type: order type, 1: normal order, 2: internal-test order, optional (defaults to 1)
    :param regionid: delivery region id
    :param customer_address: customer address
    :param customer_name: customer name
    :param customer_mobile: customer mobile number
    :param mchnt_address: merchant address
    :param mchnt_name: merchant name
    :param mchnt_mobile: merchant mobile number
    :param order_time: time the order was placed
    :param limit_time: required delivery time
    :return:
    """
    data = Storage()
    data.orderid = order_id
    data.type = order_type
    data.regionid = regionid
    data.customer_address = customer_address
    data.customer_name = customer_name
    data.customer_mobile = customer_mobile
    data.mchnt_address = mchnt_address
    data.mchnt_name = mchnt_name
    data.mchnt_mobile = mchnt_mobile
    data.order_time = order_time
    data.limit_time = limit_time
    data.goods_info = json.dumps(self.goods)
    data.app_code = self.app_code
    data.sign = self._gen_sign(data)
    url = 'http://{host}:{port}/marathon/disorder/create'.format(host=self.host, port=self.port)
    resp = requests.post(url, data=data, timeout=self.timeout)
    if resp.status_code != 200:
        return {"respcd": resp.status_code}
    ret = json.loads(resp.text)
    return ret
# coding:utf-8

if __name__ == '__main__':
    import argparse
    import time

    from utils import Storage

    parser = argparse.ArgumentParser(description='A hred model')
    args = Storage()

    parser.add_argument(
        '--name',
        type=str,
        default='hred',
        help=
        'The name of your model, used for variable scope and tensorboard, etc. Default: runXXXXXX_XXXXXX (initialized by current time)'
    )
    parser.add_argument(
        '--restore',
        type=str,
        default='last',
        help=
        'Checkpoints name to load. "last" for last checkpoints, "best" for best checkpoints on dev. Attention: "last" and "best" will cause unexpected behaviour when running 2 models in the same dir at the same time. Default: None (don\'t load anything)'
    )
    parser.add_argument('--mode',
                        type=str,
                        default="train",
                        help='"train" or "test". Default: train')
    parser.add_argument('--dataset',
                        type=str,
def parseli(soup, raw=False):
    """Parse LinkedIn: Scrapes, scrubs, and returns a dictionary
    of key LinkedIn data

    # TODO: Extend profile to include interests + Viewers
    """
    profile = Storage({
        "id": '',
        "avatar": '',
        "url": '',
        "name": {},
        "location": {},
        "headline": '',
        "industry": '',
        "viewers": [],
        "employment": [],
        "education": [],
        "connections": '',
        "summary": '',
    })

    def meta(profile):
        """gets metadata like unique_id, and profile url"""
        jstxt = str(soup.findAll('script'))

        def get_id(x):
            try:
                start_id = x.index("newTrkInfo = '") + 14
                end_id = x.index(',', start_id)
                return x[start_id:end_id]
            except:
                try:
                    start_id = x.index("user_id: ")
                    end_id = x.index(',', start_id)
                    return x[start_id:end_id]
                except:
                    member_id = soup.findAll('div', {'class': 'masthead'})
                    if member_id:
                        return member_id[0]['id']
                    return ''

        liid = get_id(jstxt)

        def get_url():
            canonical_url = soup.findAll('link', {'rel': 'canonical'})
            return canonical_url[0]['href'] if canonical_url \
                else PROFILE_URL.format(liid)

        profile.id = liid
        profile.url = get_url()
        return profile

    def header(profile):
        """Parses the profile-header section

        +------------------------------------------+
        | +-------+  given_name family_name        |
        | |       |  title [at institution]        |
        | |  pic  |  locality [(area)] | Industry  |
        | |       |                                |
        | +-------+                                |
        """
        header_sec = soup.findAll('div', {'class': 'profile-header'})
        if header_sec:
            header_sec = header_sec[0]
            avatar = header_sec.findAll('div', {'id': 'profile-picture'})
            if avatar:
                profile.avatar = avatar[0].findAll('img')[0]['src']
            demographic = soup.findAll('dl', {"class": 'demographic-info adr'})
            name = header_sec.findAll('span', {"class": "full-name"})
            headline = header_sec.findAll("p", {"class": "headline-title title"})
            # Generally headline is of the form: "Title at Institution"
            if headline:
                profile.headline = headline[0].text
                if not profile.employment:
                    if ' at ' in profile.headline:
                        try:
                            title, institution = profile.headline.split(' at ')
                            profile["employment"].append(
                                {"institution": institution, "title": title})
                        except:
                            pass
            if name:
                given_name = name[0].findAll('span', {'class': 'given-name'})
                family_name = name[0].findAll('span', {'class': 'family-name'})
                profile.name.update({
                    'given-name': given_name[0].text if given_name else '',
                    'family-name': family_name[0].text if family_name else ''
                })
            # Fetch industry, location + area from header section
            if demographic:
                demos = demographic[0].findAll('dd')
                if demos:
                    if len(demos) == 2:
                        industry = demos[1].text
                        profile.industry = industry
                    try:
                        location, area = demos[0].text.replace(")", "").split("(")
                    except:
                        location, area = demos[0].text, ""
                    profile.location = {"locality": location, "area": area}
        return profile

    def overview(profile):
        """Parses the "Overview" section:

        The overview is used as a last resort to fill in any missing
        information which could not be obtained by the 'experience'
        (employment) and 'education' sections. The quality of information
        it provides is inferior to the aforementioned.

        given_name family_name's Overview
        ---------------------------------
        Current     title at institution    <0 or n>
        Past        title at institution    <0 or n>
        Education   institution             <0 or n>
        """
        overview_sec = soup.findAll('dl', {'id': 'overview'})
        if overview_sec:
            if not profile.employment:
                career_selectors = [
                    overview_sec[0].findAll('div', {'class': 'summary-current'}),
                    overview_sec[0].findAll('div', {'class': 'summary-past'}),
                    overview_sec[0].findAll('div', {'class': 'past'})
                ]
                # prune any selector which returns no results, i.e. [],
                # are not lists
                career_lsts = filter(lambda x: type(x) is list, career_selectors)
                # if career_lsts contains any non empty lists
                if any(career_lsts):
                    # reduce on list concat
                    careers = reduce(add, [lst[0] for lst in career_lsts])
                    for career in careers:
                        title, institution = str(career)[4:-5]\
                            .replace("\n", "").split('<span class="at">at </span>')
                        profile["employment"].append(
                            {"institution": institution, "title": title})
            if not profile.education:
                edu_subsec = overview_sec[0].findAll('dd', {'class': 'summary-education'})
                if edu_subsec:
                    edus = edu_subsec[0].findAll('li')
                    for edu in edus:
                        profile['education'].append({'summary': edu.text})
        return profile

    def employment(profile):
        """Parses the "Experience" section

        Notes: either dtstatus or dtend is present (exactly one of them)
               dtstamp signifies a 'Present' employee
               dtstamp is resolved to a binary value (1/0) for profile.current

        given_name family_name's Experience
        -----------------------------------
        # employers <1 to n>
            title
            institution
            dtstart - [dtstamp|dtend] | location
        """
        jobs = soup.findAll('div', {'id': 'profile-experience'})
        # If profile "Experience Section" exists
        if jobs:
            jobs = jobs[0]
            careers = jobs.findAll('div', {'class': EMPLOY_SEC_CLS.format("first", "current")}) + \
                jobs.findAll('div', {'class': EMPLOY_SEC_CLS.format('', 'current')}) + \
                jobs.findAll('div', {'class': EMPLOY_SEC_CLS.format('', 'past')})
            for career in careers:
                title = career.findAll("span", {'class': 'title'})
                institution = career.findAll("span", {'class': 'org summary'})
                location = career.findAll("span", {'class': 'location'})
                description = career.findAll("p", {'class': ' description past-position'})
                dtstart = career.findAll('abbr', {'class': "dtstart"})
                dtstamp = career.findAll('abbr', {'class': "dtstamp"})
                dtend = career.findAll('abbr', {'class': "dtend"})
                job = {"title": title[0].text if title else '',
                       "institution": institution[0].text if institution else '',
                       "current": 1 if dtstamp else 0,
                       "location": location[0].text if location else '',
                       "description": description[0].text if description else '',
                       "date": {
                           "start": dtstart[0]['title'] if dtstart else '',
                           "end": dtend[0]['title'] if dtend else ''
                       }}
                profile["employment"].append(job)
        return profile

    def education(profile):
        """Parses the "Education" section"""
        section_edu = soup.findAll('div', {'id': 'profile-education'})
        if section_edu:
            section_edu = section_edu[0]
            edus = section_edu.findAll("div", {"class": EDU_SEC_CLS.format(' first')}) + \
                section_edu.findAll("div", {"class": EDU_SEC_CLS.format('')})
            for school in edus:
                institution = school.findAll("h3")
                degree = school.findAll('span', {'class': 'degree'})
                major = school.findAll('span', {'class': 'major'})
                dtstart = school.findAll('abbr', {'class': "dtstart"})
                dtend = school.findAll('abbr', {'class': "dtend"})
                edu = {"institution": institution[0].text if institution else '',
                       "degree": degree[0].text if degree else '',
                       "major": major[0].text if major else '',
                       "dtstart": dtstart[0]['title'] if dtstart else '',
                       "dtend": dtend[0]['title'] if dtend else ''}
                profile["education"].append(edu)
        return profile

    def conns(profile):
        """User's network size"""
        cs = soup.findAll('dd', {'class': 'overview-connections'})
        if cs:
            profile['connections'] = cs[0].findAll('strong')[0].text
        return profile

    def summary(profile):
        summary_sec = soup.findAll('div', {'id': 'profile-summary'})
        if summary_sec:
            summary_sec = summary_sec[0]
            summary_content = summary_sec.findAll('p', {"class": " description summary"})
            if summary_content:
                profile.summary = summary_content[0].text
        return profile

    def similar(profile):
        """Returns a list of similar profile urls, if they exist"""
        try:
            ppl = soup.findAll('div', {'id': 'extra'})[0].findAll('a')
            profile['similar'] = list(set([a['href'] for a in ppl]))
        except:
            pass
        return profile

    def techtags(profile):
        """Adds tech tags if they exist"""
        tags = soup.findAll('ol', {'id': 'skills-list'})
        if tags:
            profile['skills'] = [li.text for li in tags[0].findAll('li')]
        return profile

    def interests(profile):
        """Estimate interests based on groups / affiliations"""
        groups = soup.findAll('dd', {'id': 'pubgroups'})
        if groups:
            interests = [i.text for i in groups[0].findAll('li')]
            profile['interests'] = interests
        return profile

    profile = summary(similar(interests(techtags(conns(header(overview(
        education(employment(meta(profile))))))))))

    return profile if not raw else json.dumps(profile)
def ssdut_news_parse(raw):
    '''Parse the raw page src and store all results in a Storage object.
    All strings are unicode.

    result.soup        BeautifulSoup object
    result.raw         raw page src
    result.sha1        sha1 hash of the page
    result.title       title
    result.source      source (来源)
    result.date_str    date in string
    result.date        date object
    result.body        html src of the news body
    result.clean_body  unescaped src of the news body
    result.publisher   publisher (发表人)
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / soup
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(
        ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body) unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")  # u"发表人：(.+)"
    #logging.debug("publisher string = %r " % s)
    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher, e.g. index.php/News/8692.html
    result.publisher = name.strip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')

    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([result.title, result.source,
                                  result.clean_body, result.publisher,
                                  result.sha1])
    return result
def main(args, load_exclude_set, restoreCallback):
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')

    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))

    cuda_init(0, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    data_class = BERTLanguageProcessingBase.load_class('BERT' + args.dataset)
    data_arg = Storage()
    data_arg.file_id = args.datapath
    data_arg.bert_vocab = args.bert_vocab
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    def load_dataset(data_arg, wvpath, embedding_size):
        wv = wordvec_class(wvpath)
        dm = data_class(**data_arg)
        return dm, wv.load(embedding_size, dm.vocab_list)

    if args.cache:
        dm, volatile.wordvec = try_cache(
            load_dataset, (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir,
            data_class.__name__ + "_" + wordvec_class.__name__)
    else:
        dm, volatile.wordvec = load_dataset(data_arg, args.wvpath,
                                            args.embedding_size)

    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = Seq2seq(param)
    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        model.test_process()
    else:
        raise ValueError("Unknown mode")
import os
import logging

from models import Config
from utils import Storage

__all__ = [
    'config',
    'settings',
]

logging.info('module config reloaded')

settings = Storage()

config = Config.get_by_key_name('default')
if not config:
    config = Config(key_name='default')
    config.put()

if not config.app_root:
    settings.app_root = ''
else:
    settings.app_root = '/' + config.app_root.strip('/ ')

settings.home_page = settings.app_root + '/'
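# A quick sanity check of the path normalization above. This is an
# illustration only: `_resolve_app_root` is a hypothetical standalone helper
# that mirrors the branch logic rather than importing the module, and the
# app_root values are made up.
def _resolve_app_root(app_root):
    root = '' if not app_root else '/' + app_root.strip('/ ')
    return root, root + '/'

assert _resolve_app_root('') == ('', '/')
assert _resolve_app_root('myapp/') == ('/myapp', '/myapp/')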
def forward(self, inp, wLinearLayerCallback, h_init=None, mode='max',
            input_callback=None, no_unk=True, top_k=10):
    """
    inp contains: batch_size, dm, embLayer, embedding, sampling_proba,
        max_sent_length, post, post_length, resp_length, [init_h]

    input_callback(i, embedding): if you want to change the word embedding at
        pos i, override this function
    nextStep(embedding, flag): pass the embedding to the RNN and get gru_h;
        flag[i]==1 indicates the i-th sentence has ended
    wLinearLayerCallback(gru_h): take gru_h and return a probability
        distribution over the vocab list

    output: w_o emb length
    """
    nextStep, h_now, context = self.init_forward_all(
        inp.batch_size, inp.post, inp.post_length,
        h_init=inp.get("init_h", None))

    gen = Storage()
    gen.w_pro = []
    batch_size = inp.embedding.shape[1]
    seqlen = inp.embedding.shape[0]
    length = inp.resp_length - 1
    start_id = inp.dm.go_id if no_unk else 0
    attn_weights = []

    first_emb = inp.embLayer(LongTensor([inp.dm.go_id])).repeat(inp.batch_size, 1)
    next_emb = first_emb
    if input_callback:
        inp.embedding = input_callback(inp.embedding)

    for i in range(seqlen):
        proba = random()
        # Sampling
        if proba < inp.sampling_proba:
            now = next_emb
            if input_callback:
                now = input_callback(now)
        # Teacher forcing
        else:
            now = inp.embedding[i]

        if self.gru_input_attn:
            h_now = self.cell_forward(torch.cat([now, context], dim=-1), h_now) \
                * Tensor((length > np.ones(batch_size) * i).astype(float)).unsqueeze(-1)
        else:
            h_now = self.cell_forward(now, h_now) \
                * Tensor((length > np.ones(batch_size) * i).astype(float)).unsqueeze(-1)

        query = self.attn_query(h_now)
        attn_weight = maskedSoftmax((query.unsqueeze(0) * inp.post).sum(-1),
                                    inp.post_length)
        context = (attn_weight.unsqueeze(-1) * inp.post).sum(0)

        gru_h = torch.cat([h_now, context], dim=-1)
        attn_weights.append(attn_weight)

        w = wLinearLayerCallback(gru_h)
        gen.w_pro.append(w)

        # Decoding
        if mode == "max":
            w = torch.argmax(w[:, start_id:], dim=1) + start_id
            next_emb = inp.embLayer(w)
        elif mode == "gumbel" or mode == "sample":
            w_onehot = gumbel_max(w[:, start_id:])
            w = torch.argmax(w_onehot, dim=1) + start_id
            next_emb = torch.sum(
                torch.unsqueeze(w_onehot, -1) * inp.embLayer.weight[start_id:], 1)
        elif mode == "samplek":
            _, index = w[:, start_id:].topk(top_k, dim=-1, largest=True,
                                            sorted=True)  # batch_size, top_k
            mask = torch.zeros_like(w[:, start_id:]).scatter_(-1, index, 1.0)
            w_onehot = gumbel_max_with_mask(w[:, start_id:], mask)
            w = torch.argmax(w_onehot, dim=1) + start_id
            next_emb = torch.sum(
                torch.unsqueeze(w_onehot, -1) * inp.embLayer.weight[start_id:], 1)
        else:
            raise AttributeError("The given mode {} is not recognized.".format(mode))

    gen.w_pro = torch.stack(gen.w_pro, dim=0)

    return gen
        self.seq = [("alternate_ip", IPField)]
        BaseMessage.__init__(self, *args, **kwargs)


MessageTypes = Storage(
    {
        "\x00\x01": ShortResponse,
        "\x00\x02": LoginRequest,
        "\x00\x03": LoginReply,
        "\x00\x04": AlternateServerMessage,
        "\x00\x05": Logout,
        "\x00\x06": KeepAlive,
        "\x00\x07": KeepAliveAck,
        "\x00\x10": ClientInvite,
        "\x00\x11": ServerRejectInvite,
        "\x00\x12": ServerForwardInvite,
        "\x00\x13": ClientInviteAck,
        "\x00\x14": ServerForwardRing,
        "\x00\x15": ClientAnswer,
        "\x00\x20": ClientRTP,
        "\x00\x40": HangupRequest,
        "\x00\x41": HangupRequestAck,
        "\x00\xa0": ServerOverloaded,
    }
)


def keyof(_v):
    for k, v in MessageTypes.iteritems():
        if _v == v or isinstance(_v, v):
            return k  # assumed completion: return the type code of the matching message class
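# Minimal dispatch sketch (assumed usage, not from the source): look up the
# message class for an incoming frame by its two-byte type code. `parse_frame`
# is a hypothetical helper, and passing the payload to the class constructor is
# an assumption about the BaseMessage interface.
def parse_frame(raw_bytes):
    msg_cls = MessageTypes.get(raw_bytes[:2])
    if msg_cls is None:
        raise ValueError("unknown message type %r" % raw_bytes[:2])
    return msg_cls(raw_bytes[2:])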
def run(*argv):
    import argparse
    import time

    from utils import Storage

    parser = argparse.ArgumentParser(description='A hred model')
    args = Storage()

    parser.add_argument(
        '--name',
        type=str,
        default='hred',
        help=
        'The name of your model, used for variable scope and tensorboard, etc. Default: runXXXXXX_XXXXXX (initialized by current time)'
    )
    parser.add_argument(
        '--restore',
        type=str,
        default='best',
        help=
        'Checkpoints name to load. "last" for last checkpoints, "best" for best checkpoints on dev. Attention: "last" and "best" will cause unexpected behaviour when running 2 models in the same dir at the same time. Default: None (don\'t load anything)'
    )
    parser.add_argument('--mode',
                        type=str,
                        default="train",
                        help='"train" or "test". Default: train')
    parser.add_argument('--dataset',
                        type=str,
                        default='MyMemHRED',
                        help='Dataloader class. Default: UbuntuCorpus')
    parser.add_argument('--datapath',
                        type=str,
                        default='../data/film',
                        help='Directory for data set. Default: UbuntuCorpus')
    parser.add_argument('--epoch',
                        type=int,
                        default=20,
                        help="Epoch for training. Default: 100")
    parser.add_argument(
        '--wvclass',
        type=str,
        default='TencentChinese',
        help=
        "Wordvector class, none for not using pretrained wordvec. Default: Glove"
    )
    parser.add_argument(
        '--wvpath',
        type=str,
        default="wordvector/chinese",
        help=
        "Directory for pretrained wordvector. Default: resources://Glove300d")
    parser.add_argument(
        '--out_dir',
        type=str,
        default="./output/film",
        help='Output directory for test output. Default: ./output')
    parser.add_argument(
        '--log_dir',
        type=str,
        default="./tensorboard/film",
        help='Log directory for tensorboard. Default: ./tensorboard')
    parser.add_argument(
        '--model_dir',
        type=str,
        default="./model/film",
        help='Checkpoints directory for model. Default: ./model')
    parser.add_argument(
        '--cache_dir',
        type=str,
        default="./cache/film",
        help='Checkpoints directory for cache. Default: ./cache')
    parser.add_argument('--cpu', action="store_true", help='Use cpu.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Enter debug mode (using ptvsd).')
    parser.add_argument(
        '--cache',
        action='store_true',
        help=
        'Use cache for speeding up load data and wordvec. (It may cause problems when you switch dataset.)'
    )
    cargs = parser.parse_args(argv)

    # Edit the following arguments to bypass the command line.
    args.name = cargs.name or time.strftime("run%Y%m%d_%H%M%S", time.localtime())
    args.restore = cargs.restore
    args.mode = cargs.mode
    args.dataset = cargs.dataset
    args.datapath = cargs.datapath
    args.epochs = cargs.epoch
    args.wvclass = cargs.wvclass
    args.wvpath = cargs.wvpath
    args.out_dir = cargs.out_dir
    args.log_dir = cargs.log_dir
    args.model_dir = cargs.model_dir
    args.cache_dir = cargs.cache_dir
    args.debug = cargs.debug
    args.cache = cargs.cache
    args.cuda = not cargs.cpu

    args.softmax_samples = 512
    args.embedding_size = 200
    args.eh_size = 200
    args.ch_size = 200
    args.dh_size = 200
    args.lr = 1e-3
    args.lr_decay = 0.99
    args.batch_size = 32
    args.grad_clip = 5.0
    args.show_sample = [0]
    args.max_sent_length = 50
    args.checkpoint_steps = 100
    args.checkpoint_max_to_keep = 5

    import random
    random.seed(0)

    from main import main
    main(args)