Example #1
 def processSection( self, section, parent, ignore_errors = False ):
     #log.info(section.text[:128])
     try:
         if section.name == '[document]' and section.P1group:
             section = section.P1group.P1
             pre = section.parent.Title.text
             # now, regardless of how the soup is obtained or passed in, we have a common starting point. 
             # we will reference parent nodes to obtain information outside the P1group. 
         elif section.P1:
             section = section.P1
             pre = ''
         else:
             print section
         id = utf8(section['id'])
         #find references
         refs = section.findParent("[document]").select("Commentary CitationSubRef[SectionRef={}]".format(id))
         meta = ''
         for ref in refs:
             meta += ref.parent.text
         text = section.text
     except KeyboardInterrupt:
         raise
     except:
         if ignore_errors:
             log.info('Ignoring section in {}'.format(parent.cid))
             return None
         else:
             raise
     code = self.getChild( id, parent, depth=3 )
     code.meta = utf8(meta)
     code.setPre(pre)
     code.setText(text)
     return code
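Every example on this page leans on a utf8() helper that the listings never define. A minimal sketch of what they appear to assume (a hypothetical reconstruction; the real util.utf8 in each project may handle more types):

def utf8(value):
    # assumed behaviour, not the actual helper from any of these projects:
    # encode unicode to a UTF-8 byte string, pass everything else through
    if isinstance(value, unicode):
        return value.encode('utf-8')
    return value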
Example #2
def urlencode(data):
    params = []
    for key, value in data.items():
        if value is None:
            continue

        params.append("%s=%s" % (util.utf8(key), quote_plus(util.utf8(value))))

    params_string = '&'.join(params)
    return params_string
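For illustration, a call like the following (using the utf8() sketch above) skips None values and percent-encodes the rest; note that only the values, not the keys, go through quote_plus:

# hypothetical usage
print urlencode({'q': u'caf\xe9', 'page': None})  # -> q=caf%C3%A9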
Example #3
    def request(self, method, url, headers, post_data=None):
        s = util.StringIO.StringIO()
        curl = pycurl.Curl()

        if method == 'get':
            curl.setopt(pycurl.HTTPGET, 1)
        elif method == 'post':
            curl.setopt(pycurl.POST, 1)
            curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            curl.setopt(pycurl.CUSTOMREQUEST, method.upper())

        # pycurl doesn't like unicode URLs
        curl.setopt(pycurl.URL, util.utf8(url))

        curl.setopt(pycurl.WRITEFUNCTION, s.write)
        curl.setopt(pycurl.NOSIGNAL, 1)
        curl.setopt(pycurl.CONNECTTIMEOUT, 30)
        curl.setopt(pycurl.TIMEOUT, 80)
        curl.setopt(pycurl.HTTPHEADER,
                    ['%s: %s' % (k, v) for k, v in headers.iteritems()])
        if self._verify_ssl_certs:
            curl.setopt(
                pycurl.CAINFO,
                os.path.join(os.path.dirname(__file__),
                             'data/ca-certificates.crt'))
        else:
            curl.setopt(pycurl.SSL_VERIFYHOST, False)

        try:
            curl.perform()
        except pycurl.error, e:
            self._handle_request_error(e)
Example #4
 def processSection(self, section, parent):
     if 'identifier' not in section.attrs:
         log.debug('skipping section without identifier...')
     elif section.has_attr('status') and section['status'] in ('omitted', 'repealed'):
         log.debug('Skipping section %s because of status: %s' % (section['identifier'], section['status']))
     else:
         code = self.findOrCreateCode(section['identifier'])
         code.parent = parent
         code.depth = 3
         
         if section.sourcecredit:
             credits = section.select('sourcecredit')
             code.meta = utf8('; '.join(c.text for c in credits))
         
         #rather than iterating through every child node, clear the contents of the nodes
         #we would like to ignore
         cleartags = ('ref',
                      'table',
                      'notes',
                      'note'
                      )
         for tag in cleartags:
             ctags = section.select(tag)
             for ctag in ctags:
                 ctag.clear()
         
         code.setContent(
             section.num.text + section.heading.text,
             section.text,
             ''
         )
         
     self.progress()
Example #5
    def __init__(self, method, url, headers=None, data=None, files=None, debug=False, cookies=None, auto_redirect=False):
        assert url.startswith('http')
        url = util.utf8(url)
        self.url = url
        self.method = method
        self.data = data or {}
        self.files = files
        self.body = None
        self.auto_redirect = auto_redirect

        cookies = cookies or {}

        for name, value in cookies.items():
            cookie_manager.set_cookie(name, value)

        _split_url = httplib.urlsplit(url)
        self.host = _split_url.netloc
        self.uri = _split_url.path

        if _split_url.query:
            self.uri += '?' + _split_url.query

        if _split_url.scheme == 'https':
            Connection = httplib.HTTPSConnection
        else:
            Connection = httplib.HTTPConnection

        self.__conn = Connection(host = self.host)
        self.__conn.set_debuglevel(debug and logging.DEBUG or 0)

        self.headers = headers or {}
        self.generate_header(headers)
Example #6
 def processAct( self, act, root, bar=None, ignore_errors=False ):
     # load the contents of each act first, if it has parts, collect each part for processing
     # if it only has sections, process each part individually
     contentsurl = 'http://www.legislation.gov.uk/ukpga/{}/contents/data.xml'
     soup = self.getsoup( contentsurl.format(act) )
     parts = soup.select('Legislation > Contents > ContentsPart')
     sections = soup.select('Legislation > Contents ContentsItem[ContentRef^=section-]')
     pre = soup.Primary.text if soup.Primary else ''
     if not any( (parts, sections) ):
         # no data available for act - we're outta here
         log.warn('No data available for {}'.format(act))
         return None
     
     act = self.getChild(act, root)
     act.rev = root.rev
     act.setPre( pre )
     
     meta = utf8(soup.title.text) if soup.title else ''
     
     schedurl = ''
     parturls = []
     secturls = []   
     # now we get the schedules - these are included in the post because ... reasons
     # really it's the only place for them unless we modify the code to accommodate
     # different code structural items at the same depth
     if soup.ContentsSchedules:
         schedurl = 'http://www.legislation.gov.uk/ukpga/{}/schedules/data.xml'.format(act.cid) 
     if parts:
         for part in parts:
             parturls.append( '{}/data.xml'.format(part['DocumentURI']) )
     elif sections:
         for section in sections:
             secturls.append( '{}/data.xml'.format(section['DocumentURI']) )
     urls = parturls + secturls
     if schedurl: urls.append(schedurl)
     soups = self.get_bulk_soup( urls, bar=bar )
     if not bar:
         bar = progress.Bar()
     bar.label = act.cid
     bar.expected_size = len(soups)
     bar.show(0)
     if schedurl:
         schedsoup = soups[schedurl]
         act.setPost( schedsoup.Schedules.text )
         bar.show(bar.last_progress + 1)
     if parturls:
         for parturl in parturls:
             partsoup = soups[parturl]
             self.processPart( partsoup, act, ignore_errors=ignore_errors )
             bar.show(bar.last_progress+1)
     elif secturls:
         for secturl in secturls:
             sectionsoup = soups[secturl]
             self.processSection(sectionsoup, act, ignore_errors=ignore_errors)
             bar.show(bar.last_progress+1)
     return act
Example #7
 def __contains__( self, i ):
     if type(i) is unicode:
         i = utf8(i)
     if self.children().find(Code.cid == i).count():
         return True
     else:
         childids = Select(Code.id, Code.parent_id == self.id)
         if Code.find(Code.cid == i, Code.parent_id.is_in(childids) ).count():
             return True
     return False
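A hedged usage sketch of the membership test; code is assumed to be a Code instance obtained elsewhere, and the cid is made up:

if u'section-1' in code:  # hypothetical cid
    print 'found as a child or grandchild'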
Example #8
    def __init__(self, name, id, title, score="id", condition_fields=None,
                    prefix_index_enable=True, exts=None, **kwargs):

        if isinstance(exts, dict):
            kwargs.update(exts)
        self.name  = name
        self.title = utf8(title)
        self.id    = id
        self.score = score
        self.exts  = kwargs
        self.condition_fields = condition_fields if condition_fields and isinstance(condition_fields, list) else []
        self.prefix_index_enable = prefix_index_enable
Example #9
    def __request(self):
        def utf8_headers(headers):
            _headers = {}
            for key, value in headers.items():
                _headers[util.utf8(key)] = util.utf8(value)

            return _headers

        conn = self.__conn
        conn.request(util.utf8(self.method), util.utf8(self.uri), body = util.utf8(self.body),
            headers = utf8_headers(self.headers))

        global LAST_URL
        LAST_URL = self.url

        response = conn.getresponse()
        cookie_manager.set_cookie(response.getheader('set-cookie'))
        if self.auto_redirect and response.getheader('location'):
            return self.redirect(response.getheader('location'))
        else:
            return Response(response)
Example #10
 def findOrCreateCode( self, cid ):
     cid = utf8(cid)
     #log.debug("Looking for code identified as %s" % cid)
     c = store.find(Code, Code.rp == self.rp, Code.cid == cid)
     if c.is_empty():
         c = store.add(Code())
         c.nation = self.nation
         c.rp = self.rp
         c.cid = cid
         c.stored = datetime.now()
     else:
         c = c[0]
     return c
Example #11
    def save(self):
        """docstring for save"""

        if not self.title:
            return False

        data = {
            'name': self.name,
            'id': self.id,
            'title': self.title
        }

        if self.exts:
            data.update(self.exts)

        pipe = util.redis.pipeline()

        # store the raw item data in a hash
        res = pipe.hset(self.name, self.id, json.dumps(data))

        # save set indexes keyed by the split words; they store the ids used by search
        words = self.split_words_for_index(self.title)

        if not words:
            logging.info("no words")
            return False

        for word in words:
            key = mk_sets_key(self.name, word)

            # word index for item id
            pipe.sadd(key, self.id)

        if self.score == 'id':
            self.score = self.id

        # score for search sort
        pipe.set(mk_score_key(self.name, self.id), self.score)

        # add the current id to the indexes created for the condition fields
        for field in self.condition_fields:
            pipe.sadd(mk_condition_key(self.name, field, utf8(data[field])), self.id)

        # commit
        pipe.execute()

        if self.prefix_index_enable:
            self.save_prefix_index()
Example #12
 def processPart( self, part, parent, ignore_errors=False ):
     # initialize id so the error handler below can reference it safely
     id = None
     try:
         part = part.Part
         id = utf8(part['id'])
         sections = part.select('P1group P1[id^=section-]')
         if not sections:
             log.info( 'ERROR: No sections in {} of {}'.format(part['id'] if part.has_attr('id') else '', parent.cid) )
             return None
         
         title = ((part.Number.text if part.Number else '') + ' ' +
                  (part.Title.text if part.Title else '')).replace('\n', '')
     except:
         log.error( 'ERROR: Unable to get all data for {}'.format(id) )
         raise
     code = self.getChild( id, parent )
     code.setPre(title)
     for section in sections:
         self.processSection( section.parent, code, ignore_errors=ignore_errors )
     return code
Example #13
    def handle(self):
        request = self._request
        path = request.path

        handler = None
        args = []

        for spec in self.handlers:
            match = spec.regex.match(path)
            if match:
                handler = spec.handler_class(self, request)
                args = [unicode(urllib.unquote_plus(utf8(m)), "utf-8")
                        for m in match.groups()]
                break

        if not handler:
            handler = ErrorHandler(self, request, status_code=404)

        handler._execute(*args)
        return handler
Example #14
    def __init__(self,
                 name,
                 id,
                 title,
                 score="id",
                 condition_fields=None,
                 prefix_index_enable=True,
                 exts=None,
                 **kwargs):

        if isinstance(exts, dict):
            kwargs.update(exts)

        self.name = name
        self.title = utf8(title)
        self.id = id
        self.score = score
        self.exts = kwargs
        self.condition_fields = condition_fields if condition_fields and isinstance(
            condition_fields, list) else []
        self.prefix_index_enable = prefix_index_enable
Example #15
 def __getitem__( self, i ):
     if type(i) is unicode:
         i = utf8(i)
     child = self.children().find(Code.cid == i).one()
     if not child:
         # first try this
         childids = Select(Code.id, Code.parent_id == self.id)
         child = Code.find(Code.cid == i, Code.parent_id.is_in(childids) ).one()
         if child:
             return child
         for child in self.children():
             c = None
             try:
                 c = child[i]
             except IndexError:
                 pass
             if c:
                 return c
         raise IndexError("Child code not found with cid {}".format(i))
     else:
         return child
Example #16
def utf8_join_flatten(items):
    return "".join(utf8(item) for item in flatten(items))
Example #17
 def processAct( self, actcid, parent ):
     log.info('Processing act: %s' % actcid)
     soup = self.getActSoup(actcid)
     act = self.findOrCreateAct(parent.released, actcid, parent.rev)
     act.parent = parent
     act.cid = actcid
     act.released = parent.released
     act.rev = parent.rev
     act.depth = 1
     act.pre = Text.make( soup.title.text )
     act.text = Text.make( soup.select("section.intro")[0].text )
     act.meta = utf8(soup.select("p#assentedDate")[0].text.rpartition('.')[0])
     doc = soup.select("div.docContents div")[0]
     #so much easier to use the CSS selector
     #sections = [i['id'] for i in doc.select("[id]") if i['id'].startswith('h-')]
     id_prefix = 'h-'
     sections = [i['id'] for i in doc.select('[id^={}]'.format(id_prefix))]
     classAndTag = lambda o: isinstance(o, Tag) and o.has_attr('class')
     
     if sections:
         for secid in progress.bar(sections, label=act.cid):
             sec = self.findOrCreateSection(act.released, secid, act)
             soup = doc.select("[id=%s]" % secid)[0]
             sec.pre = Text.make(soup.text)
             sec.cid = secid
             sec.depth=2
             sec.parent = act
             sec.released = act.released
             sec.rev = act.rev
             stop = False
             sib = soup.nextSibling
             content = ""
             for t in soup.select(".wb-invisible"):
                 t.clear()
             while not stop:
                 if classAndTag(sib):
                     if sib.has_attr('id') and sib['id'].startswith('h-'):
                         stop = True
                     elif sib.name == 'section':
                         stop = True
                     elif any( c in ['Definition', 
                                     'Section',
                                     'MarginalNote', 
                                     'ProvisionList', 
                                     'Part', 
                                     'Subheading', 
                                     'MarginalNoteDefinedTerm',
                                     'ContinuedSectionSubsection',
                                     'Oath'] for c in sib['class']):
                         content += sib.text
                     elif sib['class'][0].startswith('indent'):
                         content += sib.text
                     elif sib['class'][0] == 'HistoricalNote':
                         sec.meta = utf8(sib.text)
                     elif sib['class'][0] in ['PITLink',
                                              'nif']:
                         pass
                     else:
                         log.info('Unhandled case in parsing section %s/%s' % (act.cid, secid))
                         log.debug(sib.name)
                         log.debug(sib.attrs)
                 if not sib or not sib.nextSibling:
                     stop = True
                 if not stop:
                     sib = sib.nextSibling
             sec.text = Text.make(content)
             sec.stored = now()
             schedules = soup.select('div [class=Schedule]')
             post = ''
             for sched in schedules:
                 post += sched.text
             act.post = Text.make(post)
             act.stored = now()
     else:
         #alternative section method
         #for this method we switch to the XML version and pull identifying information
         #out of the code attribute. Anecdotally, this seems to be needed for
         #very small acts
         log.info('Switching to alternate section method')
         soup = self.getActXMLSoup(act.cid)
         sections = soup.select("section[code^=se=]")
         for section in sections:
             secid = None  # so the error handler below can reference it
             try:
                 secid = section['code'].replace('=', '-').replace('"', '')
                 pre = ''
                 pre = section.label.text + ' ' if section.label else pre
                 pre = pre + section.marginalnote.text if section.marginalnote else pre
                 text = section.select_one('text').text
             except:
                 log.warn('ERROR in alternate parsing method for {}.{}'.format(act.cid, secid))
                 raise
             if 'repealed' not in text.lower():
                 sec = self.findOrCreateSection(act.released, secid, act)
                 sec.setPre(pre)
                 sec.setText(text)
                 sec.parent = act
                 sec.depth = 2
                 sec.released = act.released
                 sec.rev = act.rev
                 sec.cid = secid
     act.analyze()
     store.commit()
     return act
Example #18
File: __init__.py Project: Cue/qc
def str(length=None, maxlen=None):
  """An arbitrary string. UTF-8 encoded."""
  while True:
    yield _util.utf8(_str(length, maxlen))
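Since this is an infinite generator, it is consumed with next() or itertools.islice; for example (assuming the qc-style _util.utf8 and _str helpers are importable):

import itertools

# hypothetical usage: take five arbitrary UTF-8 strings of at most 16 chars
for s in itertools.islice(str(maxlen=16), 5):
    print repr(s)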
Example #19
def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
    """docstring for query"""

    conditions = conditions if isinstance(conditions, dict) and conditions else {}

    tm = time.time()
    result = []

    # if there is neither search text nor conditions, return [] right away
    if not text.strip() and not conditions:
        return result

    text = utf8(text.strip())
    splited_words = split_words(text)

    words = [mk_sets_key(name, word) for word in splited_words]

    if conditions:
        condition_keys = [mk_condition_key(name, c, utf8(conditions[c]))
                          for c in conditions]
        # add the condition keys to the keyword set so sinterstore applies them
        words += condition_keys
    else:
        condition_keys = []

    if not words:
        return result

    temp_store_key = "tmpinterstore:%s" % "+".join(words)

    if len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # intersect the word sets and store the result in a temporary key
            util.redis.sinterstore(temp_store_key, words)
            # expire the temporary key automatically after one day
            util.redis.expire(temp_store_key, 86400)
        # pinyin search
        if util.pinyin_match:
            splited_pinyin_words = split_pinyin(text)

            pinyin_words = [mk_sets_key(name, w) for w in splited_pinyin_words]
            pinyin_words += condition_keys
            temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
            temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)
            # intersect the pinyin word sets
            util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
            # merge the Chinese and pinyin search results
            util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key])
            # expire the temporary keys automatically after one day
            util.redis.expire(temp_pinyin_store_key, 86400)
            util.redis.expire(temp_sunion_key, 86400)
            temp_store_key = temp_sunion_key
    else:
        temp_store_key = words[0]

    # pull out the requested number of ids
    ids = util.redis.sort(temp_store_key,
                          start=offset,
                          num=limit,
                          by=mk_score_key(name, "*"),
                          desc=True)
    result = hmget(name, ids, sort_field=sort_field)
    logging.debug("{}:\"{}\" | Time spend:{}s".format(name, text, time.time()-tm))
    return result
Example #20
def complete(name, keyword, limit=10, conditions=None):
    """docstring for complete"""

    conditions = conditions if isinstance(conditions, dict) and conditions else {}

    if not keyword and not conditions:
        logging.debug("no word and conditions")
        return []

    keyword = utf8(keyword.strip())
    prefix_matchs = []
    
    # This is not random, try to get replies < MTU size
    rangelen = util.complete_max_length
    prefix = keyword.lower()
    key = mk_complete_key(name)

    start = util.redis.zrank(key, prefix)

    # redis zrank returns 0 for the first member, so compare against None
    if start is not None:
        count = limit
        max_range = start+(rangelen*limit)-1
        entries = util.redis.zrange(key, start, max_range)
        
        while len(prefix_matchs) <= count:
            
            start += rangelen
            if not entries or len(entries) == 0:
                break
            
            for entry in entries:
                minlen = min(len(entry), len(prefix))

                if entry[0:minlen] != prefix[0:minlen]:
                    count = len(prefix_matchs)
                    break

                if entry[-1] == "*" and len(prefix_matchs) != count:

                    match = entry[:-1]
                    if match not in prefix_matchs:
                        prefix_matchs.append(match)
          
            entries = entries[start:max_range]

    # build the set key names for the matched words
    words = []
    for word in prefix_matchs:
        words.append(mk_sets_key(name, word))

    # build the condition keys; unlike query() they are not merged into words,
    # because complete() takes the union of the word sets while the
    # condition_keys must be intersected with them
    condition_keys = []
    if conditions:
        for c in conditions:
            condition_keys.append(mk_condition_key(name, c, utf8(conditions[c])))
    
    # search by the matched words
    temp_store_key = "tmpsunionstore:%s" % "+".join(words)
    if len(words) == 0:
        logging.info("no words")
    elif len(words) > 1:
        if not util.redis.exists(temp_store_key):
            
            # union the word sets and store the result in a temporary key
            util.redis.sunionstore(temp_store_key, words)
            
            # expire the temporary key automatically after one day
            util.redis.expire(temp_store_key, 86400)
        # pull out the requested number of ids
    else:
        temp_store_key = words[0]

    # if there are conditions, intersect them in here as well
    if condition_keys:
        if not words:
            condition_keys.append(temp_store_key)
            
        temp_store_key = "tmpsinterstore:%s" % "+".join(condition_keys)
        if not util.redis.exists(temp_store_key):
            util.redis.sinterstore(temp_store_key, condition_keys)
            util.redis.expire(temp_store_key, 86400)
     
    ids = util.redis.sort(temp_store_key,
                    start = 0,
                    num = limit,
                    by = mk_score_key(name, "*"),
                    desc = True)
    if not ids:
        return []
        
    return util.hmget(name, ids)
Example #21
 def scrape (self):
     if not self.rp:
         r = Cache.get(self.nation.cfg['entrypoint'])
         soup = BS(str(r))
         #find current release point
         log.info("No release point specified, retreiving latest...")
         # this failed fantastically - we'll get the RP from the zipurl
         #self.rp = utf8(soup.findAll('h3', attrs={'class': 'releasepointinformation'})[0].text.split()[-1])
         log.info("Found release point %s" % self.rp)
         #find the download url
         self.zipurl = self.nation.cfg['entrypoint'].rpartition('/')[0] + '/' + soup.findAll('a', title='All USC Titles in XML')[0]['href']
         # new way to set the rp using the zipurl's filename
         self.rp = utf8( self.zipurl.rpartition('@')[-1].partition('.')[0] )
     else:
         log.info('Using specified release point %s...' % self.rp)
         # don't actually need this
         # rpurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/usc-rp@%s.htm' % (tuple(self.rp.split()) + (self.rp,))
         self.zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/xml_uscAll@%s.zip'  % (tuple(self.rp.split('-')) + (self.rp,))
     
     log.debug("Using zipurl: %s" % self.zipurl)
     
     class FileNotThere (Exception): pass
     class XMLNotThere( Exception ): pass
     class AllGood( Exception ): pass
     
     filename = self.zipurl.rpartition('/')[-1]
     xmldir = self._workdir + os.sep + 'xml' + os.sep
     
     # check to see if we have xml that works
     # if we don't, check to see if we have a zip file
     # if we don't have the zip, download it
     # if we do, extract it
     # check the xml again; if it's good, proceed
     # if it's not, error out
     
     try:
         assert os.path.exists(xmldir + 'usc01.xml')
         soup = BS(open(xmldir + os.sep + 'usc01.xml', 'r').read())
         xmlrp = soup.find('docpublicationname').text.split('@')[-1]
         #old way to get rp, the new way is much better
         # xmlrp = soup.title.first("note", topic="miscellaneous").text.split()[-1]
         if xmlrp == self.rp:
             raise AllGood
         else:
             raise XMLNotThere
     except (XMLNotThere, AssertionError):
         # delete directory if it exists
         if os.path.exists(xmldir):
             shutil.rmtree(xmldir)
         # if there's no xml file, download it
         if not os.path.exists(self._workdir + os.sep + filename):
             log.info('No zipfile found for release point, downloading...')
             self.downloadFile(self.zipurl, filename)
         # now we should have a zipfile and no existing xmldir
         log.info('Extracting file %s...' % filename)
         zf = ZipFile(self._workdir + os.sep + filename, 'r')
         # older release points do not have an interior xml/ dir
         if not all( [ n.startswith('xml/') for n in zf.namelist()]):
             zf.extractall(xmldir)
         else:
             zf.extractall(self._workdir)
         # double check the xml now...
         assert os.path.exists(xmldir + 'usc01.xml')
         # it may be problematic to rely on the RP information in the XML documents provided
         # rp 113-21 (the earliest presently available) does not include this in the 
         # docpublicationname meta tag
         #soup = BS(open(xmldir + os.sep + 'usc01.xml', 'r').read())
         #xmlrp = soup.find('docpublicationname').text.split('@')[-1]
         #if xmlrp != self.rp:
         #    raise XMLNotThere('XML did not check out after extraction.')
     except AllGood:
         pass
     
     log.info('All checks passed...')
     xf = os.listdir(xmldir)
     root = self.findOrCreateRoot()
     xf = [xmldir + f for f in xf if f.endswith('.xml')]
     xf.sort()
     log.info("Processing %i files..." % len(xf))
     self.bar = progress.Bar(label='US', expected_size=1000*len(xf))
     self.progress( i=len(xf) )
     for fn in xf:
         self.processFile(fn, root)
         self.progress(rollup=1000)
     log.info('Analyzing code...')
     self.progress(label="Analyzing")
     root.analyze(commit=True, bar=self.bar)
     store.commit()
     log.info('Scrape completed.')
Example #22
def complete(name, keyword, limit=10, conditions=None):
    """complete: prefix match search
        keyword
        limit: max match count"""

    conditions = conditions if isinstance(conditions, dict) and conditions else {}

    if not keyword and not conditions:
        logging.debug("no word and conditions")
        return []

    keyword = utf8(keyword.strip())
    prefix_matchs = []

    # This is not random, try to get replies < MTU size
    rangelen = util.complete_max_length
    prefix = keyword.lower()
    key = mk_complete_key(name)

    start = util.redis.zrank(key, prefix)

    # redis zrank returns 0 for the first member, so compare against None
    if start is not None:
        count = limit
        max_range = start + (rangelen * limit) - 1
        entries = util.redis.zrange(key, start, max_range)
        while len(prefix_matchs) <= count:
            start += rangelen
            if not entries or len(entries) == 0:
                break
            #entries are sorted, so once an entry stops matching the prefix we break
            for entry in entries:
                minlen = min(len(entry), len(prefix))

                #this entry no longer matches the prefix
                if entry[0:minlen] != prefix[0:minlen]:
                    count = len(prefix_matchs)
                    break

                # found matched entry
                if entry[-1] == "*" and len(prefix_matchs) != count:
                    match = entry[:-1]
                    if match not in prefix_matchs:
                        prefix_matchs.append(match)
            entries = entries[start:max_range]

    # build the set key names for the matched words
    words = [mk_sets_key(name, word) for word in prefix_matchs]

    # build the condition keys; unlike query() they are not merged into words,
    # because complete() unions the word sets while condition_keys must be
    # intersected with them
    condition_keys = [mk_condition_key(name, c, utf8(conditions[c]))
                      for c in conditions]
    # search by the matched words
    temp_store_key = "tmpsunionstore:%s" % "+".join(words)
    if len(words) == 0:
        logging.info("no words")
    elif len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # union the word sets and store the result in a temporary key
            util.redis.sunionstore(temp_store_key, words)
            # expire the temporary key automatically after one day
            util.redis.expire(temp_store_key, 86400)
        # pull out the requested number of ids
    else:
        temp_store_key = words[0]

    # if there are conditions, intersect them in here as well
    if condition_keys:
        if not words:
            condition_keys.append(temp_store_key)
        temp_store_key = "tmpsinterstore:%s" % "+".join(condition_keys)
        if not util.redis.exists(temp_store_key):
            util.redis.sinterstore(temp_store_key, condition_keys)
            util.redis.expire(temp_store_key, 86400)

    ids = util.redis.sort(temp_store_key,
                          start=0,
                          num=limit,
                          by=mk_score_key(name, "*"),
                          desc=True)
    if not ids:
        return []
    return hmget(name, ids)
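A hedged usage sketch (the index name and keyword are made up; it assumes items were previously indexed with save() and save_prefix_index() as in the earlier examples):

# hypothetical: prefix-complete items in a 'books' index
for item in complete('books', u'redi', limit=5):
    print item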
Example #23
 def write(self, chunk):
     if isinstance(chunk, dict):
         chunk = json_encode(chunk)
         self.set_header("Content-Type", "application/json; charset=UTF-8")
     chunk = utf8(chunk)
     self._write_buffer.append(chunk)
Example #24
 def set_header(self, key, value):
     self.headers[util.utf8(key)] = util.utf8(value)
Example #25
def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
    """docstring for query"""

    conditions = conditions if isinstance(conditions, dict) and conditions else {}

    tm = time.time()
    result = []

    # if there is neither search text nor conditions, return [] right away
    if not text.strip() and not conditions:
        return result

    text = utf8(text.strip())
    splited_words = split_words(text)

    words = []
    for word in splited_words:
        words.append(mk_sets_key(name, word))

    condition_keys = []
    if conditions:
        for c in conditions:
            condition_keys.append(mk_condition_key(name, c, utf8(conditions[c])))
            
        # add the condition keys to the keyword set so sinterstore applies them
        words += condition_keys
    
    if not words:
        return result

    temp_store_key = "tmpinterstore:%s" % "+".join(words)
    
    if len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # intersect the word sets and store the result in a temporary key
            util.redis.sinterstore(temp_store_key, words)
            
            # expire the temporary key automatically after one day
            util.redis.expire(temp_store_key, 86400)
        
        # pinyin search
        if util.pinyin_match:
            splited_pinyin_words = split_pinyin(text)

            pinyin_words = []
            for w in splited_pinyin_words:
                pinyin_words.append(mk_sets_key(name, w))
                
            pinyin_words += condition_keys
            
            temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
            temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)
            
            # intersect the pinyin word sets
            util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
            
            # merge the Chinese and pinyin search results
            util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key])
            
            # expire the temporary keys automatically after one day
            util.redis.expire(temp_pinyin_store_key, 86400)
            util.redis.expire(temp_sunion_key, 86400)
            
            temp_store_key = temp_sunion_key
    else:
        temp_store_key = words[0]

    # pull out the requested number of ids
    ids = util.redis.sort(temp_store_key,
                    start = offset,
                    num = limit,
                    by = mk_score_key(name, "*"),
                    desc = True)

    result = util.hmget(name, ids, sort_field=sort_field)
    logging.debug("%s:\"%s\" | Time spend:%ss" % (name, text, time.time()-tm))
    return result
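To close, a hedged usage sketch of query(); the index, keyword, and condition names are made up and assume matching items were indexed with a 'category' condition field:

# hypothetical full-text query, filtered by an indexed condition field
results = query('books', u'redis cookbook', limit=10,
                conditions={'category': 'tech'})
for item in results:
    print item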