Example #1
def __init__(self, indexdir):
    # Method of the indexing class; requires (not shown in this excerpt):
    #   from whoosh import index
    #   from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD
    #   from whoosh.index import create_in
    # Open the existing Whoosh index, or create it with the job schema.
    exists = index.exists_in(indexdir)
    if exists:
        self.ix = index.open_dir(indexdir)
    else:
        schema = Schema(title=TEXT(stored=True),
                        id=NUMERIC(unique=True, stored=True),
                        orgid=NUMERIC(stored=True),
                        ishunterjob=NUMERIC(stored=True),
                        tags=KEYWORD(stored=True))
        self.ix = create_in(indexdir, schema)
    self.mac_address = self.get_mac_address()
    self.conn = pymongo.Connection(config.MONGO_CONN)
    self.tagsParser = Trie(config.SKILL_FILE)
    self.cache = LRUCache(1024)
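Because id is declared unique=True in the schema, Whoosh's update_document behaves as an upsert: writing a document with an existing id replaces the old one. A minimal sketch, assuming idx is an instance of the class above (the field values are hypothetical):

with idx.ix.writer() as writer:  # the writer context manager commits on exit
    # replaces any existing document whose unique id equals 42
    writer.update_document(id=42, orgid=7, title=u'python engineer',
                           tags=u'python whoosh', ishunterjob=0)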
Example #2
class update_task(Process):
    """Worker process that drains a task queue and applies index updates."""

    def __init__(self, task_queue, ix):
        Process.__init__(self)
        self.task_queue = task_queue
        self.ix = ix
        self.tagsParser = Trie(config.SKILL_FILE)

    def cut(self, value):
        # Lowercase, strip '&nbsp' remnants, then extract distinct skill
        # terms with the Aho-Corasick trie (deduplicated via a dict).
        value = value.lower().replace('&nbsp', '')
        value = value.encode('UTF-8')
        terms = self.tagsParser.parse(value)
        v = {}
        for i in terms:
            v[i[0]] = i[1]
        return v.values()

    def update_doc(self, jdata):
        # `id` is unique in the schema, so update_document upserts each job.
        writer = self.ix.writer()
        for j in jdata['fields']:
            tags = self.cut(j['jobname'] + ' ' + j['description'])
            jobid = j['jobid']
            orgid = j['orgid']
            jobname = unicode(j['jobname'])
            tags = ' '.join(tags).decode('UTF-8')
            ishunterjob = j['ishunterjob']

            writer.update_document(id=jobid,
                                   orgid=orgid,
                                   title=jobname,
                                   tags=tags,
                                   ishunterjob=ishunterjob)
            logger.info('update doc :' + str(jobid))

        writer.commit()

    def del_doc(self, id):
        self.ix.delete_by_term('id', id)
        logger.info('del doc :' + str(id))

    def run(self):
        logger.info('starting async task queue')
        while True:
            # tasks are (code, payload) tuples: 1 = update batch, 2 = delete id
            task = self.task_queue.get(1)
            if task[0] == 1:
                jdata = task[1]
                self.update_doc(jdata)
            elif task[0] == 2:
                id = task[1]
                self.del_doc(id)
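run() dispatches on the first element of each queued tuple: code 1 carries a JSON batch for update_doc, code 2 carries a document id for del_doc. A minimal producer sketch, assuming an already-opened Whoosh index ix and this project's config module (the sample job values are hypothetical):

from multiprocessing import Queue

task_queue = Queue()
worker = update_task(task_queue, ix)
worker.start()

# code 1: update a batch of job documents
task_queue.put((1, {'fields': [{'jobid': 42, 'orgid': 7,
                                'jobname': 'python engineer',
                                'description': 'whoosh indexing',
                                'ishunterjob': 0}]}))
# code 2: delete the document whose id is 42
task_queue.put((2, 42))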
Example #3
class ADIndex:
    """Whoosh-backed job index with MongoDB-assisted URL search and an LRU cache."""

    def get_mac_address(self):
        # Hex MAC address of this host, used to tag responses with a server id.
        node = uuid.getnode()
        mac = uuid.UUID(int=node).hex[-12:]
        return mac

    def __init__(self, indexdir):
        exists = index.exists_in(indexdir)
        if exists:
            self.ix = index.open_dir(indexdir)
        else:
            schema = Schema(title=TEXT(stored=True),
                            id=NUMERIC(unique=True, stored=True),
                            orgid=NUMERIC(stored=True),
                            ishunterjob=NUMERIC(stored=True),
                            tags=KEYWORD(stored=True))
            self.ix = create_in(indexdir, schema)
        self.mac_address = self.get_mac_address()
        self.conn = pymongo.Connection(config.MONGO_CONN)
        self.tagsParser = Trie(config.SKILL_FILE)
        self.cache = LRUCache(1024)

    def add_doc(self, jobs):
        writer = self.ix.writer()
        rep = []
        for j in jobs:
            writer.update_document(id=j[0],
                                   orgid=j[1],
                                   title=j[2],
                                   tags=j[3],
                                   ishunterjob=j[4])
            rep.append('add doc :' + str(j[0]))
        writer.commit()
        return rep

    def del_doc(self, id):
        self.ix.delete_by_term('id', id)
        return ['del doc :' + str(id) + '\r\n']

    def find_by_query(self, q, limit):
        jobs = self.ix.searcher().search(q, limit=limit)
        return jobs

    def find_unique_orgid(self, q, limit):
        facet = sorting.FieldFacet("id", reverse=True)
        jobs = self.ix.searcher().search(q,
                                         collapse="orgid",
                                         sortedby=facet,
                                         limit=limit)
        return jobs

    def find_all(self, limit):
        qp = QueryParser("id", schema=self.ix.schema)
        q = qp.parse(u'*')
        return self.find_by_query(q, limit)

    def find_all_unique_orgid(self, limit):
        qp = QueryParser("id", schema=self.ix.schema)
        q = qp.parse(u'*')
        return self.find_unique_orgid(q, limit)

    def hunter_job(self, limit):
        qp = QueryParser("ishunterjob", schema=self.ix.schema)
        q = qp.parse(u'1')
        return self.find_unique_orgid(q, limit)

    def find(self, query, limit):
        query = query.strip()
        if len(query) == 0:
            query = u'*'
        searcher = self.ix.searcher()
        qp = QueryParser("tags", schema=self.ix.schema)
        q = qp.parse(query)
        return searcher.search(q, limit=limit)


#state 0  insert link
#state 1  insert body text
#state 2  insert tags

    def search_by_url(self, url, limit):
        pagetags = self.conn.pongo.pagetags
        pageurls = self.conn.pongo.pageurls
        url = unicode(url)
        one = pagetags.find_one({"_id": url}, {"tags": 1})
        if one:
            tags = one["tags"]
            return self.find(tags, limit)
        else:
            pageurls.insert({"_id": url})
            #return ['insert :'+url]
            return None

    def jobs2json(self, jobs):
        rep = {}
        rep["server"] = self.mac_address
        rep["state"] = True
        response = {}
        rep['response'] = response
        if jobs is None:
            response['totalCount'] = 0
            return rep
        response['totalCount'] = len(jobs)
        #response['usedTime']=jobs.runtime
        items = []
        for j in jobs:
            job = {}
            job['jobId'] = j['id']
            job['orgid'] = j['orgid']
            job['jobTitle'] = j['title']
            items.append(job)

        response['items'] = items
        return rep

    def search(self, query, limit, hunterjob, uniqueorgid):
        if hunterjob:
            return self.hunter_job(limit)
        elif uniqueorgid:
            return self.find_all_unique_orgid(limit)
        else:
            return self.find(query, limit)

    def cut(self, value):
        value = value.lower().replace('&nbsp', '')
        value = value.encode('UTF-8')
        terms = self.tagsParser.parse(value)
        v = {}
        for i in terms:
            v[i[0]] = i[1]
        return v.values()

    def get_cache(self, k):
        if k in self.cache:
            return self.cache[k]
        else:
            return None

    def add_cache(self, k, rep):
        self.cache[k] = rep

    def dispatch_hander(self, worker, frames):
        header = frames[2]
        data = frames[3]
        mkey = ''
        # serve search results from the cache when possible
        if header == 'search':
            m = md5()
            m.update(data)
            mkey = m.hexdigest()
            rep = self.get_cache(mkey)
            if rep is not None:
                rep = json.dumps(rep)
                msg = [frames[0], frames[1], rep.encode('UTF-8')]
                worker.send_multipart(msg)
                logger.info('search get_cache:' + mkey)
                return
        # cache-miss path
        jdata = json.loads(data.replace("''", "0"), strict=False)  # patch malformed '' values before parsing
        action = jdata["action"]
        rep = 'request err :' + data
        if header == 'update' and action == "updateDoc":
            jobs = []
            for j in jdata['fields']:
                tags = self.cut(j['jobname'] + ' ' + j['description'])
                jobid = j['jobid']
                orgid = j['orgid']
                jobname = unicode(j['jobname'])
                tags = ' '.join(tags).decode('UTF-8')
                ishunterjob = j['ishunterjob']
                jobs.append((jobid, orgid, jobname, tags, ishunterjob))

            rep = self.add_doc(jobs)
        #remove
        #{"action":"removeDoc","name":"job","keyId":"64983"}
        if header == 'remove' and action == "removeDoc":
            keyid = jdata["keyId"]
            rep = self.del_doc(int(keyid))
        if header == 'search':
            size = jdata['output']["size"]
            if action == 'adv':
                referurl = jdata['q']["referurl"]
                if referurl in self.cache:
                    rep = self.cache[referurl]
                else:
                    rep = self.jobs2json(self.search_by_url(referurl, size))
                    self.cache[referurl] = rep
                logger.info('adv:' + referurl)
            elif action == 'searchJob':
                keyword = ''
                uniqueorgid = False
                hunterjob = False
                if 'filter' in jdata:
                    f = jdata['filter']
                    if 'uniqueKey' in f:
                        uniqueorgid = True
                    if 'jobflag' in f:
                        hunterjob = True
                if 'q' in jdata and 'keyword' in jdata['q']:
                    keyword = jdata['q']["keyword"]
                rep = self.jobs2json(
                    self.search(keyword, size, hunterjob, uniqueorgid))
                logger.info('searchJob:keyword[' + keyword + ']')
            elif action == 'all':  # all jobs
                rep = self.jobs2json(self.find_all(size))
                logger.info('search all')
            elif action == 'uniqueorgid':  # all jobs deduplicated by orgid
                rep = self.jobs2json(self.find_all_unique_orgid(size))
                logger.info('search uniqueorgid')
            elif action == 'hunterjob':  # latest headhunter jobs
                rep = self.jobs2json(self.hunter_job(size))
                logger.info('search hunterjob')
            # cache the search result
            self.add_cache(mkey, rep)
        rep = json.dumps(rep)
        msg = [frames[0], frames[1], rep.encode('UTF-8')]
        worker.send_multipart(msg)
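A minimal end-to-end sketch of ADIndex: add_doc takes (id, orgid, title, tags, ishunterjob) tuples, and find runs a keyword query against the tags field. It assumes config.MONGO_CONN points at a reachable MongoDB, since the constructor opens a connection; the sample values are hypothetical:

import os

if not os.path.exists('./indexdir'):
    os.mkdir('./indexdir')  # Whoosh's create_in needs an existing directory
idx = ADIndex('./indexdir')
idx.add_doc([(1, 10, u'python engineer', u'python whoosh', 0),
             (2, 11, u'java engineer', u'java spring', 1)])
for hit in idx.find(u'python', 10):
    print hit['id'], hit['title']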
Example #4
#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack: force UTF-8 as the default codec

import pymongo
from ac_trie import Trie
import pongoclient
import json

tagsParser = Trie('./dict/skill.txt')

# legacy pymongo (pre-3.0) connection to the tags database
conn = pymongo.Connection(host='192.168.4.216', port=19753)
article = conn.tags.article


def cut(value):
    # Lowercase, strip '&nbsp' remnants, then extract distinct skill terms.
    value = value.lower().replace('&nbsp', '')
    value = value.encode('UTF-8')
    terms = tagsParser.parse(value)
    v = {}
    for i in terms:
        v[i[0]] = i[1]
    return v.values()


def ad_query(k):
    # Build a searchJob request for keyword k; naive string concatenation,
    # so json.dumps would be safer if k can contain quotes.
    request = ('{ "action" : "searchJob" , "q" : { "keyword" : "' + k + '"} , '
               '"sort" : 1 , "output" : { "format" : "json" , "offset" : 0 , "size" : 10}}')
    return pongoclient.send_request('search', request)
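A minimal usage sketch of the helpers above, assuming the pongoclient service from this project is reachable (the sample text is hypothetical):

if __name__ == '__main__':
    print cut(u'senior python developer, mysql and linux')
    print ad_query('python')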
