コード例 #1
0
ファイル: getjobinfo.py プロジェクト: folagit/resumatcher
def main():
    """Fetch job detail pages for every job in ``job_lang_top_corps``.

    Walks the collection page by page (resuming at page 149) and hands
    each page of job documents to an IndeedPageGetter, which stores the
    scraped details in ``jobinfo_lang_top_corps``.
    """
    src_name = "job_lang_top_corps"
    info_name = "jobinfo_lang_top_corps"

    db_client = DbClient('localhost', 27017, "jobaly")
    src_coll = db_client.getCollection(src_name)
    info_coll = db_client.getCollection(info_name)
    page_getter = IndeedPageGetter(info_coll)

    page_size = 10
    max_page = 10000
    page_no = 149  # resume point — presumably from an interrupted earlier run
    while page_no <= max_page:
        current = db_client.getPage(src_coll, None, None, page_size, page_no)
        page_getter.processPage(current, page_no)
        # A short page means we have reached the end of the collection.
        if current.count(with_limit_and_skip=True) < page_size:
            break
        page_no += 1
コード例 #2
0
ファイル: collutils.py プロジェクト: folagit/resumatcher
def main():
    """Copy the first 1000 documents of one day's job-info collection
    from the ``jobaly_daily`` database into ``jobaly``."""
    target_client = DbClient('localhost', 27017, "jobaly")
    src_client = DbClient('localhost', 27017, "jobaly_daily")

    src_coll = src_client.getCollection("daily_job_info_2014-06-16")
    target_coll = target_client.getCollection("job1000")

    copyCollection(src_coll, target_coll, 1000)
コード例 #3
0
def main():
    cities = [
        'MoutainView, CA', 'Seattle, WA', 'San Diego, CA', 'San Francisco, CA',
        'Austin, TX', 'San Jose, CA', 'Portland, OR', ' New York, NY',
        'Houston, TX', 'Boston, MA', 'Davis, CA', 'Palo Alto, CA',
        ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA'
    ]

    cities = [
        'Austin, TX', 'San Jose, CA', 'Portland, OR', ' New York, NY',
        'Houston, TX', 'Boston, MA', 'Davis, CA', 'Palo Alto, CA',
        ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA'
    ]

    _pageSize = 25
    _fromage = 30
    _location = 94040
    _radius = 25
    _query = "software engineer"

    collectionName = "job_se_10city"
    indeedClient = ApiClient(_query, _pageSize, _fromage, _location, _radius)
    # client.getPage(0)
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    for city in cities:
        print "-----prcoss city %s -------" % city
        indeedClient.processCity(collection, city)
コード例 #4
0
ファイル: joblist_getter.py プロジェクト: pkushiqiang/jobaly
def getByCities():

    cities = [
        "Austin, TX",
        "San Jose, CA",
        "Portland, OR",
        " New York, NY",
        "Houston, TX",
        "Boston, MA",
        "Davis, CA",
        "Palo Alto, CA",
        " Irvine, CA",
        "Olathe, KS",
        "Columbia, MD",
        " Atlanta, GA",
    ]

    param = {"q": "software engineer", "fromage": "30"}

    collectionName = "job_se_10city"
    indeedClient = ApiClient(param)
    # client.getPage(0)
    dbClient = DbClient("localhost", 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    for city in cities:
        print "-----prcoss city %s -------" % city
        indeedClient.processQuery(collection, "l", city)
コード例 #5
0
ファイル: multiget.py プロジェクト: folagit/resumatcher
def main():
    """Scrape job-detail pages for every job in ``job_lang_top_corps``
    using a pool of worker threads.

    Pages of job documents are read from MongoDB and pushed onto a
    queue; JobGetter daemon threads consume them and write the scraped
    details into ``jobinfo_lang_top_corps``.

    NOTE(review): the original assigned the ``job_se_10city`` /
    ``jobinfo_se_10city`` collection names and immediately overwrote
    them; the dead assignments have been removed.
    """
    collectionName = "job_lang_top_corps"
    infoCollectionName = "jobinfo_lang_top_corps"

    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)

    pageSize = 20
    pageNo = 1
    pageNum = 10000        # hard upper bound on pages to process
    find_sort = None
    find_spec = None

    # Start the daemon worker pool before producing any work.
    threadNum = 20
    queue = Queue.Queue()
    for _ in range(threadNum):
        worker = JobGetter(queue, infoCollection)
        worker.setDaemon(True)
        worker.start()

    has_more = True
    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
        queue.put((page, pageNo))
        pageNo += 1
        # A short page means the collection is exhausted.
        if page.count(with_limit_and_skip=True) < pageSize:
            has_more = False

    queue.join()  # block until the workers have drained the queue
コード例 #6
0
ファイル: multiget.py プロジェクト: folagit/resumatcher
def main():
    """Multi-threaded scrape of job-detail pages for the jobs stored in
    ``job_lang_top_corps``; results go to ``jobinfo_lang_top_corps``.

    NOTE(review): removed the dead ``job_se_10city`` assignments that
    the original immediately overwrote, and normalized the mixed
    5/8-space indentation to the conventional 4 spaces.
    """
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection("job_lang_top_corps")
    infoCollection = dbClient.getCollection("jobinfo_lang_top_corps")

    pageSize = 20
    pageNo = 1
    has_more = True
    pageNum = 10000
    find_sort = None
    find_spec = None

    # Spin up 20 daemon consumers before queueing any pages.
    queue = Queue.Queue()
    for _ in range(20):
        t = JobGetter(queue, infoCollection)
        t.setDaemon(True)
        t.start()

    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
        queue.put((page, pageNo))
        pageNo += 1
        if page.count(with_limit_and_skip=True) < pageSize:
            has_more = False  # short page: no more jobs to fetch

    queue.join()  # wait for the workers to finish all queued pages
コード例 #7
0
ファイル: getjobinfo.py プロジェクト: folagit/resumatcher
def main():
    """Sequentially scrape job-detail pages for ``job_lang_top_corps``,
    resuming from page 149, into ``jobinfo_lang_top_corps``."""
    db = DbClient('localhost', 27017, "jobaly")
    jobs = db.getCollection("job_lang_top_corps")
    details = db.getCollection("jobinfo_lang_top_corps")
    page_getter = IndeedPageGetter(details)

    page_size = 10
    page_no = 149   # resume point — presumably from an earlier run
    last_page = 10000
    while page_no <= last_page:
        current = db.getPage(jobs, None, None, page_size, page_no)
        page_getter.processPage(current, page_no)
        # Fewer documents than a full page means the collection is done.
        if current.count(with_limit_and_skip=True) < page_size:
            break
        page_no += 1
コード例 #8
0
def main():
    pageSize = 100
    startPageNo = 13
    endPageNo = 10000
    dbClient = DbClient('localhost', 27017, "SimilarQuestion")
    collection = dbClient.getCollection("question_test")
    
    questionGetter = QuestionGetter(pageSize,"python")
    for  pg in range(startPageNo, endPageNo):
        print "--get page at : %d -----" % pg
        items = questionGetter.getPage(pg)
        if items == "NO_ITEMS":
            break
        print "--page at : %d have %d questions--" % (pg, len(items))
        questionGetter.savePage(collection,items)   
        time.sleep(10)
コード例 #9
0
def getByCities():

    cities = [
        'Austin, TX', 'San Jose, CA', 'Portland, OR', ' New York, NY',
        'Houston, TX', 'Boston, MA', 'Davis, CA', 'Palo Alto, CA',
        ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA'
    ]

    param = {"q": "software engineer", "fromage": "30"}

    collectionName = "job_se_10city"
    indeedClient = ApiClient(param)
    # client.getPage(0)
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    for city in cities:
        print "-----prcoss city %s -------" % city
        indeedClient.processQuery(collection, "l", city)
コード例 #10
0
def getByCorps():
    print " --- get job by companies---"
    collectionName = "job_se_top_corps"
    param = {"q": "software engineer", "fromage": "30"}
    indeedClient = ApiClient(param)
    # client.getPage(0)
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    corps = []
    fileName = "topcorps.txt"
    with open(fileName, 'r') as the_file:
        for line in the_file:
            word = line.strip()
            if not len(word) == 0:
                corps.append(word)

    for corp in corps:
        q = indeedClient.buildQuery("software engineer", {"company": corp})
        print "-----prcoss corp %s -------" % corp
        indeedClient.processQuery(collection, "q", q)
コード例 #11
0
ファイル: joblist_getter.py プロジェクト: pkushiqiang/jobaly
def getByCorps():
    print " --- get job by companies---"
    collectionName = "job_se_top_corps"
    param = {"q": "software engineer", "fromage": "30"}
    indeedClient = ApiClient(param)
    # client.getPage(0)
    dbClient = DbClient("localhost", 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    corps = []
    fileName = "topcorps.txt"
    with open(fileName, "r") as the_file:
        for line in the_file:
            word = line.strip()
            if not len(word) == 0:
                corps.append(word)

    for corp in corps:
        q = indeedClient.buildQuery("software engineer", {"company": corp})
        print "-----prcoss corp %s -------" % corp
        indeedClient.processQuery(collection, "q", q)
コード例 #12
0
ファイル: getjob_lang.py プロジェクト: folagit/resumatcher
def getByLang():
    
    print " --- get job by language and companies---"
    collectionName = "job_lang_top_corps"
    param = { "q" : "software engineer", 
               "fromage" : "30"    }    
               
    lang_names = utils.loadArrayFromFile("pro_langs.txt")
    corps_names = utils.loadArrayFromFile("topcorps.txt")
    
    indeedClient= ApiClient( param )
    # client.getPage(0)
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    
    
    for corp in corps_names:
       for lang in lang_names:
           q = indeedClient.buildQuery(lang, {"company": corp })
           print "-----prcoss corp %s with language %s -------" % (corp, lang) 
           indeedClient.processQuery(collection, "q", q)
コード例 #13
0
def main():
    pageSize = 100
    startPageNo = 1
    endPageNo = 10000
    dbClient = DbClient('localhost', 27017, "SimilarQuestion")
    collection = dbClient.getCollection("english_questions")
    
    questionGetter = QuestionGetter(pageSize,"")
    for  pg in range(startPageNo, endPageNo):
        print "--- get page %d ---" %pg
        items = questionGetter.getPage(pg)

     #   print items
        if ( items == "NO_MORE" ) :
            print "have no more questions, quit program !!"
            break
        
        print "--- page %d has %d questions ---" %(pg,len(items))
        if ( items != "NO_ITEMS" ) :
           i = questionGetter.savePage(collection,items) 
        print "--- page %d has save %d question " %(pg,i)
コード例 #14
0
def main(): 
    collectionName = "job_lang_top_corps"
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    
    title_dict = {}
    for job in collection.find():
        # print job["_id"], job["jobtitle"]
        title =  job["jobtitle"]
        if title_dict.has_key(title): 
            title_dict[title] += 1
        else :
            title_dict[title] = 1
    
    stat_file_name =  "jobtitle_stat.txt"  
    with open( stat_file_name , "w") as text_file:   
        i = 0 
        for (key, value) in sorted(title_dict.iteritems(), key=operator.itemgetter(1), reverse = True):
        #     print key, ":", value 
             text_file.write("%s : %s \n" % (key.encode('utf8'),value)) 
             i+=1
        print i, " lines had been writen into file:", stat_file_name
コード例 #15
0
ファイル: joblistgetter.py プロジェクト: folagit/resumatcher
def main():
     cities = ['MoutainView, CA', 'Seattle, WA', 'San Diego, CA', 'San Francisco, CA', 'Austin, TX',
               'San Jose, CA','Portland, OR',' New York, NY','Houston, TX','Boston, MA', 
               'Davis, CA', 'Palo Alto, CA', ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA' ]
     

     cities = [ 'Austin, TX',
               'San Jose, CA','Portland, OR',' New York, NY','Houston, TX','Boston, MA', 
               'Davis, CA', 'Palo Alto, CA', ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA' ]
            
     _pageSize = 25 
     _fromage = 30 
     _location = 94040
     _radius = 25
     _query = "software engineer"
     
     collectionName = "job_se_10city"
     indeedClient= ApiClient(_query, _pageSize, _fromage, _location, _radius )
    # client.getPage(0)
     dbClient = DbClient('localhost', 27017, "jobaly")
     collection = dbClient.getCollection(collectionName)
     for city in cities:
         print "-----prcoss city %s -------" %city
         indeedClient.processCity(collection,city)
コード例 #16
0
class DataProcessor:
    def __init__(self):
        self.dbClient = DbClient("localhost", 27017, "SimilarQuestion")

    @staticmethod
    def processQuestion(question):
        a = {}
        a["qid"] = question["_id"]
        a["title"] = question["title"]
        return a

    @staticmethod
    def processLinkedQuestion(question):
        a = {}
        a["qid"] = question["_id"]
        a["title"] = question["title"]
        a["linked"] = []
        for item in question["items"]:
            b = {}
            b["qid"] = item["question_id"]
            b["title"] = item["title"]
            print b
            a["linked"].append(b)
        return a

    @staticmethod
    def processLinkedQuestion2(question):
        a = {}
        a["qid"] = question["_id"]
        a["linked"] = []
        for item in question["items"]:
            a["linked"].append(item["question_id"])
        return a

    @staticmethod
    def processRelatedQuestion(question):
        a = {}
        a["qid"] = question["_id"]
        a["title"] = question["title"]
        a["related"] = []
        for item in question["items"]:
            b = {}
            b["qid"] = item["question_id"]
            b["title"] = item["title"]
            #    print b
            a["related"].append(b)
        return a

    def dumpDataToFile(self, queFun, collection, find_spec, find_sort, fileName, pageNum):
        pageSize = 1000
        pageNo = 1
        has_more = True
        with open(fileName, "w") as the_file:
            # the_file.write('Hello\n')
            while has_more and pageNo <= pageNum:
                page = self.dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
                pageNo += 1
                count = page.count(with_limit_and_skip=True)
                print "count=", count
                if count < pageSize:
                    has_more = False
                for item in page:
                    a = queFun(item)
                    jstr = json.dumps(a) + "\n"
                    the_file.write(jstr)
                print " page %d saved %d lines in file" % (pageNo - 1, count)

    def dumpPythonQuestions(self, pageNum):
        question_coll = self.dbClient.getCollection("question_test")
        fileName = "..\..\data\pyton_questions.txt"
        self.dumpDataToFile(DataProcessor.processQuestion, question_coll, fileName, pageNum)

    def dumpLinkedQuestions(self, pageNum):
        question_coll = self.dbClient.getCollection("question_link_python")
        fileName = "..\..\data\question_link_python.txt"
        find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 5"}
        find_sort = {"items": {"$size": -1}}

        self.dumpDataToFile(DataProcessor.processLinkedQuestion, question_coll, find_spec, find_sort, fileName, pageNum)

    def dumpLinkedQuestions2(self, pageNum):
        question_coll = self.dbClient.getCollection("question_link_python")
        fileName = "..\..\data\python_linked.txt"
        find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 1"}
        find_sort = {"items": {"$size": -1}}
        self.dumpDataToFile(
            DataProcessor.processLinkedQuestion2, question_coll, find_spec, find_sort, fileName, pageNum
        )

    def dumpRelatedQuestions(self, pageNum):
        question_coll = self.dbClient.getCollection("related_python")
        fileName = "..\..\data\question_related_python.txt"
        find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 5"}
        find_sort = None

        self.dumpDataToFile(
            DataProcessor.processRelatedQuestion, question_coll, find_spec, find_sort, fileName, pageNum
        )