Ejemplo n.º 1
0
    def get_significant_terms(self, count, filters):
        queries = Q()

        for f in filters:
            queries = queries & Q("match_phrase", description=f)

        queries = queries & Q("range", posted_date={'gte': self.start})
        search = JobListingDocument.search().query(queries)
        search.aggs.bucket('word_count', 'significant_terms', field='keywords')
        search = search.execute()
        aggs = search.aggregations.word_count
        print(aggs)
        return aggs.to_dict()
Ejemplo n.º 2
0
 def calculate_trend_totals(self, queries, period):
     total_search = JobListingDocument.search().query(queries)
     total_search.aggs.bucket('listings_per_day',
                              'date_histogram',
                              field='posted_date',
                              interval=period)
     total_search = total_search.execute()
     total_buckets = total_search.aggregations.listings_per_day.buckets
     total_y = {}
     for bucket in total_buckets:
         if bucket.key >= self.start:
             total_y[bucket.key] = bucket.doc_count
     return total_y
Ejemplo n.º 3
0
def test_search(keyword):
    Q("match_phrase", description=keyword)
    listings_search = JobListingDocument.search().query(Q("match_phrase", description=keyword)).execute()
    result = listings_search.to_dict()
    return result['_shards']['successful'] == 1
Ejemplo n.º 4
0
 def calculate_bar_totals(self, queries):
     total_search = JobListingDocument.search().query(queries)
     return total_search.count()
Ejemplo n.º 5
0
    def get_job_listings(self, count, filters, companies, titles, locations):
        if not count:
            count = 5

        queries = Q()
        queries = queries & Q(
            "range",
            posted_date={
                'gte': str(
                    datetime.utcfromtimestamp(self.start / 1000).date())
            })
        # apply filters
        for f in filters:
            queries = queries & Q("match_phrase", description=f)

        if titles:
            title_queries = Q("match_phrase", title=titles[0])
            remaining_titles = titles[1:]
            for title in remaining_titles:
                title_queries = title_queries | Q("match_phrase", title=title)
            queries = queries & title_queries

        if companies:
            company_queries = Q("match_phrase", company=companies[0])
            remaining_companies = companies[1:]
            for company in remaining_companies:
                company_queries = company_queries | Q("match_phrase",
                                                      company=company)
            queries = queries & company_queries

        if locations:
            location_queries = Q("match_phrase", location=locations[0])
            remaining_locations = locations[1:]
            for location in remaining_locations:
                location_queries = location_queries | Q("match_phrase",
                                                        location=location)
            queries = queries & location_queries

        # Get array of data for each keyword
        listings_search = JobListingDocument.search().query(queries).sort(
            '-posted_date')
        listings_search = listings_search.execute()
        job_listing_documents = listings_search[:count]
        job_listings = []
        for doc in job_listing_documents:
            job_listings += [{
                'indeed_id': doc.indeed_id,
                'posted_date': doc.posted_date.isoformat(),
                'title': doc.title,
                'location': doc.location,
                'company': doc.company,
                'description': doc.description
            }]

        return {
            'job_listings': job_listings,
            'filters': filters,
            'companies': companies,
            'titles': titles,
            'locations': locations
        }
Ejemplo n.º 6
0
    def get_top_locations(self,
                          count,
                          filters,
                          companies,
                          titles,
                          include=None):
        queries = Q()

        for f in filters:
            queries = queries & Q("match_phrase", description=f)

        if titles:
            title_queries = Q("match_phrase", title=titles[0])
            remaining_titles = titles[1:]
            for title in remaining_titles:
                title_queries = title_queries | Q("match_phrase", title=title)
            queries = queries & title_queries

        if companies:
            company_queries = Q("match_phrase", company=companies[0])
            remaining_companies = companies[1:]
            for company in remaining_companies:
                company_queries = company_queries | Q("match_phrase",
                                                      company=company)
            queries = queries & company_queries

        queries = queries & Q("range", posted_date={'gte': self.start})
        search = JobListingDocument.search().params(
            request_timeout=60).query(queries)
        search.aggs.bucket('word_count',
                           'terms',
                           field='location_keywords',
                           size=500)
        search.aggs.bucket('shingle_word_count',
                           'terms',
                           field='location_shingles',
                           size=500)
        #search.aggs.bucket('triple_shingle_word_count', 'terms', field='location_triple_shingles', size=500)
        search = search.execute()
        words = search.aggregations.word_count.to_dict()['buckets']
        shingle_words = search.aggregations.shingle_word_count.to_dict(
        )['buckets']
        #triple_shingle_words = search.aggregations.triple_shingle_word_count.to_dict()['buckets']

        words = words + shingle_words  # + triple_shingle_words
        locations = []
        cityObjs = []

        include = open("jobTrends/location_data/col.json", "r")
        include = json.load(include)
        cityNamesFilter = []
        for city in include.keys():
            cityObjs.append({city.split(',')[0].lower(): city})
            cityNamesFilter.append(city.split(',')[0].lower())
        #print(cityNamesFilter)
        for word in words:
            if word['key'] in cityNamesFilter:
                #locations.append(word)
                for city in cityObjs:
                    #print("%s, %s", city.keys(), word['key'])
                    if list(city)[0] == word['key']:
                        locations.append({
                            "city": city[list(city)[0]],
                            "doc_count": word['doc_count']
                        })
        #print(locations)

        locations = sorted(locations,
                           key=lambda k: k['doc_count'],
                           reverse=True)

        return locations
Ejemplo n.º 7
0
    def get_top_skills(self,
                       count,
                       filters,
                       companies,
                       titles,
                       locations,
                       include=None):
        queries = Q()

        if not count:
            count = 10

        for f in filters:
            queries = queries & Q("match_phrase", description=f)

        if titles:
            title_queries = Q("match_phrase", title=titles[0])
            remaining_titles = titles[1:]
            for title in remaining_titles:
                title_queries = title_queries | Q("match_phrase", title=title)
            queries = queries & title_queries

        if companies:
            company_queries = Q("match_phrase", company=companies[0])
            remaining_companies = companies[1:]
            for company in remaining_companies:
                company_queries = company_queries | Q("match_phrase",
                                                      company=company)
            queries = queries & company_queries

        if locations:
            location_queries = Q("match_phrase", location=locations[0])
            remaining_locations = locations[1:]
            for location in remaining_locations:
                location_queries = location_queries | Q("match_phrase",
                                                        location=location)
            queries = queries & location_queries

        filter_terms = [] + filters + titles + companies + locations

        queries = queries & Q("range", posted_date={'gte': self.start})
        search = JobListingDocument.search().params(
            request_timeout=60).query(queries)
        search.aggs.bucket('word_count', 'terms', field='keywords', size=1500)
        search.aggs.bucket('shingle_word_count',
                           'terms',
                           field='shingles',
                           size=1500)
        #search.aggs.bucket('triple_shingle_word_count', 'terms', field='triple_shingles', size=1500)
        search = search.execute()
        words = search.aggregations.word_count.to_dict()['buckets']
        shingle_words = search.aggregations.shingle_word_count.to_dict(
        )['buckets']
        #triple_shingle_words = search.aggregations.triple_shingle_word_count.to_dict()['buckets']

        words = words + shingle_words  # + triple_shingle_words
        skills = []
        whitelist = open("jobTrends/word_lists/whitelist.txt", "r")
        whitelist = whitelist.read().split('\n')
        whitelist = [skill.lower() for skill in whitelist]
        for word in words:
            if (word['key'] in whitelist) & (word['key'] not in filters):
                skills.append(word)
        skills = sorted(skills, key=lambda k: k['doc_count'], reverse=True)
        return skills[:count]
Ejemplo n.º 8
0
    def get_trend_data(self, filters, keywords, raw, period, companies, titles,
                       locations):
        all_x = []
        all_y = []
        all_keywords = []
        total_y = []

        if not keywords:
            raise ValueError(
                "get_trend_data must be called with at least one keywords arg")

        queries = Q()
        # apply filters
        for f in filters:
            queries = queries & Q("match_phrase", description=f)

        if titles:
            title_queries = Q("match_phrase", title=titles[0])
            remaining_titles = titles[1:]
            for title in remaining_titles:
                title_queries = title_queries | Q("match_phrase", title=title)
            queries = queries & title_queries

        if companies:
            company_queries = Q("match_phrase", company=companies[0])
            remaining_companies = companies[1:]
            for company in remaining_companies:
                company_queries = company_queries | Q("match_phrase",
                                                      company=company)
            queries = queries & company_queries

        if locations:
            location_queries = Q("match_phrase", location=locations[0])
            remaining_locations = locations[1:]
            for location in remaining_locations:
                location_queries = location_queries | Q("match_phrase",
                                                        location=location)
            queries = queries & location_queries

        # calculate totals in order to display percentages
        if raw != '1':
            total_y = self.calculate_trend_totals(queries, period)

        # Get array of data for each keyword

        max_length = 0
        max_dates = []
        for keyword in keywords:
            query = queries & Q("match_phrase", description=keyword)
            listings_search = JobListingDocument.search().query(query)

            listings_search.aggs.bucket('listings_per_day',
                                        'date_histogram',
                                        field='posted_date',
                                        interval=period)
            listings_search = listings_search.execute()

            x = []
            y = []
            idx = 0
            for bucket in listings_search.aggregations.listings_per_day.buckets:
                if bucket.key >= self.start:
                    x.append(
                        datetime.utcfromtimestamp(bucket.key /
                                                  1000).isoformat())
                    if raw != '1':
                        try:
                            y.append(bucket.doc_count / total_y[bucket.key])
                        except ZeroDivisionError:
                            y.append(0)
                    else:
                        y.append(bucket.doc_count)
                    idx += 1
            all_x.append(x)
            all_y.append(y)
            all_keywords.append(keyword)
            if len(x) > max_length:
                max_length = len(x)
                max_dates = x

        # make sure all arrays have values for all dates
        idx = 0
        for keyword in keywords:
            while len(all_y[idx]) < max_length:
                all_y[idx].append(0)
            all_x[idx] = max_dates
            idx += 1
        return {
            'x': all_x,
            'y': all_y,
            'keywords': all_keywords,
            'raw': raw,
            'filters': filters,
            'period': period,
            'companies': companies,
            'titles': titles,
            'locations': locations
        }
Ejemplo n.º 9
0
    def get_bar_data(self, filters, keywords, raw, companies, titles,
                     locations):

        all_y = []
        all_keywords = []
        total_y = 0

        if not keywords:
            raise ValueError(
                "get_bar_data must be called with at least one keywords arg")

        queries = Q()
        queries = queries & Q(
            "range",
            posted_date={
                'gte': str(
                    datetime.utcfromtimestamp(self.start / 1000).date())
            })
        # apply filters
        for f in filters:
            queries = queries & Q("match_phrase", description=f)

        if titles:
            title_queries = Q("match_phrase", title=titles[0])
            remaining_titles = titles[1:]
            for title in remaining_titles:
                title_queries = title_queries | Q("match_phrase", title=title)
            queries = queries & title_queries

        if companies:
            company_queries = Q("match_phrase", company=companies[0])
            remaining_companies = companies[1:]
            for company in remaining_companies:
                company_queries = company_queries | Q("match_phrase",
                                                      company=company)
            queries = queries & company_queries

        if locations:
            location_queries = Q("match_phrase", location=locations[0])
            remaining_locations = locations[1:]
            for location in remaining_locations:
                location_queries = location_queries | Q("match_phrase",
                                                        location=location)
            queries = queries & location_queries

        # calculate totals in order to display percentages
        if raw != '1':
            total_y = self.calculate_bar_totals(queries)

        # Get array of data for each keyword
        for keyword in keywords:
            query = queries & Q("match_phrase", description=keyword)
            listings_search = JobListingDocument.search().query(query)
            y = listings_search.count()
            if raw != '1':
                try:
                    y = y / total_y
                except ZeroDivisionError:
                    y = 0
            all_y.append(y)
            all_keywords.append(keyword)
        return {
            'y': all_y,
            'keywords': all_keywords,
            'raw': raw,
            'filters': filters,
            'companies': companies,
            'titles': titles,
            'locations': locations
        }