def update_probability_dict(dict_file, new_dict_file_list):
    # Merge every new dictionary into the existing one, then write it back.
    # Note: the loop variable must not shadow dict_file, otherwise the merged
    # result would be stored to the last input file instead of dict_file.
    probability_dict = StoreHelper.load_data(dict_file, {})
    for new_file in new_dict_file_list:
        new_dict = StoreHelper.load_data(new_file, {})
        print("Get %s with records: %i" % (new_file, len(new_dict)))
        DictHelper.update_dict(probability_dict, new_dict)
    StoreHelper.store_data(probability_dict, dict_file)
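# A minimal sketch of the merge step above, assuming DictHelper.update_dict adds
# the counts of matching keys (an assumption about the helper, not its actual code):
def _update_dict_sketch(target, source):
    for key, count in source.items():
        target[key] = target.get(key, 0) + count

# _update_dict_sketch({'python': 2}, {'python': 1, 'sql': 3})
# -> target becomes {'python': 3, 'sql': 3}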
    def crawl_post_information(ids_file, save_file):
        id_list = StoreHelper.load_data(ids_file)
        continue_not_found = 0
        post_list = {}
        total_count = len(id_list)
        current = 0
        for ids in id_list:
            id_url = urlparse.urljoin("https://www.linkedin.com/jobs/view/",
                                      ids)
            print("Working on url: %s" % id_url)
            current += 1
            print("progress report: %i in %i for %s" %
                  (current, total_count, ids_file))

            web_source = CrawlHelper.get_web_source(id_url)
            company = CrawlHelper.get_company_name(web_source)
            post_content = CrawlHelper.get_post_information(web_source)

            if post_content is None:
                print("No skills found for %s! Continue times %i" %
                      (id_url, continue_not_found))
                continue_not_found += 1
                if continue_not_found > 3:
                    break
            else:
                continue_not_found = 0
                if company in post_list:
                    post_list[company].append((company, id_url, post_content))
                else:
                    post_list[company] = [(company, id_url, post_content)]
        StoreHelper.store_data(post_list, save_file)
        # True when the crawl reached (almost) every id instead of aborting early
        return current >= total_count - 1
    def convert_profile2(debug=False):
        education_phrase_dic = StoreHelper.load_data(
            '../resource/education.dat')
        discipline_phrase_dic = StoreHelper.load_data(
            '../resource/discipline.dat')
        skills_dic = StoreHelper.load_data('../resource/skills.dat')
        profile_vectors = StoreHelper.load_data(
            '../resource/United States/profile.dat', [])
        university_name_convert_dict = StoreHelper.load_data(
            '../university_name_convert.dic', {})
        vector_list = []

        count = 0
        total = len(profile_vectors)
        for _profile in profile_vectors:
            count += 1
            if debug:
                print("Profile convert progress: %d/%d" % (count, total))
            educations, majors = ProfileHelper.get_highest_education(
                _profile, education_phrase_dic, discipline_phrase_dic)
            years_info = ProfileHelper.calculate_years(_profile)  # call once, index below
            profile_dict = {
                'skills': ProfileHelper.get_skills(_profile, skills_dic),
                'work_change_times': years_info[0],
                'years': years_info[1],
                'university': ProfileHelper.convert_university(
                    _profile, university_name_convert_dict),
                'education': educations,
                'company': [SegmentHelper.normalize(company)
                            for company in _profile['company']],
                'major': majors,
            }
            vector_list.append(profile_dict)
        StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
        StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
def split_dict():
    phase_dict = StoreHelper.load_data("phase_dict.dat", {})
    phase_dict_single = {}
    phase_dict_double = {}
    for key, value in phase_dict.items():
        if '_' in key:
            phase_dict_double[key] = value
        else:
            phase_dict_single[key] = value
    StoreHelper.save_file(DictHelper.get_sorted_list(phase_dict_single),
                          'phase_dict_single.txt')
    StoreHelper.save_file(DictHelper.get_sorted_list(phase_dict_double),
                          'phase_dict_double.txt')
def generate_phrase_dict():
    sentence_stream = StoreHelper.load_data('sentence_stream.dat', [])
    phrases = Phrases(sentence_stream, min_count=2, threshold=2)
    bi_gram = Phraser(phrases)
    for i in range(8535):
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        output_file = "../data/gensim_split/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            phrase_list = GensimHelper.phrase_detection(bi_gram, text_file)
            phrase_list = [phrase.replace('_', ' ') for phrase in phrase_list]
            StoreHelper.store_data(phrase_list, output_file)
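# Illustration of the Phraser used above (gensim's Phrases/Phraser API): token lists
# fed through the frozen model come back with detected bigrams joined by '_', which
# the list comprehension then maps back to spaces. Toy corpus only, with the
# threshold lowered so the tiny example actually merges a pair.
from gensim.models.phrases import Phrases, Phraser

_toy_stream = [["machine", "learning", "engineer"],
               ["machine", "learning", "intern"]] * 5
_toy_bigram = Phraser(Phrases(_toy_stream, min_count=2, threshold=0.1))
print(_toy_bigram[["machine", "learning", "engineer"]])
# e.g. ['machine_learning', 'engineer']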
def merge_dict():
    profile_dict_list = StoreHelper.load_data(
        '../resource/convert_profile.dat', [])
    merged_list = []
    for profile_dict in profile_dict_list:
        merged_dict = {}
        for feature in profile_dict:
            for key in profile_dict[feature]:
                DictHelper.increase_dic_key(merged_dict, key)
        merged_list.append(merged_dict)
    StoreHelper.store_data(merged_list, '../resource/merged_profile.dat')
    StoreHelper.save_file(merged_list, '../resource/merged_profile.txt')
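# merge_dict flattens every feature of a converted profile into one bag-of-words
# count dict. A stand-alone sketch of that flattening, assuming
# DictHelper.increase_dic_key(d, k) bumps d[k] by one (an assumption about the helper):
from collections import Counter

def _merge_profile_sketch(profile_dict):
    return dict(Counter(key for feature in profile_dict
                        for key in profile_dict[feature]))

# _merge_profile_sketch({'skills': ['python', 'sql'], 'education': ['master degree']})
# -> {'python': 1, 'sql': 1, 'master degree': 1}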
def calculate_full_frequency():
    html_list = StoreHelper.load_data("../data/post/Delaware.dat", [])
    words_frequency_list = []
    for _url, _web_source in html_list:
        clean_content = HTMLHelper.get_text(_web_source)
        text_dict = WordFrequency.get_frequency_dict(clean_content)
        sorted_items = sorted(text_dict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        words_frequency_list.append(sorted_items)
    for sorted_items in words_frequency_list:
        print(sorted_items)
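# Design note: sorting a frequency dict by value is equivalent to
# collections.Counter(...).most_common(). A minimal stand-alone sketch (the real
# tokenization lives in WordFrequency.get_frequency_dict and may differ):
from collections import Counter

def _frequency_sketch(text):
    return Counter(text.lower().split()).most_common()

# _frequency_sketch("data science and data engineering")
# -> [('data', 2), ('science', 1), ('and', 1), ('engineering', 1)]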
    def convert_profile():
        education_phrase_dic = StoreHelper.load_data(
            '../resource/education.dat')
        discipline_phrase_dic = StoreHelper.load_data(
            '../resource/discipline.dat')
        skills_dic = StoreHelper.load_data('../resource/skills.dat')

        profile_vectors = StoreHelper.load_data(
            '../resource/United States/profile.dat', [])
        vector_list = []
        for _profile in profile_vectors:
            educations, majors = ProfileHelper.get_highest_education(
                _profile, education_phrase_dic, discipline_phrase_dic)
            profile_dict = {
                'skills': ProfileHelper.get_skills(_profile, skills_dic),
                'years': ProfileHelper.get_years(_profile),
                'education': educations,
                'major': majors
            }
            vector_list.append(profile_dict)
        StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
        StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
def _get_working_year_words(self, year_convert_file=None):
    year_list = TextHelper.get_years_pattern(self.raw_position)
    if len(year_list) == 0:
        default_year_requirement = "[0]"
        self.new_words_list.append(default_year_requirement)
        year_list = [default_year_requirement]
    elif year_convert_file is not None:
        year_convert_dict = StoreHelper.load_data(year_convert_file, {})
        year_list = [
            year_convert_dict[item] for item in year_list
            if item in year_convert_dict
        ]
    return DictHelper.dict_from_count_list(year_list)
def get_all_job_post(url_file, post_file):
    post_info_list = []
    for url in StoreHelper.load_data(url_file, {}):
        web_content = CrawlHelper.get_web_source(url)
        post_info_list.append((url, web_content))
    StoreHelper.store_data(post_info_list, post_file)
    @staticmethod
    def run_script(vector_list):
        ClusterHelper.plot_clusters(np.array(vector_list), hdbscan.HDBSCAN, (),
                                    {'min_cluster_size': 15})

    @staticmethod
    def mean_shift_cluster(vector_list):
        np_array = np.array(vector_list)
        bandwidth = estimate_bandwidth(np_array, quantile=0.2, n_samples=500)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(np_array)
        ClusterHelper.print_label(ms.labels_)

    @staticmethod
    def birch_cluster(vector_list, index_list):
        np_array = np.array(vector_list, dtype=float)
        brc = Birch(branching_factor=50, threshold=0.05, compute_labels=True)
        brc.fit(np_array)
        label = brc.predict(np_array)
        ClusterHelper.print_label(label, index_list)


if __name__ == '__main__':
    # _vector_list = StoreHelper.load_data("../data/vectors.dat")
    # ClusterHelper.mean_shift_cluster(_vector_list)
    # ClusterHelper.birch_cluster(_vector_list)
    # ClusterHelper.run_script(_vector_list)
    position_dict = StoreHelper.load_data("../data/position_vector_01.dat", {})
    _vector_list = list(position_dict.values())  # materialize the dict views so
    _index_list = list(position_dict.keys())     # np.array gets real sequences (Python 3)
    ClusterHelper.birch_cluster(_vector_list, _index_list)
        fig = plt.figure(1, figsize=(8, 6))
        plt.clf()
        ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
        plt.cla()

        # for label in range(cluster_number):
        #     name = "cluster %i" % label
        #     ax.text3D(X[y == label, 33].mean(),
        #               X[y == label, 99].mean(),
        #               X[y == label, 112].mean(), '',
        #               horizontalalignment='center',
        #               bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))

        # y = np.choose(y, [0, 1, 2]).astype(np.float)
        ax.scatter(X[:, 15], X[:, 17], X[:, 23], c=y)

        ax.w_xaxis.set_ticklabels([])
        ax.w_yaxis.set_ticklabels([])
        ax.w_zaxis.set_ticklabels([])
        ax.set_xlabel('Petal width')
        ax.set_ylabel('Sepal length')
        ax.set_zlabel('Petal length')
        plt.show()


if __name__ == '__main__':
    _vector_list = StoreHelper.load_data("../data/vectors.dat")
    PlotHelper.plot_k_means(_vector_list)

        return self.phrase_dict

    def convert_2(self, probability_dict):
        year_phase_list = self._get_working_year_words()
        phrase_list = self._remove_conjunction_segment(probability_dict)
        phrase_list.extend(year_phase_list)
        return DictHelper.dict_from_count_list(phrase_list)

    def _remove_conjunction_segment(self, probability_dict):
        phase_list = []
        sentence_list = []
        word_list = SegmentHelper.segment_text(self.raw_position)
        english_stopwords = set(stopwords.words('english'))  # load the list once
        word_group = []
        for word in word_list:
            if word in english_stopwords:
                if len(word_group) > 0:
                    sentence_list.append(' '.join(word_group))
                    word_group = []
            else:
                word_group.append(word)
        if len(word_group) > 0:
            sentence_list.append(' '.join(word_group))
        for sentence in sentence_list:
            phase_list.extend(
                SegmentHelper.phase_segment(probability_dict, sentence, 0.05))
        return phase_list


if __name__ == '__main__':
    year_convert = StoreHelper.load_data('../resource/year_convert.dat', {})
    print(year_convert['four year'])
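# Toy illustration of the stopword split used in _remove_conjunction_segment above:
# consecutive non-stopword tokens are grouped, and each stopword closes the current
# group. Requires the NLTK stopword corpus (nltk.download('stopwords')).
from nltk.corpus import stopwords

def _split_on_stopwords(tokens):
    stop = set(stopwords.words('english'))
    groups, current = [], []
    for word in tokens:
        if word in stop:
            if current:
                groups.append(' '.join(current))
                current = []
        else:
            current.append(word)
    if current:
        groups.append(' '.join(current))
    return groups

# _split_on_stopwords("experience with machine learning and data visualization".split())
# -> ['experience', 'machine learning', 'data visualization']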
def build_from_file(file_name="pattern_relationship.dat"):
    return StoreHelper.load_data(file_name)
        post = soup.find('div', id='jobcopy')
        if post is not None:
            return True, post
        post = soup.find('div', id='bodycol')
        if post is not None:
            return True, post
        post = soup.find('div', id='JobDescription')
        return (True, post) if post is not None else (False, None)

    @staticmethod
    def post_clean(soup_element):
        styles = soup_element.find('style')
        if styles is not None:
            styles.decompose()
        shorts = soup_element.find('div',
                                   {'ng-if': 'featuredJobModel.showAbstract'})
        if shorts is not None:
            shorts.decompose()
        a_link = soup_element.find('a')
        if a_link is not None:
            a_link.decompose()
        return os.linesep.join(
            [s for s in soup_element.text.splitlines() if len(s.strip()) > 0])


if __name__ == '__main__':
    _html_list = StoreHelper.load_data("../data/post/Delaware.dat", [])
    _web_source = _html_list[4][1]
    print(_html_list[4][0])
    # print(_web_source)
    print(HTMLHelper.get_text(_web_source))
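# Toy run of the post_clean-style cleanup on a minimal HTML fragment, assuming the
# post_clean method above lives on HTMLHelper (the class header is not shown in this
# snippet). Uses BeautifulSoup with the built-in html.parser.
from bs4 import BeautifulSoup

_toy_html = ("<div id='jobcopy'><style>p {}</style><a href='#'>apply</a>"
             "<p>Build ETL pipelines.</p><p>SQL required.</p></div>")
_toy_soup = BeautifulSoup(_toy_html, 'html.parser')
print(HTMLHelper.post_clean(_toy_soup.find('div', id='jobcopy')))
# Expected: the post text with the <style> and <a> contents stripped out.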