Example #1
    def run_script():
        # Step 1: read urls from the text file
        crawl_dict = Main.parse_file("./resource/url_list")
        print(crawl_dict)

        # Step 2: collect job post urls from each web source
        for location, url_list in crawl_dict.items():
            print("Collecting job urls for %s" % location)
            if StoreHelper.is_file_exist("./data/url/%s.dat" % location):
                print("File already exists, skipping this step!")
                continue
            url_set = set()
            for url in url_list:
                _list = CrawlHelper.get_all_job_url(url)
                url_set = url_set.union(set(_list))
            print("Totally get %i url for %s\n" % (len(url_set), location))
            if len(url_set) > 0:
                StoreHelper.store_data(list(url_set),
                                       "./data/url/%s.dat" % location)

        # Step 3: fetch the job post behind each url
        for location, url_list in crawl_dict.items():
            print("Collecting job post information for %s" % location)
            if StoreHelper.is_file_exist("./data/post/%s.dat" % location):
                print("File already exists, skipping this step!")
                continue
            CrawlHelper.get_all_job_post("./data/url/%s.dat" % location,
                                         "./data/post/%s.dat" % location)
Example #2
    def generate_feature_vectors():
        # Step 1: aggregate each document's result dict into a combined dict per feature
        feature_total_dict = {}
        for i in range(8535):
            result_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(result_dict_file):
                result_dict = StoreHelper.load_data(result_dict_file, {})
                for feature in result_dict:
                    DictHelper.append_dic_key(feature_total_dict, feature, result_dict[feature])

        # Step 2: build the vocabulary (vector header) for each feature
        feature_vector_header_dict = {}
        for feature in feature_total_dict:
            feature_list = []
            for words_dict in feature_total_dict[feature]:
                feature_list.extend(words_dict.keys())
            feature_list = list(set(feature_list))
            feature_vector_header_dict[feature] = feature_list
        StoreHelper.store_data(feature_vector_header_dict, 'feature_vector_header.dat')

        # Step 3: collect values for each feature vector
        feature_vector_dict = {}
        for feature in feature_vector_header_dict:
            feature_dict = {}
            feature_list = feature_vector_header_dict[feature]
            for i in range(8535):
                result_dict_file = "./data/words_only/data/%04d.dat" % i
                if StoreHelper.is_file_exist(result_dict_file):
                    result_dict = StoreHelper.load_data(result_dict_file, {})
                    # .get guards against documents that lack this feature entirely
                    feature_dict[i] = [result_dict.get(feature, {}).get(words, 0) for words in feature_list]
            feature_vector_dict[feature] = feature_dict
        # print (feature_vector_dict.keys())
        # print (str([len(value[1]) for value in feature_vector_dict.values()]))
        StoreHelper.store_data(feature_vector_dict, 'feature_vector.dat')
        StoreHelper.save_file(feature_vector_dict, 'feature_vector.txt')
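The layout built above pairs, per feature, a fixed vocabulary (feature_vector_header_dict[feature]) with one value vector per document, aligned index-by-index to that vocabulary. A toy illustration of the alignment in step 3 (all values here are made up):

    vocabulary = ["python", "sql", "hadoop"]        # feature_list for one feature
    doc_weights = {"python": 0.41, "hadoop": 0.12}  # one document's result_dict[feature]
    vector = [doc_weights.get(word, 0) for word in vocabulary]
    assert vector == [0.41, 0, 0.12]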
Example #3
    def run_script(src_folder,
                   dst_folder,
                   threshold,
                   probability_dict_path=None,
                   generate_dict=True):
        if probability_dict_path is None:
            probability_dict_path = path.join(dst_folder, 'probability.dict')
        if generate_dict:
            file_content_list = []
            for i in range(8535):
                input_file = path.join(src_folder, "%04d.dat" % i)
                if StoreHelper.is_file_exist(input_file):
                    file_content_list.append(StoreHelper.read_file(input_file))
                else:
                    print("%s not exist!" % input_file)
            probability_dict = SegmentHelper.generate_probability_dict(
                file_content_list)
            StoreHelper.store_data(probability_dict, probability_dict_path)
            print("Finished generate user dict")
        else:
            probability_dict = StoreHelper.load_data(probability_dict_path, {})
            print("Load dict from file, %i records in dict" %
                  len(probability_dict))

        for i in range(8535):
            input_file = path.join(src_folder, "%04d.dat" % i)
            if StoreHelper.is_file_exist(input_file):
                output_file = path.join(dst_folder, "%04d.dat" % i)
                file_content = StoreHelper.read_file(input_file)
                word_list = []
                for line in file_content.splitlines():
                    word_list.extend(
                        SegmentHelper.phase_segment(probability_dict, line,
                                                    threshold))
                StoreHelper.save_file(os.linesep.join(word_list), output_file)
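A hypothetical invocation (the paths and threshold value are illustrative, not taken from the source):

    run_script("./data/clean_post_lemmatize", "./data/phrase_split", threshold=0.5)

On the first run this builds and stores the probability dict; passing generate_dict=False afterwards reuses the stored dict instead of regenerating it.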
Example #4
 def get_only_words_in_5():
     for i in range(8535):
         result_dict = {}
         words_dict_file = "./data/result_dict/%04d.dat" % i
         tfidf_dict_file = "./data/tfidf-dat/%04d.dat" % i
         if StoreHelper.is_file_exist(tfidf_dict_file):
             tfidf_dict = StoreHelper.load_data(tfidf_dict_file, {})
             words_dict = StoreHelper.load_data(words_dict_file, {})
             for _type in words_dict.keys():
                 result_dict[_type] = {}
                 for word in words_dict[_type]:
                     if word in tfidf_dict:
                         result_dict[_type][word] = tfidf_dict[word]
                     else:
                         normal_word = SegmentHelper.normalize(word)
                         if normal_word in tfidf_dict:
                             print ("Saved by normalize for %s" % normal_word)
                             result_dict[_type][word] = tfidf_dict[normal_word]
                         else:
                             print ("%s not found in %s" % (word, tfidf_dict_file))
             # for _type in result_dict.keys():
             #     result_dict[_type] = DictHelper.get_sorted_list(result_dict[_type])
             # print (result_dict.keys())
             StoreHelper.store_data(result_dict, "./data/words_only/data/%04d.dat" % i)
             StoreHelper.save_file(result_dict, "./data/words_only/text/%04d.txt" % i)
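The lookup above tries the raw word first and falls back to its normalized form before giving up. The same pattern in isolation (lookup is a hypothetical name; normalize stands in for SegmentHelper.normalize):

    def lookup(word, tfidf_dict, normalize):
        # try the raw word first, then its normalized form;
        # returns None when neither form is present
        if word in tfidf_dict:
            return tfidf_dict[word]
        return tfidf_dict.get(normalize(word))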
Example #5
 def generate_token_dict(text_file_list):
     token_file_dict = {}
     for text_file in text_file_list:
         file_name = ntpath.basename(text_file)
         if StoreHelper.is_file_exist(text_file):
             file_content = StoreHelper.read_file(text_file)
             lowers = file_content.lower()
             # Python 3: the two-argument form of str.translate is gone, so use a translation table
             no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
             token_file_dict[file_name] = no_punctuation
     return token_file_dict
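A hypothetical call (the file names are illustrative):

    token_dict = generate_token_dict(["./data/post/seattle.dat", "./data/post/boston.dat"])
    # maps each base name, e.g. "seattle.dat", to its lowercased, punctuation-free content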
Example #6
 def generate_phase_list():
     probability_dict = StoreHelper.load_data('./data/probability.dic', {})
     print ("Get %i dict from file" % len(probability_dict))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             word_file = "./data/phrase_split/%04d.dat" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context)
             position_dict_list = position_helper.convert_2(probability_dict)
             StoreHelper.save_file("\n".join([str(item) for item in position_dict_list]), word_file)
         else:
             print ("%s not exist!" % text_file)
Example #7
    def get_post_vector():
        year_list = []
        education_list = []
        major_list = []
        skill_list = []
        responsibility_list = []
        position_tfidf_dict = {}
        for i in range(8535):
            phrase_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(phrase_dict_file):
                phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
                position_tfidf_dict[i] = phrase_dict
                if 'working-year' in phrase_dict:
                    year_list.extend(phrase_dict['working-year'].keys())
                if 'education' in phrase_dict:
                    education_list.extend(phrase_dict['education'].keys())
                if 'major' in phrase_dict:
                    major_list.extend(phrase_dict['major'].keys())
                if 'skills' in phrase_dict:
                    skill_list.extend(phrase_dict['skills'].keys())
                if 'responsibility' in phrase_dict:
                    responsibility_list.extend(phrase_dict['responsibility'].keys())
        year_list = list(set(year_list))
        print("year_list count: %d" % len(year_list))
        education_list = list(set(education_list))
        print("education_list count: %d" % len(education_list))
        major_list = list(set(major_list))
        print("major_list count: %d" % len(major_list))
        skill_list = list(set(skill_list))
        print("skill_list count: %d" % len(skill_list))
        responsibility_list = list(set(responsibility_list))
        print("responsibility_list count: %d" % len(responsibility_list))
        StoreHelper.store_data([year_list, education_list, major_list, skill_list, responsibility_list], 'vector.dat')

        position_vectors = {}
        for i in range(8535):
            if i in position_tfidf_dict:
                position = []
                # .get guards against posts that lack a category (step 1 shows the keys can be missing)
                for word in year_list:
                    position.append(position_tfidf_dict[i].get('working-year', {}).get(word, 0))
                for word in education_list:
                    position.append(position_tfidf_dict[i].get('education', {}).get(word, 0))
                for word in major_list:
                    position.append(position_tfidf_dict[i].get('major', {}).get(word, 0))
                for word in skill_list:
                    position.append(position_tfidf_dict[i].get('skills', {}).get(word, 0))
                for word in responsibility_list:
                    position.append(position_tfidf_dict[i].get('responsibility', {}).get(word, 0))
                position_vectors[i] = position
        StoreHelper.store_data(position_vectors, './data/position_vector_01.dat')
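Each post vector is the concatenation of the five per-category vectors, so every vector shares one fixed length. A quick sanity check one could add after the loop (sketch only):

    expected_len = (len(year_list) + len(education_list) + len(major_list)
                    + len(skill_list) + len(responsibility_list))
    assert all(len(vector) == expected_len for vector in position_vectors.values())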
Example #8
 def get_tfidf():
     blob_dict_list = Main.generate_blob_list()
     profile_dict_list = StoreHelper.load_data('./resource/merged_profile.dat', [])
     blob_dict_list.extend(profile_dict_list)
     tfidf = TFIDF(blob_dict_list)
     # j walks blob_dict_list in step with the post files that actually exist
     j = 0
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print("Working on %s article!" % text_file)
             tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[j])
             StoreHelper.store_data(tf_idf_dict, "./data/tfidf-dat/%04d.dat" % i)
             StoreHelper.save_file(DictHelper.get_sorted_list(tf_idf_dict), "./data/tfidf/%04d.dat" % i)
             j += 1
Example #9
    def run_script():
        # Step 1: read urls from the text file
        crawl_dict = StoreHelper.parse_file("./resource/url_list")

        # Step 2: merge word frequencies from each location's post file
        total_dict = {}
        for location, url_list in crawl_dict.items():
            file_name = "./data/post/%s.dat" % location
            print(file_name)
            if StoreHelper.is_file_exist(file_name):
                total_dict.update(Main.get_frequency_from_file(file_name))

        # Step 3: sort by frequency; sorted() turns the dict into a list of (word, count) tuples
        total_dict = sorted(total_dict.items(), key=operator.itemgetter(1), reverse=True)
        StoreHelper.store_data(total_dict, "word_frequency.dat")
Example #10
 def convert_position():
     skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
     print("Got %i words from %s" % (len(skills_dict), "skills_dict"))
     discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
     print("Got %i words from %s" % (len(discipline_dict), "discipline_dict"))
     education_dict = StoreHelper.load_data("./resource/education.dat", {})
     print("Got %i words from %s" % (len(education_dict), "education_dict"))
     responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
     print("Got %i words from %s" % (len(responsibility_dict), "responsibility_dict"))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print ("working on file %s" % text_file)
             word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
             word_data = "./data/result_dict/%04d.dat" % i
             word_text = "./data/result_dict/%04d.txt" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context, word_list)
             result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict, responsibility_dict, './resource/year_convert.dat')
             StoreHelper.save_file(result_dict, word_text)
             StoreHelper.store_data(result_dict, word_data)
Example #11
    def compute_tfidf():
        blob_dict = {}
        total_dict = {}
        probability_dict = StoreHelper.load_data('./data/probability.dic', {})
        print("Get %i dict from file" % len(probability_dict))
        for i in range(8535):
            text_file = "./data/clean_post_lemmatize/%04d.dat" % i
            if StoreHelper.is_file_exist(text_file):
                context = StoreHelper.read_file(text_file)
                position_helper = PositionHelper(context)
                blob_dict[i] = position_helper.convert_2(probability_dict)

        # pass a list: dict.values() is only a view in Python 3
        tfidf = TFIDF(list(blob_dict.values()))
        for i in range(8535):
            if i in blob_dict:
                output_file = "./data/tfidf-dat/%04d.dat" % i
                print ("Working on %i article!" % i)
                tf_idf_dict = tfidf.get_tf_idf(blob_dict[i])
                DictHelper.merge_dict(total_dict, tf_idf_dict)
                # round stored values to six decimal places
                tf_idf_dict = {key: float("%.6f" % value) for key, value in tf_idf_dict.items()}
                StoreHelper.store_data(tf_idf_dict, output_file)
        StoreHelper.store_data(total_dict, "./data/tfidf.dat")
Example #12
 def generate_blob_list():
     blob_list = []
     for i in range(8535):
         phrase_dict_file = "./data/result_dict/%04d.dat" % i
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(phrase_dict_file):
             phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
             text_content = StoreHelper.read_file(text_file)
             word_list = []
             for line in text_content.splitlines():
                 if line.endswith('.'):
                     line = line[:-1]
                 for word in line.split(' '):
                     word_list.append(word)
             # fold each multi-word phrase into a single token: drop its words, append the phrase
             for _type in phrase_dict.keys():
                 for words in phrase_dict[_type]:
                     for word in words.split(' '):
                         if word in word_list:
                             word_list.remove(word)
                     word_list.append(words)
             blob_list.append(DictHelper.dict_from_count_list(word_list))
     StoreHelper.store_data(blob_list, './data/blob_list.dat')
     return blob_list
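The inner loop above folds a multi-word phrase into a single token by removing its constituent words and appending the phrase itself. A toy illustration (values are made up):

    word_list = ["machine", "learning", "engineer"]
    phrase = "machine learning"
    for word in phrase.split(' '):
        if word in word_list:
            word_list.remove(word)
    word_list.append(phrase)
    assert word_list == ["engineer", "machine learning"]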