Ejemplo n.º 1
0
    def run_script(src_folder,
                   dst_folder,
                   threshold,
                   probability_dict_path=None,
                   generate_dict=True):
        if probability_dict_path is None:
            probability_dict_path = path.join(dst_folder, 'probability.dict')
        if generate_dict is True:
            file_content_list = []
            for i in range(8535):
                input_file = path.join(src_folder, "%04d.dat" % i)
                if StoreHelper.is_file_exist(input_file):
                    file_content_list.append(StoreHelper.read_file(input_file))
                else:
                    print("%s not exist!" % input_file)
            probability_dict = SegmentHelper.generate_probability_dict(
                file_content_list)
            StoreHelper.store_data(probability_dict, probability_dict_path)
            print("Finished generate user dict")
        else:
            probability_dict = StoreHelper.load_data(probability_dict_path, {})
            print("Load dict from file, %i records in dict" %
                  len(probability_dict))

        for i in range(8535):
            input_file = path.join(src_folder, "%04d.dat" % i)
            if StoreHelper.is_file_exist(input_file):
                output_file = path.join(dst_folder, "%04d.dat" % i)
                file_content = StoreHelper.read_file(input_file)
                word_list = []
                for line in file_content.splitlines():
                    word_list.extend(
                        SegmentHelper.phase_segment(probability_dict, line,
                                                    threshold))
                StoreHelper.save_file(os.linesep.join(word_list), output_file)
Ejemplo n.º 2
0
 def generate_token_dict(text_file_list):
     token_file_dict = {}
     for text_file in text_file_list:
         file_name = ntpath.basename(text_file)
         if StoreHelper.is_file_exist(text_file):
             file_content = StoreHelper.read_file(text_file)
             lowers = file_content.lower()
             no_punctuation = lowers.translate(None, string.punctuation)
             token_file_dict[file_name] = no_punctuation
     return token_file_dict
Ejemplo n.º 3
0
 def generate_phase_list():
     probability_dict = StoreHelper.load_data('./data/probability.dic', {})
     print ("Get %i dict from file" % len(probability_dict))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             word_file = "./data/phrase_split/%04d.dat" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context)
             position_dict_list = position_helper.convert_2(probability_dict)
             StoreHelper.save_file("\n".join([str(item) for item in position_dict_list]), word_file)
         else:
             print ("%s not exist!" % text_file)
Ejemplo n.º 4
0
 def convert_position():
     skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
     print ("Get %i words from %s" %(len(skills_dict), "skills dict"))
     discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
     print("Get %i words from %s" % (len(discipline_dict), "discipline_dict"))
     education_dict = StoreHelper.load_data("./resource/education.dat", {})
     print("Get %i words from %s" % (len(education_dict), "education_dict"))
     responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
     print("Get %i words from %s" % (len(responsibility_dict), "responsibility_dict"))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print ("working on file %s" % text_file)
             word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
             word_data = "./data/result_dict/%04d.dat" % i
             word_text = "./data/result_dict/%04d.txt" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context, word_list)
             result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict, responsibility_dict, './resource/year_convert.dat')
             StoreHelper.save_file(result_dict, word_text)
             StoreHelper.store_data(result_dict, word_data)
Ejemplo n.º 5
0
    def compute_tfidf():
        blob_dict = {}
        total_dict = {}
        probability_dict = StoreHelper.load_data('./data/probability.dic', {})
        print("Get %i dict from file" % len(probability_dict))
        for i in range(8535):
            text_file = "./data/clean_post_lemmatize/%04d.dat" % i
            if StoreHelper.is_file_exist(text_file):
                context = StoreHelper.read_file(text_file)
                position_helper = PositionHelper(context)
                blob_dict[i] = position_helper.convert_2(probability_dict)

        tfidf = TFIDF(blob_dict.values())
        for i in range(8535):
            if i in blob_dict:
                output_file = "./data/tfidf-dat/%04d.dat" % i
                print ("Working on %i article!" % i)
                tf_idf_dict = tfidf.get_tf_idf(blob_dict[i])
                DictHelper.merge_dict(total_dict, tf_idf_dict)
                tf_idf_dict = {key: float("%.6f" % value) for key, value in tf_idf_dict.items()}
                StoreHelper.store_data(tf_idf_dict, output_file)
        StoreHelper.store_data(total_dict, "./data/tfidf.dat")
Ejemplo n.º 6
0
 def generate_blob_list():
     blob_list = []
     for i in range(8535):
         phrase_dict_file = "./data/result_dict/%04d.dat" % i
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(phrase_dict_file):
             phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
             text_content = StoreHelper.read_file(text_file)
             word_list = []
             for line in text_content.splitlines():
                 if line.endswith('.'):
                     line = line[:-1]
                 for word in line.split(' '):
                     word_list.append(word)
             for _type in phrase_dict.keys():
                 for words in phrase_dict[_type]:
                     for word in words.split(' '):
                         if word in word_list:
                             word_list.remove(word)
                     word_list.append(words)
             blob_list.append(DictHelper.dict_from_count_list(word_list))
     StoreHelper.store_data(blob_list, './data/blob_list.dat')
     return blob_list
Ejemplo n.º 7
0
    def run_cluster():
        final_vector = [[0 for j in range(310)] for i in range(4980)]
        key_set = StoreHelper.load_data("./resource/feature.dat", {}).keys()
        print("key set length: %i" % len(key_set))

        blob_dict_list = []
        skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
        discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
        education_dict = StoreHelper.load_data("./resource/education.dat", {})
        for i in range(4980):
            text_file = "./data/datascientist/%04d.txt" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            blob_dict_list.append(position_helper.convert(skills_dict, discipline_dict, education_dict)[4])

        tfidf = TFIDF(blob_dict_list)
        for i in range(4980):
            print("Working on %i article!" % i)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[i])
            # tf_idf_dict = {key: "%.6f" % value for key, value in tf_idf_dict.items()}
            for j in range(310):
                if key_set[j] in tf_idf_dict:
                    final_vector[i][j] = tf_idf_dict[key_set[j]]
        StoreHelper.store_data(final_vector, "./data/vectors.dat")