def generate_feature_vector(feature, profile_list, position_list):
        print("Totally have %d profile" % len(profile_list))
        print("Totally have %d position post" % len(position_list))

        # step 1, get fully set of this feature
        profile_phrase_list = []
        for profile in profile_list:
            profile_phrase_list.extend(profile[feature])
        profile_phrase_list = list(set(profile_phrase_list))
        print("Totally get %d words in profile for feature %s" %
              (len(profile_phrase_list), feature))
        position_phrase_list = []
        for position in position_list:
            if feature in position:
                position_phrase_list.extend(position[feature])
        position_phrase_list = list(set(position_phrase_list))
        print("Totally get %d words in position for feature %s" %
              (len(position_phrase_list), feature))

        # step 2, generate full sum
        phrase_list = list(
            set(profile_phrase_list).union(set(position_phrase_list)))
        print("Totally get %d words in all for feature %s" %
              (len(phrase_list), feature))
        StoreHelper.store_data(phrase_list,
                               "position_profile_%s.dat" % feature)
Example #2
    def get_company_rank():
        company_rank_dict = {}
        us_list_company_data_file = './resource/company_list.dat'
        fortune_500_company_data_file = './resource/fortune-500.dat'
        posting_company_data_file = 'company_name.dic'

        posting_company_dict = StoreHelper.load_data(posting_company_data_file,
                                                     {})
        us_list_company_dict = StoreHelper.load_data(us_list_company_data_file,
                                                     {})
        fortune_500_company_dict = StoreHelper.load_data(
            fortune_500_company_data_file, {})

        for company_name in posting_company_dict.values():
            rank = 3  # default normal company
            for company in fortune_500_company_dict:
                if TextHelper.word_in_phrase(company_name, company):
                    rank = 1
            if rank == 3:
                for company in us_list_company_dict:
                    if TextHelper.word_in_phrase(company_name, company):
                        rank = 2
            company_rank_dict[company_name] = rank
        StoreHelper.store_data(company_rank_dict, 'company_rank.dic')
        print(
            DictHelper.get_sorted_list(company_rank_dict,
                                       sorted_by_key=False,
                                       reverse=False))
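TextHelper.word_in_phrase is not listed on this page; judging by how it is used above (matching a posting's company name against Fortune-500 and US-listed company names), a plausible reading is a whole-word containment test. A hypothetical stand-in, shown only as a sketch:

# Hypothetical equivalent of TextHelper.word_in_phrase: True when every
# token of `word` appears as a whole word inside `phrase`.
def word_in_phrase(word, phrase):
    phrase_tokens = set(phrase.lower().split())
    return all(token in phrase_tokens for token in word.lower().split())

# e.g. word_in_phrase("acme", "acme corp") -> True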
Example #3
    def generate_feature_vectors():
        # step 1, generate total dict for each feature
        feature_total_dict = {}
        for i in range(8535):
            result_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(result_dict_file):
                result_dict = StoreHelper.load_data(result_dict_file, {})
                for feature in result_dict:
                    DictHelper.append_dic_key(feature_total_dict, feature, result_dict[feature])

        # step 2, generate feature vector for each feature
        feature_vector_header_dict = {}
        for feature in feature_total_dict:
            feature_list = []
            for words_dict in feature_total_dict[feature]:
                feature_list.extend(words_dict.keys())
            feature_list = list(set(feature_list))
            feature_vector_header_dict[feature] = feature_list
        StoreHelper.store_data(feature_vector_header_dict, 'feature_vector_header.dat')

        # step 3, collect value for each feature vector
        feature_vector_dict = {}
        for feature in feature_vector_header_dict:
            feature_dict = {}
            feature_list = feature_vector_header_dict[feature]
            for i in range(8535):
                result_dict_file = "./data/words_only/data/%04d.dat" % i
                if StoreHelper.is_file_exist(result_dict_file):
                    result_dict = StoreHelper.load_data(result_dict_file, {})
                    # guard against posts that lack this feature entirely
                    feature_words = result_dict.get(feature, {})
                    feature_dict[i] = [feature_words.get(words, 0) for words in feature_list]
            feature_vector_dict[feature] = feature_dict
        # print (feature_vector_dict.keys())
        # print (str([len(value[1]) for value in feature_vector_dict.values()]))
        StoreHelper.store_data(feature_vector_dict, 'feature_vector.dat')
        StoreHelper.save_file(feature_vector_dict, 'feature_vector.txt')
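DictHelper.append_dic_key, used in step 1, evidently accumulates a list of per-post word dicts under each feature key. A minimal equivalent, assuming that reading of its usage:

# Hypothetical equivalent of DictHelper.append_dic_key as used above:
# collect values in a list under a key, creating the list on first use.
def append_dic_key(dic, key, value):
    dic.setdefault(key, []).append(value)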
Example #4
 def generate_company_list():
     company_name_dict = StoreHelper.load_data('company_name.dic', {})
     company_dict = {}
     for company_name in company_name_dict.values():
         DictHelper.increase_dic_key(company_dict, company_name)
     print ("Totally %d company" % len(company_dict.keys()))
     StoreHelper.save_file(DictHelper.get_sorted_list(company_dict), "company_dict.txt")
Example #5
    def run_script():
        # Step 1, read url from text file
        crawl_dict = Main.parse_file("./resource/url_list")
        print(crawl_dict)

        # step 2, get job post url from web source
        for location, url_list in crawl_dict.items():
            print("working on %s get job url" % location)
            if StoreHelper.is_file_exist("./data/url/%s.dat" % location):
                print("File already exist, ignore this steps!")
                continue
            url_set = set()
            for url in url_list:
                _list = CrawlHelper.get_all_job_url(url)
                url_set = url_set.union(set(_list))
            print("Totally get %i url for %s\n" % (len(url_set), location))
            if len(url_set) > 0:
                StoreHelper.store_data(list(url_set),
                                       "./data/url/%s.dat" % location)

        # step 3, get job post according to url
        for location, url_list in crawl_dict.items():
            print("working on %s get job post information" % location)
            if StoreHelper.is_file_exist("./data/post/%s.dat" % location):
                print("File already exist, ignore this steps!")
                continue
            CrawlHelper.get_all_job_post("./data/url/%s.dat" % location,
                                         "./data/post/%s.dat" % location)
Example #6
 def extract_company_name():
     crawl_dict = Main.parse_file("./resource/url_list")
     company_name_dict = {}
     total_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         print("Find %i record in %s" % (len(positions), file_name))
         for url, position in positions:
             print("work on position: %4d" % total_numbers)
             company_list = HTMLHelper.get_company_name(position)
             if len(company_list) == 0:
                 print(
                     "Could not find a company name in position %d, url is %s" %
                     (total_numbers, url))
             elif len(company_list) == 1:
                 company_name_dict[total_numbers] = SegmentHelper.normalize(
                     company_list[0])
                 print("Found company name %s for position %d" %
                       (company_list[0], total_numbers))
             else:
                 company_name_dict[total_numbers] = SegmentHelper.normalize(
                     company_list[0])
                 print(
                     "Found multi company name %s for position %d (choose the first one)"
                     % (str(company_list), total_numbers))
             total_numbers += 1
     StoreHelper.save_file(company_name_dict, "company_name.txt")
     StoreHelper.store_data(company_name_dict, "company_name.dic")
     print("In summary, total downloaded %i records!" % total_numbers)
Example #7
 def generate_token_dict(text_file_list):
     token_file_dict = {}
     for text_file in text_file_list:
         file_name = ntpath.basename(text_file)
         if StoreHelper.is_file_exist(text_file):
             file_content = StoreHelper.read_file(text_file)
             lowers = file_content.lower()
             # Python 3: str.translate takes a table, not (None, chars)
             no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
             token_file_dict[file_name] = no_punctuation
     return token_file_dict
Example #8
    def run_script(src_folder,
                   dst_folder,
                   threshold,
                   probability_dict_path=None,
                   generate_dict=True):
        if probability_dict_path is None:
            probability_dict_path = path.join(dst_folder, 'probability.dict')
        if generate_dict is True:
            file_content_list = []
            for i in range(8535):
                input_file = path.join(src_folder, "%04d.dat" % i)
                if StoreHelper.is_file_exist(input_file):
                    file_content_list.append(StoreHelper.read_file(input_file))
                else:
                    print("%s not exist!" % input_file)
            probability_dict = SegmentHelper.generate_probability_dict(
                file_content_list)
            StoreHelper.store_data(probability_dict, probability_dict_path)
            print("Finished generate user dict")
        else:
            probability_dict = StoreHelper.load_data(probability_dict_path, {})
            print("Load dict from file, %i records in dict" %
                  len(probability_dict))

        for i in range(8535):
            input_file = path.join(src_folder, "%04d.dat" % i)
            if StoreHelper.is_file_exist(input_file):
                output_file = path.join(dst_folder, "%04d.dat" % i)
                file_content = StoreHelper.read_file(input_file)
                word_list = []
                for line in file_content.splitlines():
                    word_list.extend(
                        SegmentHelper.phase_segment(probability_dict, line,
                                                    threshold))
                StoreHelper.save_file(os.linesep.join(word_list), output_file)
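A typical invocation of run_script might look like the following; the folders echo paths used elsewhere on this page, but the threshold value is purely illustrative, not taken from the project:

# Illustrative call; the threshold is a made-up value.
run_script(src_folder='./data/clean_post_lemmatize',
           dst_folder='./data/phrase_split',
           threshold=0.0001,
           generate_dict=True)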
Example #9
 def generate_phase_list():
     probability_dict = StoreHelper.load_data('./data/probability.dic', {})
     print ("Get %i dict from file" % len(probability_dict))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             word_file = "./data/phrase_split/%04d.dat" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context)
             position_dict_list = position_helper.convert_2(probability_dict)
             StoreHelper.save_file("\n".join([str(item) for item in position_dict_list]), word_file)
         else:
             print ("%s not exist!" % text_file)
Example #10
    def run_script():
        # Step 1, read url from text file
        crawl_dict = StoreHelper.parse_file("./resource/url_list")

        # step 2
        total_dict = {}
        for location, url_list in crawl_dict.items():
            file_name = "./data/post/%s.dat" % location
            print(file_name)
            if StoreHelper.is_file_exist(file_name):
                total_dict.update(Main.get_frequency_from_file(file_name))

        # sort by frequency, descending; the result is a list of (word, count) pairs
        total_dict = sorted(total_dict.items(), key=operator.itemgetter(1), reverse=True)
        StoreHelper.store_data(total_dict, "word_frequency.dat")
Example #11
    def compute_center_point(exclude_post=(1404, 3721, 4337, 2085, 7246), select_feature=None):
        position_vectors = StoreHelper.load_data('./data/position_vector_01.dat', {})
        for index in exclude_post:
            if index in position_vectors:
                del position_vectors[index]
        vector_list = StoreHelper.load_data('vector.dat', [])

        vector_dict = {'working-year': vector_list[0], 'education': vector_list[1], 'major': vector_list[2],
                       'skills': vector_list[3], 'responsibility': vector_list[4]}
        vector_length = [len(item_list) for item_list in vector_list]
        vector_length_dict = {'working-year': (0, sum(vector_length[:1])),
                              'education': (sum(vector_length[:1]), sum(vector_length[:2])),
                              'major': (sum(vector_length[:2]), sum(vector_length[:3])),
                              'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
                              'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))}

        csv_index = list(position_vectors.keys())  # list() so it can be indexed below

        if select_feature is None:
            csv_column = []
            for item_list in vector_list:
                csv_column.extend(item_list)
            csv_data = list(position_vectors.values())  # list() for indexing in the distance loop
            csv_file = 'center_point.csv'
        else:
            start, end = vector_length_dict[select_feature]
            csv_column = vector_dict[select_feature]
            csv_data = [position[start: end] for position in position_vectors.values()]
            csv_file = '%s_center_point.csv' % select_feature
        center_point = [0 for i in range(len(csv_column))]
        for position in csv_data:
            for i in range(len(center_point)):
                center_point[i] += position[i]
        center_point = [value / len(position_vectors) for value in center_point]
        print ("Center point: %s" % str(center_point))
        StoreHelper.store_data(center_point, 'center_point.dat')
        center_dict = {csv_column[i]: center_point[i] for i in range(len(csv_column))}
        print(center_dict)
        center_list = DictHelper.get_sorted_list(center_dict, sorted_by_key=False)
        print(center_list)
        Main.write_list_to_csv(csv_file, [pair[0] for pair in center_list], [[pair[1] for pair in center_list]])

        max_distance = (0, 0)
        for i in range(len(csv_data)):
            distance = Main.compute_distance(center_point, csv_data[i])
            if distance > max_distance[1]:
                max_distance = (csv_index[i], distance)
        print("max distance: %s" % str(max_distance))
Example #12
 def get_frequency_from_file(file_name):
     _html_list = StoreHelper.load_data(file_name, [])
     _dict = {}
     for _url, _web_source in _html_list:
         clean_content = HTMLHelper.remove_tag(_web_source)
         _dict.update(WordFrequency.get_frequency_dict(clean_content))
     return _dict
Example #13
 def generate_all_text():
     crawl_dict = StoreHelper.parse_file("./resource/url_list")
     count_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         for url, web_source in positions:
             if 'data scientist' in web_source.lower():
                 text_content = HTMLHelper.get_text(web_source)
                 # text_dict = WordFrequency.get_frequency_dict(text_content)
                 # output = [str(item) for item in text_dict]
                 # output.extend([" ", text_content, " ",  url])
                 StoreHelper.save_file(text_content, "./data/datascientist/%04d.txt" % count_numbers)
                 count_numbers += 1
             else:
                 print ("Data Scientist not found in %s!" % url)
Example #14
    def find_position_candidate(position_index,
                                threshold,
                                feature_weight_dict=None):
        if feature_weight_dict is None:
            feature_weight_dict = {
                'years': 0.25,
                'education': 0.25,
                'major': 0.25,
                'skills': 0.25
            }
        profile_vector = StoreHelper.load_data('profile_vector_common.dat', [])
        position_vector = StoreHelper.load_data('position_vector_common.dat',
                                                [])
        index_dict = StoreHelper.load_data('index_dict.dat', {})

        if position_index is None:
            max_distance = []
            count = 0
            total_count = len(position_vector)
            for position in position_vector[:30]:
                print("total positions: %d, now at %d" % (total_count, count))
                count += 1
                distance_list = [
                    Main.generate_match_ratio(position, profile,
                                              feature_weight_dict)
                    for profile in profile_vector
                ]
                max_distance.append(max(distance_list))
            print(max_distance)
            print("max distance %f" % max(max_distance))
            print("Totally %d profile meet requirements" % sum([
                1 if distance > threshold else 0 for distance in max_distance
            ]))
        else:
            position = position_vector[index_dict[position_index]]
            print("Position: %s" % str(position))
            distance_list = [
                Main.generate_match_ratio(position, profile,
                                          feature_weight_dict)
                for profile in profile_vector
            ]
            print(distance_list)
            print("max distance %f" % max(distance_list))
            print("Totally %d profile meet requirements" % sum([
                1 if distance > threshold else 0 for distance in distance_list
            ]))
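Main.generate_match_ratio is not listed on this page. Judging by the feature weight dict, it plausibly computes a weighted per-feature overlap between a position and a profile, each a dict of feature -> phrase list. A hypothetical sketch under that assumption, not the project's code:

# Hypothetical weighted match ratio between a position and a profile.
def generate_match_ratio(position, profile, feature_weight_dict):
    score = 0.0
    for feature, weight in feature_weight_dict.items():
        pos_set = set(position.get(feature, []))
        pro_set = set(profile.get(feature, []))
        if pos_set:
            # fraction of the position's required phrases the profile covers
            score += weight * len(pos_set & pro_set) / float(len(pos_set))
    return score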
Example #15
    def generate_csv_file(value_with_01, file_name='feature', select_feature=None):
        vector_list = StoreHelper.load_data('vector.dat', [])
        # Generate csv column
        csv_column = ['cluster_number', 'position_number']
        if select_feature is None:
            for item_list in vector_list:
                for item in item_list:
                    csv_column.append(item)
        else:
            vector_dict = {'working-year': vector_list[0], 'education': vector_list[1], 'major': vector_list[2],
                           'skills': vector_list[3], 'responsibility': vector_list[4]}
            vector_length = [len(item_list) for item_list in vector_list]
            vector_length_dict = {'working-year': (0, sum(vector_length[:1])),
                                  'education': (sum(vector_length[:1]), sum(vector_length[:2])),
                                  'major': (sum(vector_length[:2]), sum(vector_length[:3])),
                                  'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
                                  'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))}
            start, end = vector_length_dict[select_feature]
            csv_column.extend(vector_dict[select_feature])

        # Generate data
        data_dict = StoreHelper.load_data('./data/position_vector_01.dat', {})
        print ("data_dict row=%d, column=%d" % (len(data_dict), len(data_dict[1])))
        tag_dict = StoreHelper.load_data('position_tag.dat', {})

        # tag dict record {0: [1,4], 2: [2,3]}
        tag_dict = {key: value for key, value in tag_dict.items() if len(value) > 50}
        print ("Tag dict keys after filter: %s" % (str(tag_dict.keys())))
        for key in tag_dict:
            data_column = []
            for number in tag_dict[key]:
                row_value = [int(key), number]
                if select_feature is not None:
                    row_value.extend(data_dict[number][start: end])
                else:
                    row_value.extend(data_dict[number])
                data_column.append(row_value)
            print("data_column row=%d, column=%d" % (len(data_column), len(data_column[1])))
            if select_feature is not None:
                show_vector_list = [vector_dict[select_feature]]
            else:
                show_vector_list = vector_list
            sort_csv_column, sort_data_column = Main.sort_column(csv_column, data_column, show_vector_list, 2, value_with_01)
            print("sort_data_column row=%d, column=%d" % (len(sort_data_column), len(sort_data_column[1])))
            Main.write_list_to_csv('%s_class_%d.csv' % (file_name, key), sort_csv_column, sort_data_column)
Example #16
 def get_only_words_in_5():
     for i in range(8535):
         result_dict = {}
         words_dict_file = "./data/result_dict/%04d.dat" % i
         tfidf_dict_file = "./data/tfidf-dat/%04d.dat" % i
         if StoreHelper.is_file_exist(tfidf_dict_file):
             tfidf_dict = StoreHelper.load_data(tfidf_dict_file, {})
             words_dict = StoreHelper.load_data(words_dict_file, {})
             for _type in words_dict.keys():
                 result_dict[_type] = {}
                 for word in words_dict[_type]:
                     if word in tfidf_dict:
                         result_dict[_type][word] = tfidf_dict[word]
                     else:
                         normal_word = SegmentHelper.normalize(word)
                         if normal_word in tfidf_dict:
                             print ("Saved by normalize for %s" % normal_word)
                             result_dict[_type][word] = tfidf_dict[normal_word]
                         else:
                             print ("%s not found in %s" % (word, tfidf_dict_file))
             # for _type in result_dict.keys():
             #     result_dict[_type] = DictHelper.get_sorted_list(result_dict[_type])
             # print (result_dict.keys())
             StoreHelper.store_data(result_dict, "./data/words_only/data/%04d.dat" % i)
             StoreHelper.save_file(result_dict, "./data/words_only/text/%04d.txt" % i)
Example #17
 def view_downloaded_data():
     crawl_dict = Main.parse_file("./resource/url_list")
     total_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         print("Find %i record in %s" % (len(positions), file_name))
         total_numbers += len(positions)
     print("In summary, total downloaded %i records!" % total_numbers)
Example #18
 def convert_skill_100():
     skills_list = StoreHelper.load_data("position_profile_skills.dat", [])
     skills_convert_dict = {}
     preferred_list = [
         "analysis", "python", "r", "analytics", "machine learning", "sql",
         "modeling", "big data", "hadoop", "java", "statistics",
         "mathematics", "sas", "data mining", "processing", "spark",
         "security", "visualization", "testing", "c", "access",
         "optimization", "hive", "integration", "excel", "tableau",
         "scripting", "development", "scala", "matlab", "linux", "nosql",
         "management", "intelligence", "aws", "regression", "spss", "pig",
         "clustering", "saas", "oracle", "go", "physics", "classification",
         "javascript", "operations research", "mapreduce", "forecasting",
         "engineering", "powerpoint", "automation", "b2b", "segmentation",
         "dashboard", "computing", "deep learning", "defense", "unix",
         "hbase", "d3", "perl", "algorithms", "advertising", "word",
         "communication", "simulation", "data collection", "hardware",
         "command", "apache", "troubleshooting", "ruby", "mongodb", "mysql",
         "probability", "hdfs", "econometrics", "data warehousing", "scrum",
         "cassandra", "databases", "git", "cluster", "statistical software",
         "manufacturing", "improvement", "pricing", "data architecture",
         "critical thinking", "html", "design", "strategy", "fraud",
         "microsoft office", "teradata", "quality assurance",
         "data integration", "experimentation", "customer service",
         "bioinformatics"
     ]
     for key in preferred_list:
         match = False
         if key not in skills_list:
             for skill in skills_list:
                 if key in skill:
                     match = True
                     if skill not in skills_convert_dict:
                         skills_convert_dict[skill] = key
                     else:
                         print("%s key duplicate" % skill)
                     break
         else:
             match = True
             skills_convert_dict[key] = key
         if not match:
             print(key)
     StoreHelper.store_data(skills_convert_dict, 'skills_convert_dict.dat')
     print(len(skills_convert_dict))
Example #19
 def test_average_skills_per_post():
     position_list = StoreHelper.load_data('position_list.dat', [])
     skill_number_list = [
         len(post['skills']) if 'skills' in post else 0
         for post in position_list
     ]
     print(skill_number_list)
     print("total position number %d, average %f skills per post!" %
           (len(position_list),
            sum(skill_number_list) * 1.0 / len(position_list)))
Example #20
 def extract_download_data():
     crawl_dict = Main.parse_file("./resource/url_list")
     total_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         print("Find %i record in %s" % (len(positions), file_name))
         for url, position in positions:
             # step 1, store origin file
             # output1 = "./data/text/%04d.html" % total_numbers
             # StoreHelper.save_file(position, output1)
             output2 = "./data/clean_post_without_header/%04d.dat" % total_numbers
             print("work on position: %4d" % total_numbers)
             status, content = HTMLHelper.get_post(position)
             if not status:
                 print("Error happen on extract %s" % url)
                 # StoreHelper.save_file(position, output2)
             else:
                 StoreHelper.save_file(HTMLHelper.post_clean(content),
                                       output2)
             total_numbers += 1
     print("In summary, total downloaded %i records!" % total_numbers)
Example #21
 def generate_blob_list():
     blob_list = []
     for i in range(8535):
         phrase_dict_file = "./data/result_dict/%04d.dat" % i
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(phrase_dict_file):
             phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
             text_content = StoreHelper.read_file(text_file)
             word_list = []
             for line in text_content.splitlines():
                 if line.endswith('.'):
                     line = line[:-1]
                 for word in line.split(' '):
                     word_list.append(word)
             for _type in phrase_dict.keys():
                 for words in phrase_dict[_type]:
                     for word in words.split(' '):
                         if word in word_list:
                             word_list.remove(word)
                     word_list.append(words)
             blob_list.append(DictHelper.dict_from_count_list(word_list))
     StoreHelper.store_data(blob_list, './data/blob_list.dat')
     return blob_list
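DictHelper.dict_from_count_list presumably turns the word list into a word -> count mapping; collections.Counter yields the same result (an assumption based on the name and how the dicts are consumed by TFIDF below):

from collections import Counter

# Hypothetical equivalent of DictHelper.dict_from_count_list.
def dict_from_count_list(word_list):
    return dict(Counter(word_list))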
Example #22
    def get_post_vector():
        year_list = []
        education_list = []
        major_list = []
        skill_list = []
        responsibility_list = []
        position_tfidf_dict = {}
        for i in range(8535):
            phrase_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(phrase_dict_file):
                phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
                position_tfidf_dict[i] = phrase_dict
                if 'working-year' in phrase_dict:
                    year_list.extend(phrase_dict['working-year'].keys())
                if 'education' in phrase_dict:
                    education_list.extend(phrase_dict['education'].keys())
                if 'major' in phrase_dict:
                    major_list.extend(phrase_dict['major'].keys())
                if 'skills' in phrase_dict:
                    skill_list.extend(phrase_dict['skills'].keys())
                if 'responsibility' in phrase_dict:
                    responsibility_list.extend(phrase_dict['responsibility'].keys())
        year_list = list(set(year_list))
        print ("year list count: %d" % len(year_list))
        education_list = list(set(education_list))
        print("education_list list count: %d" % len(education_list))
        major_list = list(set(major_list))
        print("major_list list count: %d" % len(major_list))
        skill_list = list(set(skill_list))
        print("skill_list list count: %d" % len(skill_list))
        responsibility_list = list(set(responsibility_list))
        print("responsibility_list list count: %d" % len(responsibility_list))
        StoreHelper.store_data([year_list, education_list, major_list, skill_list, responsibility_list], 'vector.dat')

        position_vectors = {}
        for i in range(8535):
            if i in position_tfidf_dict:
                position = []
                # use .get so posts missing a feature contribute zeros
                for feature, word_list in [('working-year', year_list),
                                           ('education', education_list),
                                           ('major', major_list),
                                           ('skills', skill_list),
                                           ('responsibility', responsibility_list)]:
                    feature_dict = position_tfidf_dict[i].get(feature, {})
                    position.extend(feature_dict.get(word, 0) for word in word_list)
                position_vectors[i] = position
        StoreHelper.store_data(position_vectors, './data/position_vector_01.dat')
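Each stored position vector is the concatenation of the five feature sub-vectors, in the order they were saved to vector.dat. A short sketch of recovering one feature's slice, mirroring the vector_length_dict bookkeeping used in Examples #11 and #24:

# Sketch: slice one feature's sub-vector out of a concatenated position
# vector, given the ordered feature lists loaded from vector.dat.
def feature_slice(position, vector_list, feature_index):
    start = sum(len(v) for v in vector_list[:feature_index])
    end = start + len(vector_list[feature_index])
    return position[start:end]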
Example #23
 def get_tfidf():
     blob_dict_list = Main.generate_blob_list()
     profile_dict_list = StoreHelper.load_data('./resource/merged_profile.dat', [])
     blob_dict_list.extend(profile_dict_list)
     tfidf = TFIDF(blob_dict_list)
     j = 0
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print("Working on %s article!" % text_file)
             tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[j])
             StoreHelper.store_data(tf_idf_dict, "./data/tfidf-dat/%04d.dat" % i)
             StoreHelper.save_file(DictHelper.get_sorted_list(tf_idf_dict), "./data/tfidf/%04d.dat" % i)
             j += 1
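The TFIDF class is project code; the standard computation it presumably implements, for a corpus of per-document {word: count} dicts, is sketched below (textbook tf-idf, not necessarily the project's exact weighting):

import math

# Sketch of textbook tf-idf for one document against a corpus of
# {word: count} dicts; assumes doc is part of corpus, so df >= 1.
def tf_idf(doc, corpus):
    total = sum(doc.values())
    n_docs = len(corpus)
    result = {}
    for word, count in doc.items():
        tf = count / float(total)                       # term frequency
        df = sum(1 for d in corpus if word in d)        # document frequency
        result[word] = tf * math.log(n_docs / float(df))
    return result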
Example #24
 def cross():
     profile_list = StoreHelper.load_data('./resource/convert_profile.dat',
                                          [])
     position_dict = StoreHelper.load_data("./data/position_vector_01.dat",
                                           {})
     print(len(list(position_dict.values())[0]))  # length of one position vector
     vector_list = StoreHelper.load_data('vector.dat', [])
     print(sum([len(value) for value in vector_list]))
     vector_dict = {
         'years': vector_list[0],
         'education': vector_list[1],
         'major': vector_list[2],
         'skills': vector_list[3],
         'responsibility': vector_list[4]
     }
     vector_length = [len(item_list) for item_list in vector_list]
     vector_length_dict = {
         'years': (0, sum(vector_length[:1])),
         'education': (sum(vector_length[:1]), sum(vector_length[:2])),
         'major': (sum(vector_length[:2]), sum(vector_length[:3])),
         'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
         'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))
     }
     position_list = []
     index_dict = {}
     count = 0
     for index, position in position_dict.items():
         index_dict[count] = index
         count += 1
         position_phrase_dict = {}
         for feature in vector_dict:
             start, end = vector_length_dict[feature]
             for i in range(len(vector_dict[feature])):
                 if position[start + i] > 0:
                     DictHelper.append_dic_key(position_phrase_dict,
                                               feature,
                                               vector_dict[feature][i])
         position_list.append(position_phrase_dict)
     StoreHelper.store_data(index_dict, 'index_dict.dat')
     StoreHelper.store_data(position_list, 'position_list.dat')
     for feature in ['years', 'education', 'major', 'skills']:
         Main.generate_feature_vector(feature, profile_list, position_list)
Example #25
    def compute_tfidf():
        blob_dict = {}
        total_dict = {}
        probability_dict = StoreHelper.load_data('./data/probability.dic', {})
        print("Get %i dict from file" % len(probability_dict))
        for i in range(8535):
            text_file = "./data/clean_post_lemmatize/%04d.dat" % i
            if StoreHelper.is_file_exist(text_file):
                context = StoreHelper.read_file(text_file)
                position_helper = PositionHelper(context)
                blob_dict[i] = position_helper.convert_2(probability_dict)

        tfidf = TFIDF(blob_dict.values())
        for i in range(8535):
            if i in blob_dict:
                output_file = "./data/tfidf-dat/%04d.dat" % i
                print ("Working on %i article!" % i)
                tf_idf_dict = tfidf.get_tf_idf(blob_dict[i])
                DictHelper.merge_dict(total_dict, tf_idf_dict)
                tf_idf_dict = {key: float("%.6f" % value) for key, value in tf_idf_dict.items()}
                StoreHelper.store_data(tf_idf_dict, output_file)
        StoreHelper.store_data(total_dict, "./data/tfidf.dat")
Example #26
    def run_cluster():
        final_vector = [[0 for j in range(310)] for i in range(4980)]
        key_set = list(StoreHelper.load_data("./resource/feature.dat", {}).keys())  # list() so key_set[j] works
        print("key set length: %i" % len(key_set))

        blob_dict_list = []
        skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
        discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
        education_dict = StoreHelper.load_data("./resource/education.dat", {})
        for i in range(4980):
            text_file = "./data/datascientist/%04d.txt" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            blob_dict_list.append(position_helper.convert(skills_dict, discipline_dict, education_dict)[4])

        tfidf = TFIDF(blob_dict_list)
        for i in range(4980):
            print("Working on %i article!" % i)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[i])
            # tf_idf_dict = {key: "%.6f" % value for key, value in tf_idf_dict.items()}
            for j in range(310):
                if key_set[j] in tf_idf_dict:
                    final_vector[i][j] = tf_idf_dict[key_set[j]]
        StoreHelper.store_data(final_vector, "./data/vectors.dat")
Example #27
 def convert_position():
     skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
     print ("Get %i words from %s" %(len(skills_dict), "skills dict"))
     discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
     print("Get %i words from %s" % (len(discipline_dict), "discipline_dict"))
     education_dict = StoreHelper.load_data("./resource/education.dat", {})
     print("Get %i words from %s" % (len(education_dict), "education_dict"))
     responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
     print("Get %i words from %s" % (len(responsibility_dict), "responsibility_dict"))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print ("working on file %s" % text_file)
             word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
             word_data = "./data/result_dict/%04d.dat" % i
             word_text = "./data/result_dict/%04d.txt" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context, word_list)
             result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict, responsibility_dict, './resource/year_convert.dat')
             StoreHelper.save_file(result_dict, word_text)
             StoreHelper.store_data(result_dict, word_data)
Example #28
 def cluster_features():
     feature_vector_dict = StoreHelper.load_data('feature_vector.dat', {})
     for feature in feature_vector_dict:
         print ("Running cluster for %s" % feature)
         Main.cluster_with_birch(feature_vector_dict[feature])
         Main.generate_csv_file(value_with_01=True, file_name=feature, select_feature=feature)
Example #29
 def generate_feature_list():
     vector_data = StoreHelper.load_data('vector.dat', [])
     vector_dict = {'year': vector_data[0], 'education': vector_data[1], 'major': vector_data[2],
                    'skill': vector_data[3], 'responsibility': vector_data[4]}
     StoreHelper.save_file(vector_dict, 'vector.txt')
Example #30
 def cluster_with_birch(position_dict=None):
     if position_dict is None:
         position_dict = StoreHelper.load_data("./data/position_vector_01.dat", {})
     _vector_list = list(position_dict.values())  # materialize views for downstream indexing
     _index_list = list(position_dict.keys())
     ClusterHelper.birch_cluster(_vector_list, _index_list)
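ClusterHelper.birch_cluster is not shown on this page; scikit-learn's Birch would be the obvious backing implementation. A minimal sketch under that assumption, with an illustrative n_clusters value:

from sklearn.cluster import Birch

# Hypothetical sketch of ClusterHelper.birch_cluster backed by
# scikit-learn's Birch; n_clusters=5 is an illustrative choice.
def birch_cluster(vector_list, index_list, n_clusters=5):
    labels = Birch(n_clusters=n_clusters).fit_predict(vector_list)
    # map each original position index to its cluster label
    return dict(zip(index_list, labels))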