コード例 #1
0
ファイル: analysis.py プロジェクト: dixon-kun/MonsterCrawler
    def run_script():
        """Merge per-location word frequencies and store them sorted.

        Reads the crawl dictionary from ``./resource/url_list`` and, for every
        key (location) whose ``./data/post/<location>.dat`` file exists, merges
        the result of ``Main.get_frequency_from_file`` into one dictionary.
        The merged items are sorted by value in descending order and written
        to ``word_frequency.dat`` as a list of ``(key, value)`` tuples.
        """
        # Step 1: read the crawl dictionary from the url list file.
        crawl_dict = StoreHelper.parse_file("./resource/url_list")

        # Step 2: merge the frequency data of every location that has a file.
        # Only the keys are needed here; the url lists are not used.
        merged = {}
        for location in crawl_dict.keys():
            file_name = "./data/post/%s.dat" % location
            print (file_name)
            if StoreHelper.is_file_exist(file_name):
                # NOTE(review): dict.update overwrites the value of a key that
                # was already present instead of summing it -- presumably the
                # values are counts; confirm whether accumulation is intended.
                merged.update(Main.get_frequency_from_file(file_name))

        # Step 3: sort by value, largest first. The result is a list of
        # tuples, so it gets its own name instead of rebinding the dict.
        ranked = sorted(merged.items(), key=operator.itemgetter(1), reverse=True)
        StoreHelper.store_data(ranked, "word_frequency.dat")
コード例 #2
0
ファイル: analysis.py プロジェクト: dixon-kun/MonsterCrawler
 def generate_all_text():
     """Save the text of every stored page that mentions 'data scientist'.

     Reads the crawl dictionary from ``./resource/url_list`` and, for each
     key (location), loads ``./data/post/<location>.dat`` (falling back to an
     empty list). Every loaded entry is unpacked as ``(url, web_source)``.
     When the lower-cased source contains ``'data scientist'``, its text is
     extracted with ``HTMLHelper.get_text`` and saved under
     ``./data/datascientist/`` in a file named with a zero-padded running
     counter; otherwise a message naming the url is printed.
     """
     crawl_dict = StoreHelper.parse_file("./resource/url_list")
     saved = 0  # number of files written so far; also the next file index
     for location in crawl_dict.keys():
         source_path = "./data/post/%s.dat" % location
         postings = StoreHelper.load_data(source_path, [])
         for url, web_source in postings:
             # Guard clause: report and skip pages without the phrase.
             if 'data scientist' not in web_source.lower():
                 print ("Data Scientist not found in %s!" % url)
                 continue
             text_content = HTMLHelper.get_text(web_source)
             target_path = "./data/datascientist/%04d.txt" % saved
             StoreHelper.save_file(text_content, target_path)
             saved += 1