                         encoding='UTF-8')  # preprocessed corpus (combined)

for i, file in enumerate(tweets_paths):
    try:
        fi = codecs.open(file, 'r', 'utf8')  # open the i-th tweet JSON file
        tweet_datas = json.load(fi)
        print(str(i + 1) + "×" + str(len(tweet_datas)) + "Tweets")
    except Exception:
        continue  # skip files that cannot be read or parsed
    for j, tweet_data in enumerate(tweet_datas):
        text = tweet_data["text"].replace("\r", "").replace("\n", "")
        f_txt.write(text + "\n")
        text = text.replace(",", "")
        """String deletion & word segmentation"""
        text = Delete.delete_twitter(text)  # remove unwanted strings
        text = Wakati.wakati(text)  # wakati-gaki (word segmentation)
        f_pre.write(text)
        with open(os.path.join(
                save_dir_corpus_koko,
                save_dir_name + "_pre_" + str(filenumber) + ".txt"),
                'w', encoding='UTF-8') as file_koko:  # preprocessed output (per file)
            file_koko.write(text)
        filenumber += 1
    fi.close()
f_txt.close()
f_pre.close()
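The block above relies on two helper modules, Delete and Wakati, whose definitions are not included in this listing. As a rough illustration only, the sketch below shows one way such helpers could look, assuming MeCab (via mecab-python3) for word segmentation and simple regular expressions for stripping URLs, mentions, and hashtags; the function names mirror the calls above, but the bodies are hypothetical.

# Hypothetical sketch of Delete.delete_twitter and Wakati.wakati as used above.
# Assumes the mecab-python3 package; the real Preprocessing modules are not shown here.
import re
import MeCab

_tagger = MeCab.Tagger("-Owakati")  # output space-separated surface forms

def delete_twitter(text):
    """Strip URLs, mentions, hashtags and RT markers from a tweet."""
    text = re.sub(r"https?://\S+", "", text)  # URLs
    text = re.sub(r"[@＠]\w+", "", text)       # mentions
    text = re.sub(r"[#＃]\S+", "", text)       # hashtags
    text = re.sub(r"\bRT\b:?", "", text)      # retweet markers
    return text.strip()

def wakati(text):
    """Return the text as space-separated tokens; MeCab appends a trailing newline."""
    return _tagger.parse(text)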
count10 = 0
count11 = 0
for list in filelist:
    wordsnumber = Count.words_count(list)
    file = list.split("\\")[-1]
    totalcount += 1
    if wordsnumber <= 8:
        count10 += 1
        with open(list, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
        sentences = Sentence.sentence(text)
        with open(os.path.join(OUT10, file), 'w', encoding='UTF-8') as file_out:
            for sentence in sentences:
                file_out.write(Delete.delete_wikipedia(sentence) + "\n")
    elif wordsnumber >= 9 and wordsnumber <= 15:
        count11 += 1
        with open(list, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
        sentences = Sentence.sentence(text)
        with open(os.path.join(OUT11, file), 'w', encoding='UTF-8') as file_out:
            for sentence in sentences:
                file_out.write(Delete.delete_wikipedia(sentence) + "\n")
    elif wordsnumber >= 16 and wordsnumber <= 30:
        count1 += 1
        with open(list, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
        sentences = Sentence.sentence(text)
        with open(os.path.join(OUT1, file), 'w', encoding='UTF-8') as file_out:
            for sentence in sentences:
                file_out.write(Delete.delete_wikipedia(sentence) + "\n")

count11 = 0
for list in filelist:
    wordsnumber = Count.words_count(list)
    file = list.split("\\")[-1]
    totalcount += 1
    if wordsnumber <= 8:
        count10 += 1
        with open(list, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
        sentences = Sentence.sentence_novel(text)
        with open(os.path.join(OUT10, file), 'w', encoding='UTF-8') as file_out:
            for sentence in sentences:
                file_out.write(Delete.delete_aozora(sentence) + "\n")
    elif wordsnumber >= 9 and wordsnumber <= 15:
        count11 += 1
        with open(list, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
        sentences = Sentence.sentence_novel(text)
        with open(os.path.join(OUT11, file), 'w', encoding='UTF-8') as file_out:
            for sentence in sentences:
                file_out.write(Delete.delete_aozora(sentence) + "\n")
    elif wordsnumber >= 16 and wordsnumber <= 30:
        count1 += 1
        with open(list, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
        sentences = Sentence.sentence_novel(text)
        with open(os.path.join(OUT1, file), 'w', encoding='UTF-8') as file_out:
            for sentence in sentences:
                file_out.write(Delete.delete_aozora(sentence) + "\n")
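The two listings above depend on Count.words_count, Sentence.sentence, and Sentence.sentence_novel, which are defined elsewhere in the Preprocessing package. The sketch below is only an assumption about how the sentence splitters might work (Python 3.7+, which allows zero-width re.split): splitting after Japanese sentence-final punctuation, with the novel variant keeping closing quotation brackets attached to their sentence. Count.words_count is assumed to return a word count for the file and is not reproduced here.

# Hypothetical sketch of the Sentence helpers; the actual delimiter rules in the
# Preprocessing package may differ.
import re

def sentence(text):
    """Split plain text into sentences after 。？！?!."""
    text = text.replace("\n", "")
    parts = re.split(r"(?<=[。？！?!])", text)
    return [p.strip() for p in parts if p.strip()]

def sentence_novel(text):
    """Like sentence(), but do not split when a closing bracket 」/』 follows."""
    text = text.replace("\n", "")
    parts = re.split(r"(?<=[。？！?!])(?![」』])", text)
    return [p.strip() for p in parts if p.strip()]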
import os
from tqdm import tqdm
import sys

sys.path.append("..")
from Preprocessing import File_operation
from Preprocessing import Delete
from Preprocessing import Sentence

INPUT_DIR = XXXXXXXXXX
dirlist = os.listdir(INPUT_DIR)
OUTPUT_DIR = XXXXXXXXXX

for dir in dirlist:
    print(dir)
    filelist = File_operation.get_all_paths(os.path.join(INPUT_DIR, dir))
    for i, file in enumerate(tqdm(filelist)):
        with open(file, 'r', encoding='UTF-8') as file_in:
            lines = file_in.readlines()
        title = lines[2]
        title = Delete.title(title)
        OUT = os.path.join(OUTPUT_DIR, dir)
        os.makedirs(OUT, exist_ok=True)
        with open(os.path.join(OUT, title + ".txt"), 'w', encoding='UTF-8') as file_out:
            for line in lines[3:]:
                if not line == "\n":
                    sentencelists = Sentence.sentence_novel(line)
                    for sentence in sentencelists:
                        text = Delete.delete_wikipedia(sentence)
                        if not text == "":
                            file_out.write(text + "\n")
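Delete.title, used above and in the crawlers that follow, turns an arbitrary title string into something safe to use as a file name. Its real implementation is not shown; a minimal sketch under that assumption:

# Hypothetical sketch of Delete.title; the actual rules in the Preprocessing
# package may differ.
import re

def title(text):
    """Sanitize a title so it can be used directly as a file name."""
    text = text.strip()
    text = re.sub(r'[\\/:*?"<>|\r\n\t]', "", text)  # characters not allowed in file names
    return text[:100]  # arbitrary length cap to avoid over-long file names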
def main():
    """Settings for the target dictionary and the save destination"""
    eng = "World Encyclopedia"
    jp = "世界大百科事典 第2版"
    OUTPUT_DIR = XXXXXXXXXX
    opener = urllib.request.build_opener()
    opener.addheaders = [(
        'User-Agent',
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    ), ('Accept-Language', 'ja,en-us;q=0.7,en;q=0.3')]
    urllib.request.install_opener(opener)

    """Kotobank's dictionary list page"""
    URL = 'https://kotobank.jp/dictionary/'
    try:
        soup = BeautifulSoup(urllib.request.urlopen(URL).read(), "html.parser")
    except Exception as e:
        print(e)
        print("強制終了(辞書一覧ページ)")
        sys.exit()

    for i, dic_link in enumerate(soup.findAll("a")):
        if "dictionary" in dic_link.get(
                "href") and not "https" in dic_link.get(
                    "href") and not dic_link.get("href") == r"/dictionary/":
            dic_title = str(dic_link.text)
            dic_title = Delete.title(dic_title)  # dictionary name (Japanese)
            dic_eng = dic_link.get("href").split(r"/")[2]  # dictionary name (English)

            """Fetch the specified dictionary"""
            if dic_title == jp:
                print("取得対象:" + dic_title)

                """Decide where to save"""
                OUTPUT = OUTPUT_DIR + "/Raw"
                os.makedirs(OUTPUT, exist_ok=True)

                """Fetch the dictionary from the beginning or from where it left off"""
                if not os.path.exists(os.path.join(OUTPUT_DIR, eng + ".txt")):
                    print("Getting from the beginning.")
                    totalpage = 0  # page counter
                    main_URL = 'https://kotobank.jp' + str(dic_link.get("href"))
                    with open(os.path.join(OUTPUT_DIR, eng + ".txt"), 'w',
                              encoding='UTF-8') as fi_info:
                        fi_info.write(str(totalpage) + "\n" + main_URL)
                else:
                    print("Getting from the continuation.")
                    with open(os.path.join(OUTPUT_DIR, eng + ".txt"), 'r') as fi_info:
                        infos = fi_info.readlines()
                    totalpage = int(infos[0]) - 1
                    main_URL = infos[1]

                """Loop while the page has a "next" link"""
                judge = True  # whether a "next" link exists
                while judge:
                    totalpage += 1
                    print(str(totalpage) + "ページ目")

                    """Headword list page inside the dictionary"""
                    try:
                        search_soup = BeautifulSoup(
                            urllib.request.urlopen(main_URL).read(), "html.parser")
                    except Exception:
                        with open(os.path.join(OUTPUT_DIR, eng + ".txt"), 'w',
                                  encoding='UTF-8') as fi_info:
                            fi_info.write(str(totalpage) + "\n" + main_URL)
                        print("強制終了(辞書内の見出しページ)")
                        sys.exit()

                    linklists = search_soup.findAll("a", rel="dic_" + str(dic_eng))
                    print("リンク数:" + str(len(linklists)))
                    for link in linklists:
                        """Page of each dictionary entry"""
                        item_URL = 'https://kotobank.jp' + str(link.get('href'))
                        try:
                            item_soup = BeautifulSoup(
                                urllib.request.urlopen(item_URL).read(), "html.parser")
                        except Exception as e:
                            with open(os.path.join(OUTPUT_DIR, eng + ".txt"), 'w',
                                      encoding='UTF-8') as fi_info:
                                fi_info.write(str(totalpage) + "\n" + main_URL)
                            print("強制終了(項目のページ)")
                            print(e)
                            sys.exit()

                        """Extract the title and body text"""
                        dics = item_soup.find_all('article')
                        for dic in dics:
                            if dic_title in str(dic("h2")):
                                title = str(dic("h3")[0])
                                title = BeautifulSoup(title, "lxml")
                                title = title.get_text()
                                title = Delete.title(title)
                                if len(dic("section")) == 1:
                                    text = str(dic("section")[0])
                                else:
                                    text = ""
                                    for x in dic("section"):
                                        text += str(x)
                                text = BeautifulSoup(text, "lxml")
                                text = text.get_text()
                                with open(os.path.join(OUTPUT, title + ".txt"), 'w',
                                          encoding='UTF-8') as file_out:
                                    print(title)
                                    file_out.write(text)

                    """Check whether a "next" link exists"""
                    nexts = search_soup.find("li", class_="next")
                    if not nexts == None:
                        judge = True
                        main_URL = 'https://kotobank.jp' + str(nexts.find("a").get("href"))
                    else:
                        judge = False
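One design note on the crawler above: the same "write totalpage and main_URL to the info file, then exit" block is repeated in every except clause so that the crawl can be resumed later from that file. A small helper (not part of the original code) could factor that pattern out; the names below follow the variables used in main():

# Hypothetical helper, not in the original script, that captures the repeated
# "save progress and exit" pattern used in the except clauses above.
import os
import sys

def save_progress_and_exit(output_dir, eng, totalpage, main_URL, message):
    """Persist the current page number and URL, print a message, and stop the crawl."""
    with open(os.path.join(output_dir, eng + ".txt"), 'w', encoding='UTF-8') as fi_info:
        fi_info.write(str(totalpage) + "\n" + main_URL)
    print(message)
    sys.exit()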
def main():
    """Load the configuration file"""
    inifile = configparser.ConfigParser(
        allow_no_value=True,
        interpolation=configparser.ExtendedInterpolation())
    inifile.readfp(codecs.open("./RakutenIchiba_Crawler.ini", 'r', 'utf8'))

    """Save destination paths"""
    save_dir_path = inifile.get('other_settings', 'Raw_save_dir_path')
    save_dir_text_path = inifile.get('other_settings', 'Text_save_dir_path')

    query = {}
    """Common parameters"""
    query['applicationId'] = inifile.get('tokens', 'ApplicationId')
    query['affiliateId'] = inifile.get('tokens', 'AffiliateId')

    """Service-specific parameters"""
    Req_URL = inifile.get('search_params', 'Req_URL')
    keyword = inifile.get('search_params', 'keyword')
    append_keywords = inifile.get('search_params', 'append_keywords')
    if append_keywords == "":
        search_list = [keyword]
    else:
        search_list = search_words_generator(keyword, append_keywords.split(" "))
    page = int(inifile.get('search_params', 'page'))
    query['hits'] = int(inifile.get('search_params', 'hits'))
    # query['NGKeyword'] = inifile.get('search_params', 'NGKeyword')
    query['orFlag'] = int(inifile.get('search_params', 'orFlag'))
    # query['imageFlag'] = int(inifile.get('search_params', 'imageFlag'))
    # query['hasReviewFlag'] = int(inifile.get('search_params', 'hasReviewFlag'))
    # query['itemCode'] = inifile.get('search_params', 'itemCode')
    # query['shopCode'] = inifile.get('search_params', 'shopCode')
    # query['genreId'] = inifile.get('search_params', 'genreId')
    # query['tagId'] = inifile.get('search_params', 'tagId')

    """Run the search"""
    for i, keyword in enumerate(search_list):
        print("検索ワード:" + keyword)
        query['keyword'] = keyword

        """Decide where to save"""
        if len(search_list) == 1 or keyword == search_list[0]:
            save_dir_name = keyword
        elif not len(search_list) == 1 and query['orFlag'] == 0:
            save_dir_name = keyword + "_AND"
        elif not len(search_list) == 1 and query['orFlag'] == 1:
            save_dir_name = keyword + "_OR"
        save_dir = os.path.join(save_dir_path, save_dir_name)
        os.makedirs(save_dir, exist_ok=True)
        save_dir_text = os.path.join(save_dir_text_path, save_dir_name)
        os.makedirs(save_dir_text, exist_ok=True)

        """Fetch page by page"""
        for j in range(page):
            print("page:" + str(j + 1))
            query['page'] = j + 1
            res = search(Req_URL, query)
            print("取得件数:" + str(len(res["Items"])))
            with codecs.open(
                    os.path.join(save_dir, "products" + str(j + 1) + ".json"),
                    'w', 'utf8') as fo:
                json.dump(res, fo, sort_keys=True, indent=4)
            for k in range(len(res["Items"])):
                Item_dic = res["Items"][k]["Item"]
                title = Delete.title(Item_dic["itemName"])
                text = Item_dic["itemCaption"]
                with open(os.path.join(save_dir_text, title + ".txt"), 'w',
                          encoding='UTF-8') as file_text:
                    file_text.write(text)
            sleep(1)  # avoid hitting the API rate limit
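The Rakuten crawler calls search() and search_words_generator(), which are defined elsewhere in the script. As an assumption-marked sketch only: search() is taken to issue a GET request to the Item Search endpoint given in Req_URL and return the parsed JSON, and search_words_generator() to put the plain keyword first and then each combined query, which matches the keyword == search_list[0] check above.

# Hypothetical sketches of search() and search_words_generator(); the real
# definitions are elsewhere in the crawler and may differ.
import json
import urllib.parse
import urllib.request

def search(req_url, query):
    """GET the API endpoint with the given parameters and return the decoded JSON."""
    url = req_url + "?" + urllib.parse.urlencode(query)
    with urllib.request.urlopen(url) as res:
        return json.loads(res.read().decode('utf-8'))

def search_words_generator(keyword, append_keywords):
    """Assumed behaviour: the plain keyword first, then keyword + each extra word."""
    return [keyword] + [keyword + " " + word for word in append_keywords]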
totalcount = 0  # total number of articles
for list in filelist:
    print("ファイル名:" + list)
    with open(os.path.join(INPUT_DIR, list), 'r',
              encoding='UTF-8') as file_in:  # input file
        lines = file_in.readlines()
    for i, line in enumerate(tqdm(lines)):
        if line[0:4] == "<doc":
            begin = i  # line number where the article starts
            totalcount += 1
            title = line.split("=")[-1]
            title = re.sub(">", "", title)
            title = re.sub("\"", "", title)
            title = Delete.title(title)
            title = title + "(" + str(totalcount) + ")"
        if line[0:4] == "</do":
            end = i  # line number where the article ends
            text = ""
            for j in range(begin + 2, end):
                if not lines[j][0] == "\n":  # skip blank lines
                    text += lines[j]
            with open(os.path.join(OUTPUT_DIR, title + ".txt"), 'w',