Example #1
             encoding='UTF-8')  # preprocessed output (combined)

for i, file in enumerate(tweets_paths):
    try:
        fi = codecs.open(file, 'r', 'utf8')
        tweet_datas = json.load(fi)
        print(str(i + 1) + "×" + str(len(tweet_datas)) + " Tweets")
    except Exception:
        continue  # skip files that fail to open or parse

    for j, tweet_data in enumerate(tweet_datas):
        text = tweet_data["text"].replace("\r", "").replace("\n", "")
        f_txt.write(text + "\n")
        text = text.replace(",", "")
        """文字列削除&単語分割"""
        text = Delete.delete_twitter(text)  #文字列削除

        text = Wakati.wakati(text)  #分かち書き
        f_pre.write(text)

        with open(os.path.join(
                save_dir_corpus_koko,
                save_dir_name + "_pre_" + str(filenumber) + ".txt"),
                  'w',
                  encoding='UTF-8') as file_koko:  # preprocessed output (individual)
            file_koko.write(text)

        filenumber += 1
    fi.close()
f_txt.close()
f_pre.close()
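
Example #1 leans on project-local helpers (Delete, Wakati) that the listing does not show. As a rough sketch, Wakati.wakati could be implemented with mecab-python3's wakati output mode; the module and function names here only mirror the calls above and are assumptions, not the project's actual code:

# Hypothetical sketch of the Wakati helper used above (the real module is
# not shown). Assumes the mecab-python3 package is installed.
import MeCab

_tagger = MeCab.Tagger("-Owakati")  # "-Owakati" emits space-separated tokens

def wakati(text):
    """Segment Japanese text into space-separated words."""
    return _tagger.parse(text)  # e.g. "今日はいい天気" -> "今日 は いい 天気 \n"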
Example #2
totalcount = 0
count1 = 0
count10 = 0
count11 = 0

for path in filelist:
    wordsnumber = Count.words_count(path)
    file = path.split("\\")[-1]
    totalcount += 1

    if wordsnumber <= 8:
        count10 += 1
        with open(path, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
            sentences = Sentence.sentence(text)
            with open(os.path.join(OUT10, file), 'w', encoding='UTF-8') as file_out:
                for sentence in sentences:
                    file_out.write(Delete.delete_wikipedia(sentence) + "\n")
    elif 9 <= wordsnumber <= 15:
        count11 += 1
        with open(path, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
            sentences = Sentence.sentence(text)
            with open(os.path.join(OUT11, file), 'w', encoding='UTF-8') as file_out:
                for sentence in sentences:
                    file_out.write(Delete.delete_wikipedia(sentence) + "\n")
    elif 16 <= wordsnumber <= 30:
        count1 += 1
        with open(path, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
            sentences = Sentence.sentence(text)
            with open(os.path.join(OUT1, file), 'w', encoding='UTF-8') as file_out:
                for sentence in sentences:
                    file_out.write(Delete.delete_wikipedia(sentence) + "\n")
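
Example #2 buckets files by length via Count.words_count, which is also not part of the listing. A minimal sketch, assuming the count is the number of MeCab tokens in the file (an assumption; a plain whitespace split would work the same way for pre-tokenized text):

# Hypothetical sketch of Count.words_count (the real helper is not shown).
import MeCab

_tagger = MeCab.Tagger("-Owakati")

def words_count(path):
    """Return the number of words in the file at `path`."""
    with open(path, 'r', encoding='UTF-8') as f:
        return len(_tagger.parse(f.read()).split())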
Example #3
totalcount = 0
count1 = 0
count10 = 0
count11 = 0

for path in filelist:
    wordsnumber = Count.words_count(path)
    file = path.split("\\")[-1]
    totalcount += 1

    if wordsnumber <= 8:
        count10 += 1
        with open(path, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
            sentences = Sentence.sentence_novel(text)
            with open(os.path.join(OUT10, file), 'w',
                      encoding='UTF-8') as file_out:
                for sentence in sentences:
                    file_out.write(Delete.delete_aozora(sentence) + "\n")
    elif 9 <= wordsnumber <= 15:
        count11 += 1
        with open(path, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
            sentences = Sentence.sentence_novel(text)
            with open(os.path.join(OUT11, file), 'w',
                      encoding='UTF-8') as file_out:
                for sentence in sentences:
                    file_out.write(Delete.delete_aozora(sentence) + "\n")
    elif 16 <= wordsnumber <= 30:
        count1 += 1
        with open(path, 'r', encoding='UTF-8') as file_in:
            text = file_in.read()
            sentences = Sentence.sentence_novel(text)
            with open(os.path.join(OUT1, file), 'w',
                      encoding='UTF-8') as file_out:
                for sentence in sentences:
                    file_out.write(Delete.delete_aozora(sentence) + "\n")
Example #4
import os
import sys

from tqdm import tqdm

sys.path.append("..")
from Preprocessing import File_operation
from Preprocessing import Delete
from Preprocessing import Sentence

INPUT_DIR = XXXXXXXXXX
dirlist = os.listdir(INPUT_DIR)
OUTPUT_DIR = XXXXXXXXXX

for dirname in dirlist:
    print(dirname)
    filelist = File_operation.get_all_paths(os.path.join(INPUT_DIR, dirname))
    for i, file in enumerate(tqdm(filelist)):
        with open(file, 'r', encoding='UTF-8') as file_in:
            lines = file_in.readlines()
            title = lines[2]
            title = Delete.title(title)
            OUT = os.path.join(OUTPUT_DIR, dirname)
            os.makedirs(OUT, exist_ok=True)
            with open(os.path.join(OUT, title + ".txt"), 'w',
                      encoding='UTF-8') as file_out:
                for line in lines[3:]:
                    if line != "\n":
                        sentencelists = Sentence.sentence_novel(line)
                        for sentence in sentencelists:
                            text = Delete.delete_wikipedia(sentence)
                            if text != "":
                                file_out.write(text + "\n")
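
Example #4 (and Example #3) split raw text into sentences with Sentence.sentence_novel. One plausible shape for that hidden helper is a regex split on Japanese sentence terminators; this is only a guess at the real module:

# Hypothetical sketch of Sentence.sentence_novel (assumption).
import re

def sentence_novel(text):
    """Split text into sentences on Japanese terminators (。！？)."""
    parts = re.split(r'(?<=[。！？])', text.strip())
    return [p.strip() for p in parts if p.strip()]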
Example #5
def main():
    """取得辞書と保存先の設定"""
    eng = "World Encyclopedia"
    jp = "世界大百科事典 第2版"
    OUTPUT_DIR = XXXXXXXXXX

    opener = urllib.request.build_opener()
    opener.addheaders = [(
        'User-Agent',
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    ), ('Accept-Language', 'ja,en-us;q=0.7,en;q=0.3')]

    urllib.request.install_opener(opener)
    """コトバンクの辞書一覧ページ"""
    URL = 'https://kotobank.jp/dictionary/'
    try:
        soup = BeautifulSoup(urllib.request.urlopen(URL).read(), "html.parser")
    except Exception as e:
        print(e)
        print("強制終了(辞書一覧ページ)")
        sys.exit()

    for i, dic_link in enumerate(soup.findAll("a")):
        href = dic_link.get("href") or ""  # guard against <a> tags without href
        if "dictionary" in href and "https" not in href and href != r"/dictionary/":

            dic_title = str(dic_link.text)
            dic_title = Delete.title(dic_title)  # dictionary name (Japanese)

            dic_eng = href.split(r"/")[2]  # dictionary name (English)
            """指定した辞書の取得"""
            if dic_title == jp:
                print("取得対象:" + dic_title)
                """保存先の決定"""
                OUTPUT = OUTPUT_DIR + "/Raw"
                os.makedirs(OUTPUT, exist_ok=True)
                """「最初から」もしくは「続きから」辞書を取得"""
                if not os.path.exists(os.path.join(OUTPUT_DIR, eng + ".txt")):
                    print("Getting from the beginning.")
                    totalpage = 0  #ページ数
                    main_URL = 'https://kotobank.jp' + str(
                        dic_link.get("href"))
                    with open(os.path.join(OUTPUT_DIR, eng + ".txt"),
                              'w',
                              encoding='UTF-8') as fi_info:
                        fi_info.write(str(totalpage) + "\n" + main_URL)
                else:
                    print("Getting from the continuation.")
                    with open(os.path.join(OUTPUT_DIR, eng + ".txt"),
                              'r', encoding='UTF-8') as fi_info:
                        infos = fi_info.readlines()
                        totalpage = int(infos[0]) - 1
                        main_URL = infos[1]
                """ページ内に「次へ」があったらループ"""
                judge = True  #「次へ」の有無
                while judge:
                    totalpage += 1
                    print(str(totalpage) + "ページ目")
                    """辞書内のページ"""
                    try:
                        search_soup = BeautifulSoup(
                            urllib.request.urlopen(main_URL).read(),
                            "html.parser")
                    except Exception:
                        with open(os.path.join(OUTPUT_DIR, eng + ".txt"),
                                  'w',
                                  encoding='UTF-8') as fi_info:
                            fi_info.write(str(totalpage) + "\n" + main_URL)
                        print("強制終了(辞書内の見出しページ)")
                        sys.exit()

                    linklists = search_soup.findAll("a",
                                                    rel="dic_" + str(dic_eng))
                    print("リンク数:" + str(len(linklists)))
                    for link in linklists:
                        """辞書の各項目のページ"""
                        item_URL = 'https://kotobank.jp' + str(
                            link.get('href'))
                        try:
                            item_soup = BeautifulSoup(
                                urllib.request.urlopen(item_URL).read(),
                                "html.parser")
                        except Exception as e:
                            with open(os.path.join(OUTPUT_DIR, eng + ".txt"),
                                      'w',
                                      encoding='UTF-8') as fi_info:
                                fi_info.write(str(totalpage) + "\n" + main_URL)
                            print("強制終了(項目のページ)")
                            print(e)
                            sys.exit()
                        """タイトルとテキストの取得"""
                        dics = item_soup.find_all('article')
                        for dic in dics:
                            if dic_title in str(dic("h2")):

                                title = str(dic("h3")[0])
                                title = BeautifulSoup(title, "lxml")
                                title = title.get_text()
                                title = Delete.title(title)

                                if len(dic("section")) == 1:
                                    text = str(dic("section")[0])
                                else:
                                    text = ""
                                    for x in dic("section"):
                                        text += str(x)

                                text = BeautifulSoup(text, "lxml")
                                text = text.get_text()

                                with open(os.path.join(OUTPUT, title + ".txt"),
                                          'w',
                                          encoding='UTF-8') as file_out:
                                    print(title)
                                    file_out.write(text)
                    """「次へ」の有無判断"""
                    nexts = search_soup.find("li", class_="next")
                    if not nexts == None:
                        judge = True
                        main_URL = 'https://kotobank.jp' + str(
                            nexts.find("a").get("href"))
                    else:
                        judge = False
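
Every example in this listing passes scraped titles through Delete.title before using them as file names. A minimal sketch of such a sanitizer, assuming it only needs to strip whitespace and characters that are invalid in Windows file names (the real helper may do more):

# Hypothetical sketch of Delete.title (assumption).
import re

def title(text):
    """Make `text` safe to use as a file name."""
    text = text.strip()
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '', text)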
Example #6
def main():
    """設定ファイルの読み込み"""
    inifile = configparser.ConfigParser(
        allow_no_value=True,
        interpolation=configparser.ExtendedInterpolation())
    inifile.read_file(codecs.open("./RakutenIchiba_Crawler.ini", 'r', 'utf8'))
    """Paths for saving output"""
    save_dir_path = inifile.get('other_settings', 'Raw_save_dir_path')
    save_dir_text_path = inifile.get('other_settings', 'Text_save_dir_path')

    query = {}
    """共通パラメーターの設定"""
    query['applicationId'] = inifile.get('tokens', 'ApplicationId')
    query['affiliateId'] = inifile.get('tokens', 'AffiliateId')
    """サービス固有パラメーターの設定"""
    Req_URL = inifile.get('search_params', 'Req_URL')
    keyword = inifile.get('search_params', 'keyword')
    append_keywords = inifile.get('search_params', 'append_keywords')

    if append_keywords == "":
        search_list = [keyword]
    else:
        search_list = search_words_generator(keyword,
                                             append_keywords.split(" "))

    page = int(inifile.get('search_params', 'page'))
    query['hits'] = int(inifile.get('search_params', 'hits'))

    # query['NGKeyword'] = inifile.get('search_params','NGKeyword')
    query['orFlag'] = int(inifile.get('search_params', 'orFlag'))
    # query['imageFlag'] = int(inifile.get('search_params','imageFlag'))
    # query['hasReviewFlag'] = int(inifile.get('search_params','hasReviewFlag'))

    # query['itemCode'] = inifile.get('search_params','itemCode')
    # query['shopCode'] = inifile.get('search_params','shopCode')
    # query['genreId'] = inifile.get('search_params','genreId')
    # query['tagId'] = inifile.get('search_params','tagId')
    """検索実行"""
    for i, keyword in enumerate(search_list):
        print("検索ワード:" + keyword)
        query['keyword'] = keyword
        """保存先の設定"""
        if len(search_list) == 1 or keyword == search_list[0]:
            save_dir_name = keyword
        elif query['orFlag'] == 0:
            save_dir_name = keyword + "_AND"
        else:
            save_dir_name = keyword + "_OR"

        save_dir = os.path.join(save_dir_path, save_dir_name)
        os.makedirs(save_dir, exist_ok=True)
        save_dir_text = os.path.join(save_dir_text_path, save_dir_name)
        os.makedirs(save_dir_text, exist_ok=True)
        """ページごとに取得"""
        for j in range(page):
            print("page:" + str(j + 1))
            query['page'] = j + 1

            res = search(Req_URL, query)
            print("取得件数:" + str(len(res["Items"])))

            with codecs.open(
                    os.path.join(save_dir, "products" + str(j + 1) + ".json"),
                    'w', 'utf8') as fo:
                json.dump(res, fo, sort_keys=True, indent=4)

            for k in range(len(res["Items"])):
                Item_dic = res["Items"][k]["Item"]
                title = Delete.title(Item_dic["itemName"])
                text = Item_dic["itemCaption"]

                with open(os.path.join(save_dir_text, title + ".txt"),
                          'w',
                          encoding='UTF-8') as file_text:
                    file_text.write(text)

            sleep(1)  # be polite to the API rate limit
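
Example #6 calls a search(Req_URL, query) helper that is defined elsewhere. For the Rakuten Ichiba API, which answers GET requests with JSON, it might look roughly like this (an assumption, not the listing's actual code):

# Hypothetical sketch of the search() helper (assumption).
import json
import urllib.parse
import urllib.request

def search(req_url, query):
    """GET `req_url` with `query` as URL parameters and decode the JSON reply."""
    url = req_url + "?" + urllib.parse.urlencode(query)
    with urllib.request.urlopen(url) as res:
        return json.loads(res.read().decode('utf-8'))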
Example #7
totalcount = 0  # total number of articles
for fname in filelist:
    print("File name: " + fname)
    with open(os.path.join(INPUT_DIR, fname), 'r',
              encoding='UTF-8') as file_in:  # input file
        lines = file_in.readlines()
        for i, line in enumerate(tqdm(lines)):
            if line[0:4] == "<doc":
                begin = i  # line index where the article starts
                totalcount += 1

                title = line.split("=")[-1]
                title = re.sub(">", "", title)
                title = re.sub("\"", "", title)
                title = Delete.title(title)

                title = title + "(" + str(totalcount) + ")"

            if line[0:4] == "</do":
                end = i  # line index where the article ends

                text = ""
                for j in range(begin + 2, end):
                    if lines[j][0] == "\n":
                        lines[j][0] == ""
                    else:
                        text += lines[j]

                with open(os.path.join(OUTPUT_DIR, title + ".txt"),
                          'w',
                          encoding='UTF-8') as file_out:
                    file_out.write(text)
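
Example #7 walks WikiExtractor-style output, where each article is wrapped in a <doc ...> element; the line[0:4] checks match the opening and closing tags, and begin + 2 skips the tag line plus the repeated title line. For reference, the input it expects looks something like this (illustrative, not taken from the listing):

<doc id="12" url="https://ja.wikipedia.org/wiki?curid=12" title="日本語">
日本語

日本語は、主に日本で話されている言語である。
</doc>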