def InputWebKB(directory):
    """Read the WebKB dataset under *directory* into a transaction database.

    Expected directory layout (pass the dataset root, e.g.
    ``DB = InputWebKB('Cornell')``)::

        Cornell/            <- *directory*
          course/
          department/ faculty/ other/ project/ staff/
          student/

    Returns a list of ``[index, category_id, token_set]`` transactions,
    where ``category_id`` is the position of the category in the
    module-level ``categories`` list and ``token_set`` is the set of
    tokens BeautifulNoun extracts from the page file.
    """
    database = []
    index = 0
    # Normalise the path so the category name can simply be appended.
    # (str.endswith replaces the original re.search(r'/$', ...) check.)
    if not directory.endswith('/'):
        directory += '/'
    for category_id, category in enumerate(categories):
        category_dir = directory + category
        filelist = glob.glob(category_dir + '/*')
        # Progress output: directory name followed by its file count.
        print(category_dir + str(len(filelist)))
        for filename in filelist:
            # Use a context manager so every page file is closed;
            # the original opened files without ever closing them.
            with open(filename) as page:
                noun = BeautifulNoun(page)
                tokens = set(noun.alltokens())
            database.append([index, category_id, tokens])
            index += 1
    return database
def __init__(self, text): self.text = text # Noiseの削除 # ストップワード # レマタイズ noun = BeautifulNoun(self.text, TYPE='txt', LANGUAGE='en', stem='y', lemm='n', stopwords='y') allterms = [] for sentence_tokens in noun.tokens: # print sentence_tokens allterms = allterms + noun.make_bigrams(sentence_tokens)\ + noun.make_trigrams(sentence_tokens) termfreqlist = [] for term in allterms: m = 0 for i in range(len(termfreqlist)): if term == termfreqlist[i][0]: termfreqlist[i][1] += 1 m = 1 if m == 0: termfreqlist.append([term, 1]) # 頻度を利用して熟語を取り出す # sort(termfreqlist) by count and term-length. termfreqlist.sort(key=lambda x:len(x[0]), reverse=True) termfreqlist.sort(key=lambda x:x[1], reverse=True) print termfreqlist print """# Making Dterms.