# Assumed imports: listdir and pickle come from the standard library, and
# word_tokenize is taken to be NLTK's tokenizer. DOWNLOAD_DIR and the helpers
# (clean_html, remove_punc, plural_to_sing, rm_stop_words, get_loc_job_title)
# are defined elsewhere in this project.
from os import listdir
import pickle
from nltk.tokenize import word_tokenize


def create_index(self):
    # Build an inverted index as (token, document, frequency) triples over
    # every downloaded page, then persist it with pickle.
    file_names = listdir(DOWNLOAD_DIR)
    total = []
    for name in file_names[1:]:
        with open(DOWNLOAD_DIR + "/" + name, "r") as page:
            raw_text = remove_punc(clean_html(page.read()))
        text = word_tokenize(raw_text)
        # Normalise: lowercase, plural to singular, then drop stop words.
        text_stemmed = [plural_to_sing(word.lower()) for word in text]
        text_stemmed = [rm_stop_words(k) for k in text_stemmed]
        text_stemmed = [k for k in text_stemmed if k != ""]
        unique_tokens = set(text_stemmed)
        tokens = [(token, name, text_stemmed.count(token)) for token in unique_tokens]
        total.extend(tokens)
    with open("index.p", "wb") as out:
        pickle.dump(total, out)
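# A minimal lookup sketch, not part of the original module: load the pickled
# (token, document, frequency) triples written by create_index above and
# collect the documents that contain a query term. The term is assumed to have
# gone through the same normalisation (lowercase, singular, stop words removed).
def lookup(term):
    with open("index.p", "rb") as f:
        triples = pickle.load(f)
    # Map each document containing the term to the term's frequency in it.
    return {doc: count for token, doc, count in triples if token == term}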
def create_index():
    # Alternative version: instead of pickling an inverted index, return one
    # [title, job, location, content] record per downloaded page.
    file_names = listdir(DOWNLOAD_DIR)
    total = []
    for name in file_names[1:]:
        with open(DOWNLOAD_DIR + "/" + name, "r") as page:
            txt = page.read()
        raw_text = remove_punc(clean_html(txt))
        title, job, loc = get_loc_job_title(txt)
        text = word_tokenize(raw_text)
        # Normalise: lowercase, plural to singular, then drop stop words.
        text_stemmed = [plural_to_sing(word.lower()) for word in text]
        text_stemmed = [rm_stop_words(k) for k in text_stemmed]
        text_stemmed = [k for k in text_stemmed if k != ""]
        content = " ".join(text_stemmed)
        total.append([title, job, loc, content])
    return total
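# Hypothetical usage sketch, not in the original source: the record-returning
# create_index() variant can be filtered in memory before any further
# processing. The "london" filter string below is purely illustrative.
if __name__ == "__main__":
    records = create_index()
    london_jobs = [r for r in records if "london" in r[2].lower()]
    print(len(records), "records scraped,", len(london_jobs), "located in London")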