Beispiel #1
0
	def create_index(self):
		"""Build a token-frequency index over the downloaded documents.

		For every file in DOWNLOAD_DIR (except the first directory
		entry), tokenize the cleaned text, normalize each token
		(lowercase, singularized, stop words removed) and collect
		(token, file_name, count) triples.  The combined list is
		pickled to "index.p".
		"""
		file_names = listdir(DOWNLOAD_DIR)
		total = []
		# NOTE(review): [1:] skips the first directory entry — presumably
		# a non-document file; confirm this is intentional.
		for name in file_names[1:]:
			# Context manager guarantees the handle is closed even if a
			# helper raises (the original leaked it on error paths).
			with open(DOWNLOAD_DIR + "/" + name, "r") as doc:
				raw_text = remove_punc(clean_html(doc.read()))
			text = word_tokenize(raw_text)
			# Lowercase and singularize every token.
			text_stemmed = [plural_to_sing(word.lower()) for word in text]
			# Drop stop words.  Call rm_stop_words once per token (the
			# original called it twice) and compare with != instead of
			# "is not" — an identity test that is unreliable for strings
			# and a SyntaxWarning since Python 3.8.
			filtered = []
			for token in text_stemmed:
				kept = rm_stop_words(token)
				if kept != "":
					filtered.append(kept)
			# Count in one pass instead of list.count per unique token
			# (O(n*u) -> O(n)).
			counts = {}
			for token in filtered:
				counts[token] = counts.get(token, 0) + 1
			total.extend((tok, name, cnt) for tok, cnt in counts.items())
		# The original passed an open() result straight to pickle.dump,
		# leaking the file object; close it deterministically instead.
		with open("index.p", "wb") as out:
			pickle.dump(total, out)
Beispiel #2
0
def create_index():
	"""Build a searchable record list from the downloaded documents.

	For every file in DOWNLOAD_DIR (except the first directory entry),
	extract title/job/location metadata from the raw text and a
	normalized token string (lowercase, singularized, stop words
	removed) from the cleaned text.

	Returns:
		list of [title, job, loc, content] lists, one per document.
	"""
	file_names = listdir(DOWNLOAD_DIR)
	total = []
	# NOTE(review): [1:] skips the first directory entry — presumably a
	# non-document file; confirm this is intentional.
	for name in file_names[1:]:
		# Context manager guarantees the handle is closed even if a
		# helper raises (the original leaked it on error paths).
		with open(DOWNLOAD_DIR + "/" + name, "r") as doc:
			txt = doc.read()
		raw_text = remove_punc(clean_html(txt))
		# Metadata comes from the raw HTML text, not the cleaned text.
		title, job, loc = get_loc_job_title(txt)
		text = word_tokenize(raw_text)
		# Lowercase and singularize every token.
		text_stemmed = [plural_to_sing(word.lower()) for word in text]
		# Drop stop words.  Call rm_stop_words once per token (the
		# original called it twice) and compare with != instead of
		# "is not" — an identity test that is unreliable for strings
		# and a SyntaxWarning since Python 3.8.
		filtered = []
		for token in text_stemmed:
			kept = rm_stop_words(token)
			if kept != "":
				filtered.append(kept)
		content = " ".join(filtered)
		total.append([title, job, loc, content])
	return total
Beispiel #3
0
 def create_index(self):
     """Build a token-frequency index over the downloaded documents.

     For every file in DOWNLOAD_DIR (except the first directory
     entry), tokenize the cleaned text, normalize each token
     (lowercase, singularized, stop words removed) and collect
     (token, file_name, count) triples.  The combined list is
     pickled to "index.p".
     """
     file_names = listdir(DOWNLOAD_DIR)
     total = []
     # NOTE(review): [1:] skips the first directory entry — presumably
     # a non-document file; confirm this is intentional.
     for name in file_names[1:]:
         # Context manager guarantees the handle is closed even if a
         # helper raises (the original leaked it on error paths).
         with open(DOWNLOAD_DIR + "/" + name, "r") as doc:
             raw_text = remove_punc(clean_html(doc.read()))
         text = word_tokenize(raw_text)
         # Lowercase and singularize every token.
         text_stemmed = [plural_to_sing(word.lower()) for word in text]
         # Drop stop words.  Call rm_stop_words once per token (the
         # original called it twice) and compare with != instead of
         # "is not" — an identity test that is unreliable for strings
         # and a SyntaxWarning since Python 3.8.
         filtered = []
         for token in text_stemmed:
             kept = rm_stop_words(token)
             if kept != "":
                 filtered.append(kept)
         # Count in one pass instead of list.count per unique token
         # (O(n*u) -> O(n)).
         counts = {}
         for token in filtered:
             counts[token] = counts.get(token, 0) + 1
         total.extend((tok, name, cnt) for tok, cnt in counts.items())
     # The original passed an open() result straight to pickle.dump,
     # leaking the file object; close it deterministically instead.
     with open("index.p", "wb") as out:
         pickle.dump(total, out)
Beispiel #4
0
def create_index():
    """Build a searchable record list from the downloaded documents.

    For every file in DOWNLOAD_DIR (except the first directory entry),
    extract title/job/location metadata from the raw text and a
    normalized token string (lowercase, singularized, stop words
    removed) from the cleaned text.

    Returns:
        list of [title, job, loc, content] lists, one per document.
    """
    file_names = listdir(DOWNLOAD_DIR)
    total = []
    # NOTE(review): [1:] skips the first directory entry — presumably a
    # non-document file; confirm this is intentional.
    for name in file_names[1:]:
        # Context manager guarantees the handle is closed even if a
        # helper raises (the original leaked it on error paths).
        with open(DOWNLOAD_DIR + "/" + name, "r") as doc:
            txt = doc.read()
        raw_text = remove_punc(clean_html(txt))
        # Metadata comes from the raw HTML text, not the cleaned text.
        title, job, loc = get_loc_job_title(txt)
        text = word_tokenize(raw_text)
        # Lowercase and singularize every token.
        text_stemmed = [plural_to_sing(word.lower()) for word in text]
        # Drop stop words.  Call rm_stop_words once per token (the
        # original called it twice) and compare with != instead of
        # "is not" — an identity test that is unreliable for strings
        # and a SyntaxWarning since Python 3.8.
        filtered = []
        for token in text_stemmed:
            kept = rm_stop_words(token)
            if kept != "":
                filtered.append(kept)
        content = " ".join(filtered)
        total.append([title, job, loc, content])
    return total