def readFromPath(path):
    collection = []
    filename_list = []
    for foldername in os.listdir(path):
        if os.path.isdir(path + "/" + foldername):
            if foldername != "FARNON":
                complete_path = path + "/" + foldername
                for filename in os.listdir(complete_path):
                    word_set = set()
                    if filename not in not_to_read:
                        with open(complete_path + "/" + filename, errors="ignore") as f:
                            for line in f:
                                line_words = line.split()
                                line_stripped = util.removePunctuation(line_words)
                                line_list = util.lemmatization(line_stripped)
                                for word in line_list:
                                    word_set.add(word)
                        filename_list.append(filename)
                        collection.append((filename, word_set))
        else:
            # Entry is a plain file sitting directly under `path`
            word_set = set()
            complete_path = path + "/" + foldername
            #print("complete_path:", complete_path)
            if foldername not in not_to_read:
                with open(complete_path, errors="ignore") as f:
                    for line in f:
                        line_words = line.split()
                        line_stripped = util.removePunctuation(line_words)
                        line_list = util.lemmatization(line_stripped)
                        for word in line_list:
                            word_set.add(word)
                filename_list.append(foldername)
                collection.append((foldername, word_set))
    print("Corpus collection done")
    return collection, filename_list
def parseJSON():
    data = util.loadJSON(constants.JSON_FILE)
    data_index = []
    for obj in data:
        data_dict = dict()
        temp_text = util.removePunctuation(str(data[obj]['text']))
        stopped_temp_text = util.removeStopWords(temp_text, constants.STOP_LIST)
        temp_length = len(temp_text.split(" "))
        data_dict['text'] = temp_text.lower()
        data_dict['doc_length'] = temp_length
        data_dict['doc_length_stopped'] = len(stopped_temp_text.split(" "))
        meta_data = {
            "index": {
                "_index": constants.INDEX_NAME,
                "_type": constants.TYPE_NAME,
                "_id": str(obj)
            }
        }
        data_index.append(meta_data)
        data_index.append(data_dict)
    print "Complete JSON parsed..."
    return data_index
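
# Hedged usage sketch: the alternating action/source pairs returned by
# parseJSON() match the body format of the low-level Elasticsearch bulk call.
# The client object and connection details below are assumptions for
# illustration, not part of this project.
from elasticsearch import Elasticsearch

es = Elasticsearch()
bulk_body = parseJSON()
es.bulk(index=constants.INDEX_NAME, body=bulk_body)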
def createDictFromRawText(filename):
    stoplist = constants.STOP_LIST + list(string.ascii_lowercase)
    dictionary = corpora.Dictionary(
        util.removePunctuation(line.encode('utf-8', 'ignore').lower()).split()
        for line in ap_corpus)
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
                if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
                if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    print "Removal of stop words done."
    dictionary.compactify()
    print "Dictionary created."
    dictionary.save(filename)
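
# Hedged usage sketch: a dictionary saved by createDictFromRawText() can later
# be reloaded and used to map new text onto gensim bag-of-words vectors. The
# file name and sample sentence here are illustrative only.
from gensim import corpora

dictionary = corpora.Dictionary.load("ap.dict")
bow = dictionary.doc2bow("president signs new trade agreement".split())
print bow  # [(token_id, count), ...]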
def weightedTfIdfScore(path, query_word_list, tf_idf_score, a):
    file_title_dict = readHTML.findFileTitleDict(path)
    for document in file_title_dict.keys():
        title = file_title_dict[document]
        title_list = title.split()
        title_stripped = util.removePunctuation(title_list)
        title_list = util.lemmatization(title_stripped)
        weightedScore = tf_idf_score[document]
        for word in query_word_list:
            if word in title_list:
                weightedScore += (tf_idf_score[document] * a)
        tf_idf_score[document] = weightedScore
    return tf_idf_score
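
# Worked example with hypothetical numbers: for a document whose base tf-idf
# score is 2.0, a boost factor a = 0.5, and two query words found in its
# lemmatized title, the reweighted score is
#   2.0 + 2.0 * 0.5 + 2.0 * 0.5 = 2.0 * (1 + 0.5 * 2) = 4.0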
path = os.getcwd() + '\\20_newsgroups'
dict_tokens = {}
collection, filename_list = util.readFromPath(path)

for tupleVal in collection:
    filename = tupleVal[0]
    text = tupleVal[1]

    # Tokenization - word_tokenize
    tokens = nltk.word_tokenize(text)
    # Punctuation Removal
    stripped = util.removePunctuation(tokens)
    # Lemmatization
    lemmatized_words = util.lemmatization(stripped)
    # Stopword removal
    filtered_text = util.removeStopwords(lemmatized_words)
    # Removing duplicate words from the text
    unique_words = list(set(filtered_text))

    # Creating inverted index
    # Structure - {word, (frequency, postingList)}
    for w in unique_words:
        if w in dict_tokens:
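            # (The original snippet breaks off at the membership test above.
            # The lines below are a hedged completion based on the documented
            # {word: (frequency, postingList)} structure, not the project's
            # actual code.)
            freq, posting_list = dict_tokens[w]
            posting_list.append(filename)
            dict_tokens[w] = (freq + 1, posting_list)
        else:
            dict_tokens[w] = (1, [filename])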
def streamAllDocs():
    for doc_collection in ['corpus.dat']:
        countDoc = 0
        startDoc = False
        endDoc = False
        startText = False
        endText = False
        with open(doc_collection) as f:
            for line in f:
                if not startDoc:
                    match = re.findall(r'<DOC>', line)
                    if len(match) > 0:
                        if match[0] == '<DOC>':
                            countChunks = 0
                            countDoc += 1
                            startDoc = True
                            endText = False
                            textChunk = []
                if startDoc:
                    id_match = re.findall(r"<DOCNO>(.*?)</DOCNO>", line)
                    if len(id_match) > 0:
                        dict_id_val = id_match[0].strip()
                    start_text_match = re.findall(r"<TEXT>", line)
                    if len(start_text_match) > 0:
                        countChunks += 1
                        startText = True
                        endText = False
                    if startText and (not endText):
                        if not (line.strip() == "<TEXT>" or line.strip() == "</TEXT>"):
                            textChunk.append(line.strip())
                            print re.findall(r"\w+\.?\w*", line)
                    end_text_match = re.findall(r'</TEXT>', line)
                    if len(end_text_match) > 0:
                        startText = False
                        endText = True
                    end_match = re.findall(r"</DOC>", line)
                    if len(end_match) > 0:
                        endDoc = True
                        startDoc = False
                        _, psw_text_len, psw_text_bilen = advancedWarfare(textChunk)
                        if constants.ADVANCED_PRE_PROCESSING:
                            final_text, _, _ = advancedWarfare(textChunk)
                        else:
                            final_text = util.alterSpaces(util.removePunctuation(" ".join(textChunk)))
                        if constants.STREAM:
                            yield {
                                "_index": constants.INDEX_NAME,
                                "_type": constants.TYPE_NAME,
                                "_id": dict_id_val,
                                "_source": {
                                    "text": " ".join(textChunk),
                                    ## "bi_doc_length": psw_text_bilen,
                                    "doc_length": psw_text_len,
                                }
                            }
                        else:
                            yield {dict_id_val: {'text': final_text}}
                        textChunk = []
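
# For reference, streamAllDocs() expects TREC-style markup in corpus.dat,
# roughly of the form sketched below (layout and DOCNO are illustrative; the
# actual corpus file may differ in spacing and extra fields):
#
#   <DOC>
#   <DOCNO> AP890101-0001 </DOCNO>
#   <TEXT>
#   Body text of the document ...
#   </TEXT>
#   </DOC>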
def createTempIndex(corpusChunk):
    global vocab
    global v
    global no_of_docs
    global total_tokens
    visited = set()
    coll_count = 0
    iteration = 0
    temp_ind = dict()
    doc_count = 0
    for collection in corpusChunk:
        print "In collection", collection
        currLine = ''
        startDoc = False
        endDoc = False
        startText = False
        endText = False
        coll_count += 1
        with open(collection) as f:
            for line in f:
                if not startDoc:
                    match = re.findall(r'<DOC>', line)
                    if len(match) > 0:
                        if match[0] == '<DOC>':
                            startDoc = True
                            endText = False
                            textChunk = []
                if startDoc:
                    id_match = re.findall(r"<DOCNO>(.*?)</DOCNO>", line)
                    if len(id_match) > 0:
                        curr_doc_no = id_match[0].strip()
                        dict_id_val = DOC_ID_MAP[curr_doc_no]
                        no_of_docs += 1
                        doc_count += 1
                    start_text_match = re.findall(r"<TEXT>", line)
                    if len(start_text_match) > 0:
                        startText = True
                        endText = False
                    if startText and (not endText):
                        if not (line.strip() == "<TEXT>" or line.strip() == "</TEXT>"):
                            currLine = line.strip()
                            ## Text normalize
                            currLine = currLine.lower()
                            currLine = util.removePunctuation(currLine)
                            currLine = unicode(currLine, 'utf-8', 'ignore')
                            textChunk.append(currLine)
                    end_text_match = re.findall(r'</TEXT>', line)
                    if len(end_text_match) > 0:
                        startText = False
                        endText = True
                    end_match = re.findall(r"</DOC>", line)
                    if len(end_match) > 0:
                        endDoc = True
                        startDoc = False
                        tokens = re.findall(constants.TOKENIZING_REGEX, " ".join(textChunk))
                        real_tokens = tokens
                        if constants.REMOVE_STOP_WORDS:
                            tokens = util.removeStopWords(tokens, constants.STOP_LIST)
                        current_doc_len = len(tokens)
                        doc_len_map.update({dict_id_val: current_doc_len})
                        total_tokens += current_doc_len
                        if constants.STEM_DATA:
                            visited = set()
                            stemmed_tokens = util.stemTokens(tokens)
                            for token, stemmed_token in zip(tokens, stemmed_tokens):
                                if token not in vocab:
                                    vocab.add(token)
                                    v.write(token + constants.ENDLINE)
                                if stemmed_token not in visited:
                                    visited.add(stemmed_token)
                                    term_positions = termPositions(stemmed_tokens, stemmed_token)
                                    if not temp_ind.get(stemmed_token):
                                        temp_ind[stemmed_token] = [[dict_id_val, term_positions]]
                                    else:
                                        temp_ind[stemmed_token].append([dict_id_val, term_positions])
                        else:
                            visited = set()
                            for token in tokens:
                                if token not in vocab:
                                    vocab.add(token)
                                    v.write(token + constants.ENDLINE)
                                if token not in visited:
                                    visited.add(token)
                                    ## tf = all_tfs[token]
                                    term_positions = termPositions(real_tokens, token)
                                    if not temp_ind.get(token):
                                        temp_ind[token] = [[dict_id_val, term_positions]]
                                    else:
                                        temp_ind[token].append([dict_id_val, term_positions])
                        textChunk = []
                        if doc_count == 1000:
                            print f.tell()
    return temp_ind
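
# termPositions() is called above but not defined in this excerpt. A minimal
# sketch of what it is assumed to do (return every 0-based position at which
# a term occurs in a token list) is given here; the project's own helper may
# differ.
def termPositions(tokens, term):
    return [pos for pos, tok in enumerate(tokens) if tok == term]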
def readFromPath(path):
    wordList = []
    with open(path, "r") as f:
        #data = f.read()
        lines = f.readlines()
        for line in lines:
            wordList.append(line.strip())
    return wordList


path = os.getcwd() + '\\english2\\english2.txt'
wordList = readFromPath(path)
print(len(wordList))

input_string = "i love cricket, 'but utna ni like karta"
k = 5
input_wordList = input_string.split()
input_wordList = util.removePunctuation(input_wordList)
input_wordList = util.lemmatization(input_wordList)

for input_word in input_wordList:
    dict_suggestions = {}
    if input_word not in wordList:
        for word in wordList:
            dist = editDistance(input_word, word)
            dict_suggestions[word] = dist
        sorted_d = sorted(dict_suggestions.items(), key=operator.itemgetter(1))
        i = 0
        print("Suggestions for the word:", input_word)
        while i < k:
            print(sorted_d[i])
            i = i + 1
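
# editDistance() is used above but not shown in this excerpt. A minimal
# dynamic-programming Levenshtein distance sketch that matches how the loop
# uses it (smaller value means closer match) is given below; the project's
# own implementation may differ.
def editDistance(s1, s2):
    m, n = len(s1), len(s2)
    # dp[i][j] = edits needed to turn s1[:i] into s2[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[m][n]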