Example No. 1
def readFromPath(path):
    # Walk the corpus directory and collect a lemmatized word set per document.
    collection = []
    filename_list = []
    for foldername in os.listdir(path):
        if os.path.isdir(path + "/" + foldername):
            if foldername != "FARNON":
                complete_path = path + "/" + foldername
                for filename in os.listdir(complete_path):
                    word_set = set()
                    if filename not in not_to_read:
                        with open(complete_path + "/" + filename, errors="ignore") as f:
                            for line in f:
                                line_words = line.split()
                                line_stripped = util.removePunctuation(line_words)
                                line_list = util.lemmatization(line_stripped)
                                for word in line_list:
                                    word_set.add(word)
                        collection.append((filename, word_set))
        else:
            word_set = set()
            complete_path = path + "/" + foldername
            #print("complete_path:", complete_path)
            if foldername not in not_to_read:
                with open(complete_path, errors="ignore") as f:
                    for line in f:
                        line_words = line.split()
                        line_stripped = util.removePunctuation(line_words)
                        line_list = util.lemmatization(line_stripped)
                        for word in line_list:
                            word_set.add(word)
                collection.append((foldername, word_set))
    print("Corpus collection done")
    return collection
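
The util module these examples depend on is not shown on this page. As a rough sketch only: the list-in, list-out helpers used in Example No. 1 could be implemented with the standard library and NLTK along these lines (other examples below, e.g. Example No. 2, pass a whole string instead, so the real helpers differ per project):

# Hypothetical sketch of the util helpers assumed by Example No. 1
# (requires the NLTK wordnet data; not the authors' actual implementation).
import string
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()
_punct_table = str.maketrans('', '', string.punctuation)

def removePunctuation(words):
    # Strip punctuation characters from each token and drop empty results.
    stripped = [w.translate(_punct_table) for w in words]
    return [w for w in stripped if w]

def lemmatization(words):
    # Reduce each token to its WordNet lemma.
    return [_lemmatizer.lemmatize(w) for w in words]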
Example No. 2
def parseJSON():
    data = util.loadJSON(constants.JSON_FILE)
    data_index = []
    for obj in data:
        data_dict = dict()
        temp_text = util.removePunctuation(str(data[obj]['text']))
        stopped_temp_text = util.removeStopWords(temp_text, constants.STOP_LIST)
        temp_length = len(temp_text.split(" "))
        data_dict['text'] = temp_text.lower()
        data_dict['doc_length'] = temp_length
        data_dict['doc_length_stopped'] = len(stopped_temp_text.split(" "))

        meta_data = {
            "index": {
                "_index": constants.INDEX_NAME,
                "_type": constants.TYPE_NAME,
                "_id": str(obj)
            }
        }

        data_index.append(meta_data)
        data_index.append(data_dict)

    print "Complete JSON parsed..."
    return data_index
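
The list returned by parseJSON interleaves action metadata and document sources, which is the bulk request body format Elasticsearch expects. A hypothetical driver, assuming an elasticsearch-py client from the same era as the _type field (6.x or earlier):

# Hypothetical usage of the interleaved action/source list built by parseJSON.
from elasticsearch import Elasticsearch

es = Elasticsearch()
bulk_body = parseJSON()
es.bulk(body=bulk_body)  # one request; split into chunks for very large corpora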
def createDictFromRawText(filename):
    stoplist = constants.STOP_LIST + list(string.ascii_lowercase)
    dictionary = corpora.Dictionary(
        util.removePunctuation(line.encode('utf-8', 'ignore').lower()).split()
        for line in ap_corpus)
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    print "Removal of stop words done."
    dictionary.compactify()
    print "Dictionary created."
    dictionary.save(filename)
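
createDictFromRawText only saves the gensim dictionary to disk. A hypothetical follow-up, assuming the same gensim corpora API and an illustrative filename, reloads it and builds a bag-of-words vector:

# Hypothetical usage; 'ap.dict' is an illustrative filename.
from gensim import corpora

dictionary = corpora.Dictionary.load('ap.dict')
bow_vector = dictionary.doc2bow("the stock market fell sharply".lower().split())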
Example No. 4
def weightedTfIdfScore(path, query_word_list, tf_idf_score, a):
    file_title_dict = readHTML.findFileTitleDict(path)
    for document in file_title_dict.keys():
        title = file_title_dict[document]
        title_list = title.split()
        title_stripped = util.removePunctuation(title_list)
        title_list = util.lemmatization(title_stripped)
        weightedScore = tf_idf_score[document]
        for word in query_word_list:
            if word in title_list:
                weightedScore += (tf_idf_score[document] * a)
        tf_idf_score[document] = weightedScore
    return tf_idf_score
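
A hypothetical call, assuming tf_idf_score already maps document names to their unweighted scores and using a boost factor a = 0.2 for every query word found in a document's title (the query string is illustrative):

# Hypothetical usage of weightedTfIdfScore.
query_word_list = util.lemmatization(util.removePunctuation("solar eclipse".split()))
boosted_scores = weightedTfIdfScore(path, query_word_list, tf_idf_score, 0.2)
ranking = sorted(boosted_scores.items(), key=lambda kv: kv[1], reverse=True)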
Example No. 5
path = os.getcwd() + '\\20_newsgroups'

dict_tokens = {}

collection, filename_list = util.readFromPath(path)

for tupleVal in collection:

    filename = tupleVal[0]
    text = tupleVal[1]

    # Tokenization - word_tokenize
    tokens = nltk.word_tokenize(text)

    # Punctuation Removal
    stripped = util.removePunctuation(tokens)

    # Lemmatization
    lemmatized_words = util.lemmatization(stripped)

    # Stopword removal
    filtered_text = util.removeStopwords(lemmatized_words)

    # Removing duplicate words from the text
    unique_words = list(set(filtered_text))

    # Creating inverted index
    # Structure - {word, (frequency, postingList)}

    for w in unique_words:
        if w in dict_tokens.keys():
            # Word already indexed from earlier documents: bump its document
            # frequency and extend the posting list.
            frequency, posting_list = dict_tokens[w]
            posting_list.append(filename)
            dict_tokens[w] = (frequency + 1, posting_list)
        else:
            dict_tokens[w] = (1, [filename])
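
Once the loop finishes, the inverted index maps each word to a (document frequency, posting list) pair, so a conjunctive query is just a posting-list intersection. A minimal illustrative lookup (the terms are illustrative):

# Documents containing both "ball" and "game".
postings_ball = set(dict_tokens.get("ball", (0, []))[1])
postings_game = set(dict_tokens.get("game", (0, []))[1])
print(postings_ball & postings_game)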
def streamAllDocs():
    for doc_collection in ['corpus.dat']:
        countDoc = 0
        startDoc = False
        endDoc = False
        startText = False
        endText = False

        with open(doc_collection) as f:
            for line in f:
                if not startDoc:
                    match = re.findall(r'<DOC>', line)
                    if len(match) > 0:
                        if match[0] == '<DOC>':
                            countChunks = 0
                            countDoc += 1
                            startDoc = True
                            endText = False
                            textChunk = []

                if startDoc:
                    id_match = re.findall(r"<DOCNO>(.*?)</DOCNO>", line)
                    if len(id_match) > 0:
                        dict_id_val = id_match[0].strip()

                    start_text_match = re.findall(r"<TEXT>", line)
                    if len(start_text_match) > 0:
                        countChunks += 1
                        startText = True
                        endText = False

                    if startText and (not endText):
                        if not (line.strip() == "<TEXT>" or line.strip() == "</TEXT>"):
                            textChunk.append(line.strip())
                            print re.findall(r"\w+\.?\w*", line)

                    end_text_match = re.findall(r'</TEXT>', line)
                    if len(end_text_match) > 0:
                        startText = False
                        endText = True

                    end_match = re.findall(r"</DOC>", line)

                    if len(end_match) > 0:
                        endDoc = True
                        startDoc = False
                        _, psw_text_len, psw_text_bilen = advancedWarfare(textChunk)
                        if constants.ADVANCED_PRE_PROCESSING:
                            final_text, _, _ = advancedWarfare(textChunk)
                        else:
                            final_text = util.alterSpaces(util.removePunctuation(" ".join(textChunk)))
                        if constants.STREAM:
                            yield {
                                "_index": constants.INDEX_NAME,
                                "_type": constants.TYPE_NAME,
                                "_id": dict_id_val,
                                "_source": {
                                    "text": " ".join(textChunk),
                                    ##                                    "bi_doc_length" : psw_text_bilen,
                                    "doc_length": psw_text_len,
                                }
                            }
                        else:
                            yield {
                                dict_id_val: {
                                    'text': final_text
                                }
                            }
                        textChunk = []
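
streamAllDocs yields one action per <DOC> block, so the generator can be fed straight into a bulk helper. A hypothetical driver, assuming elasticsearch-py's helpers module and constants.STREAM set to True:

# Hypothetical driver for the generator above.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
successes, errors = helpers.bulk(es, streamAllDocs(), raise_on_error=False)
print("indexed %d documents" % successes)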
def createTempIndex(corpusChunk):
    global vocab
    global v
    global no_of_docs
    global total_tokens

    visited = set()
    coll_count = 0
    iteration = 0
    temp_ind = dict()
    doc_count = 0

    for collection in corpusChunk:
        print "In collection", collection
        currLine = ''
        blah = ''
        startDoc = False
        endDoc = False
        startText = False
        endText = False
        coll_count += 1

        with open(collection) as f:
            for line in f:
                if not startDoc:
                    match = re.findall(r'<DOC>', line)
                    if len(match) > 0:
                        if match[0] == '<DOC>':
                            startDoc = True
                            endText = False
                            textChunk = []

                if startDoc:
                    id_match = re.findall(r"<DOCNO>(.*?)</DOCNO>", line)
                    if len(id_match) > 0:
                        curr_doc_no = id_match[0].strip()
                        dict_id_val = DOC_ID_MAP[curr_doc_no]
                        no_of_docs += 1
                        doc_count += 1

                    start_text_match = re.findall(r"<TEXT>", line)
                    if len(start_text_match) > 0:
                        startText = True
                        endText = False

                    if startText and (not endText):
                        if not (line.strip() == "<TEXT>" or \
                                            line.strip() == "</TEXT>"):
                            currLine = line.strip()
                            ## Text normalize
                            currLine = currLine.lower()
                            currLine = util.removePunctuation(currLine)
                            currLine = unicode(currLine, 'utf-8', 'ignore')
                            textChunk.append(currLine)

                    end_text_match = re.findall(r'</TEXT>', line)
                    if len(end_text_match) > 0:
                        startText = False
                        endText = True

                    end_match = re.findall(r"</DOC>", line)

                    if len(end_match) > 0:
                        endDoc = True
                        startDoc = False
                        tokens = re.findall(constants.TOKENIZING_REGEX, \
                                            " ".join(textChunk))
                        real_tokens = tokens
                        if constants.REMOVE_STOP_WORDS:
                            tokens = util.removeStopWords(tokens, constants.STOP_LIST)
                        current_doc_len = len(tokens)
                        doc_len_map.update({dict_id_val: current_doc_len})
                        total_tokens += current_doc_len

                        if constants.STEM_DATA:
                            visited = set()
                            stemmed_tokens = util.stemTokens(tokens)
                            for token, stemmed_token in zip(tokens, stemmed_tokens):
                                if token not in vocab:
                                    vocab.add(token)
                                    v.write(token + constants.ENDLINE)
                                if stemmed_token not in visited:
                                    visited.add(stemmed_token)
                                    term_positions = termPositions(stemmed_tokens, stemmed_token)
                                    if not temp_ind.get(stemmed_token):
                                        temp_ind[stemmed_token] = \
                                            [[dict_id_val, term_positions]]
                                    else:
                                        temp_ind[stemmed_token].append \
                                            ([dict_id_val, term_positions])
                        else:
                            visited = set()
                            for token in tokens:
                                if token not in vocab:
                                    vocab.add(token)
                                    v.write(token + constants.ENDLINE)
                                if token not in visited:
                                    visited.add(token)
                                    ##                                    tf = all_tfs[token]
                                    term_positions = termPositions(real_tokens, token)
                                    if not temp_ind.get(token):
                                        temp_ind[token] = \
                                            [[dict_id_val, term_positions]]
                                    else:
                                        temp_ind[token].append \
                                            ([dict_id_val, term_positions])
                        textChunk = []
                        if doc_count == 1000:
                            print f.tell()
    return temp_ind
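
The termPositions helper called above is not shown; a minimal sketch consistent with how it is used (the 0-based positions at which a token occurs in the document) would be:

# Hypothetical sketch of termPositions.
def termPositions(tokens, token):
    return [position for position, t in enumerate(tokens) if t == token]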
Example No. 8
def readFromPath(path):
    wordList = []
    f = open(path, "r")
    #data = f.read()
    lines = f.readlines()
    for line in lines:
        wordList.append(line.strip())
    f.close()
    return wordList


path = os.getcwd() + '\\english2\\english2.txt'

wordList = readFromPath(path)
print(len(wordList))
input_string = "i love cricket, 'but utna ni like karta"
k = 5
input_wordList = input_string.split()
input_wordList = util.removePunctuation(input_wordList)
input_wordList = util.lemmatization(input_wordList)

for input_word in input_wordList:
    dict_suggestions = {}
    if input_word not in wordList:
        for word in wordList:
            dist = editDistance(input_word, word)
            dict_suggestions[word] = dist
        sorted_d = sorted(dict_suggestions.items(), key=operator.itemgetter(1))
        i = 0
        print("Suggestions for the word:", input_word)
        while i < k:
            print(sorted_d[i])
            i = i + 1
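
The editDistance function used above is not defined on this page; a standard Levenshtein distance computed with dynamic programming is one plausible implementation:

# Hypothetical sketch of editDistance (Levenshtein distance).
def editDistance(a, b):
    # dp[i][j] = minimum edits to turn a[:i] into b[:j]
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        dp[i][0] = i
    for j in range(len(b) + 1):
        dp[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # substitution
    return dp[len(a)][len(b)]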