def remove_common_words_with_known_words(index_array: InvertedIndex):
    # Persian stopwords (prepositions, conjunctions and particles) to strip from the index
    known_words = [
        "از", "با", "به", "برای", "تا", "در", "که", "ازای", "یا", "پس", "اگر",
        "اما", "زیرا", "لکن", "لیکن", "را", "نیز", "ولی", "هم", "بی", "بدون"
    ]
    for x in known_words:
        index_array.remove_word(x)
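# Hedged usage sketch (not from the original): it assumes the InvertedIndex API
# seen elsewhere in these snippets, i.e. add_id(word, doc_id) to index a term and
# remove_word(word) to drop it again.
import InvertedIndex

index = InvertedIndex.InvertedIndex()
for word in ["کتاب", "از", "برای"]:          # one content word plus two stopwords
    index.add_id(word, 1)
remove_common_words_with_known_words(index)  # strips the Persian stopwords listed above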
def spimi(self, index, tokeniser=None):
    """Implements the SPIMI index construction algorithm."""
    if tokeniser is None:
        tokeniser = tk.Tokeniser()
    # Number of blocks, rounded up so a partial final block is still indexed
    numberofblocks = (len(self.fileList) + self.block - 1) // self.block
    if numberofblocks < 1:
        numberofblocks = 1
    for n in range(numberofblocks):
        index.clear()
        for doc in self.fileList[n * self.block:(n * self.block) + self.block]:
            self.parse(doc, index, tokeniser)
        ii.save("index/index" + str(n) + ".csv", index)
    # Merge the per-block indexes into a single index file
    ii.mergeFile("index/fullindex.csv",
                 ["index/index" + str(n) + ".csv" for n in range(numberofblocks)])
    self.save()
def loadFiles(fileName):
    invertedIndex = InvertedIndex.InvertedIndex()
    currentDir = os.getcwd()
    workingDir = os.getcwd()
    questionsDir = workingDir + "/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/"
    os.chdir(questionsDir)
    # Walk every S* directory, set* subdirectory and *.clean file in the dataset
    for sDir in glob.glob("S*"):
        dataDir = questionsDir + sDir + "/data/"
        os.chdir(dataDir)
        print(sDir)
        for set in glob.glob("set*"):
            os.chdir(dataDir + set)
            print(set)
            for file in glob.glob("*.clean"):
                fullFileName = dataDir + set + "/" + file
                print(fullFileName)
                stemmedFile = stemText(path(file).text(encoding="utf8"))
                print("File stemmed")
                invertedIndex.indexDocument(stemmedFile, fullFileName)
                print("File added to index")
    os.chdir(currentDir)
    invertedIndex.save(fileName)
def DocIndex():
    '''Imports the inverted index file and finds the number of documents each word occurs in'''
    global TotalFiles
    global FeatureDict
    TempDict = {}
    try:
        FeatureWordPath = os.getcwd()
        FeatureWordPath += '\\FeatureWords.txt'
        featurefile = open(FeatureWordPath, 'w')
        for wrd in FeatureDict:
            wrd = wrd.strip()
            featurefile.write(wrd + '\n')
        try:
            InvertedIndex.main(sys.argv[1])
        except:
            print 'Error in the Inverted Index File'
            raise
        path = os.getcwd()
        path += '\\IndexDocument.txt'
        files = open(path, 'r').readlines()
        for Line in files:
            temp = re.split(' ', Line)
            Word = str(temp[0])
            Freq = float(temp[1].strip())
            TempDict[Word] = Freq
        # Compute idf = log(N / df) for every feature word found in the index
        for word in TempDict:
            if word in FeatureDict:
                try:
                    idf = float(math.log(float(TotalFiles) / float(TempDict[word])))
                    FeatureDict[word][19] = str(idf)
                except:
                    print word
                    pass
    except:
        print 'File Not Found'
        raise
def champion_list_creator(invertedIndex: InvertedIndex, docs, r):
    result = InvertedIndex.InvertedIndex()
    for i in range(len(invertedIndex.index_array)):
        word = invertedIndex.index_array[i].word
        arr = invertedIndex.index_array[i].doc_ids
        temp_res = []
        for j in range(len(arr)):
            id_location = Document.documents_binary_search(docs, 0, len(docs) - 1, arr[j])
            weight = WeightCalculator.weight_calculator_doc(word, docs, docs[id_location])
            temp_res.append([arr[j], weight])
        # Keep only the r highest-weighted documents for this term (the champion list)
        res = sorter(temp_res, r)
        index = InvertedIndex.Index(word)
        index.set_docs_id(res)
        result.index_array.append(index)
    return result
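# The sorter(temp_res, r) helper used above is not defined in this snippet. A minimal
# sketch of what champion-list construction needs from it (keep only the r
# highest-weighted [doc_id, weight] pairs); an assumption, not the original code.
def sorter(pairs, r):
    pairs.sort(key=lambda pair: pair[1], reverse=True)  # sort by weight, descending
    return pairs[:r]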
def __init__(self):
    self.tmp_ii = InvertedIndex.tmp_ii()
    self.titleout = open('title.txt', 'w')
    self.textout = open('text.txt', 'w')
    self.in_page = False
    self.in_title = False
    self.in_revision = False
    self.in_text = False
    self.title = None
    self.text = None
def showDocumentText(self, documentsByTermProximity, distanceFromTerm):
    """Show text that may answer the query from the top 3 most relevant documents,
    determined by cosine similarity to the query"""
    totalSections = 0
    print("Documents Retrieved: " + str(len(documentsByTermProximity)))
    blurbInvertedIndex = InvertedIndex.InvertedIndex()
    blurbList = []
    for doc in documentsByTermProximity:
        docId = doc[0]
        sectionsFound = len(doc[1]) // 2  # term positions come in pairs
        documentFile = self.invertedIndex.listOfFiles[docId - 1]
        rawDocumentText = Path(documentFile).text(encoding='utf8')
        documentText = [
            word.lower().replace('\n', '')
            for word in rawDocumentText.split(' ') if word.strip() != ''
        ]
        i = 0
        while i < len(doc[1]) - 1:
            positionA = doc[1][i]
            positionB = doc[1][i + 1]
            # Widen the blurb by distanceFromTerm words on the left, clamped at 0
            if positionA - distanceFromTerm > 0:
                positionA = positionA - distanceFromTerm
            else:
                positionA = 0
            termsInDoc = ' '.join(documentText[positionA:(positionB + distanceFromTerm)])
            blurbList.append(termsInDoc)
            blurbInvertedIndex.indexDocument(stemText(termsInDoc), '')
            i += 2
        totalSections += sectionsFound
    print("Sections Retrieved: " + str(totalSections))
    blurbMatrix = blurbInvertedIndex.createTermDocMatrix()
    bestBlurbs = self.getKNearestDocs(self.query, blurbMatrix, 3)
    print("Showing Best 3 Results:")
    for blurbNum in bestBlurbs:
        print(blurbNum)
        print(blurbList[blurbNum - 1])
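# getKNearestDocs is called above but not defined in this excerpt. A minimal,
# self-contained sketch of a cosine-similarity top-k helper; the query vector, the
# term-document matrix layout (terms as rows, documents as columns) and the 1-based
# document numbering are assumptions, not the original implementation.
import numpy as np

def get_k_nearest_docs(query_vector, term_doc_matrix, k):
    scores = []
    for doc_num in range(term_doc_matrix.shape[1]):
        doc_vector = term_doc_matrix[:, doc_num]
        denom = np.linalg.norm(query_vector) * np.linalg.norm(doc_vector)
        similarity = float(np.dot(query_vector, doc_vector) / denom) if denom else 0.0
        scores.append((doc_num + 1, similarity))  # 1-based document numbers
    scores.sort(key=lambda item: item[1], reverse=True)
    return [doc_num for doc_num, _ in scores[:k]]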
def token_to_postings_list(document_id, token, position, ii_buffer):
    # Look up the token ID (and its document count) in the DB
    (token_id, docs_count) = wiserdb.db_get_token_id(token)
    # Check whether a temporary inverted-index entry already exists for this token
    if ii_buffer is not None:
        ii_entry = find_token_from_index(token_id, ii_buffer)
    else:
        ii_entry = None
    if ii_entry is not None:
        # The token already has a postings list
        pl = ii_entry.postings_list
        exist = False
        for pl_entry in pl:
            # A document containing this token is already recorded
            if pl_entry.document_id == document_id:
                pl_entry.positions_count += 1
                pl_entry.positions.append(position)
                exist = True
                break
        # First occurrence of the token in this document
        if not exist:
            pl_entry = InvertedIndex.PostingsListEntry(document_id, 1, position)
            pl.append(pl_entry)
            ii_entry.docs_count += 1
        ii_entry.positions_count += 1
        ii_entry.postings_list = pl
    else:
        # No existing postings list for this token
        pl_entry = InvertedIndex.PostingsListEntry(document_id, 1)
        pl_entry.positions.append(position)
        ii_entry = InvertedIndex.InvertedIndexEntry(token_id, docs_count, 1)
        ii_entry.postings_list.append(pl_entry)
        ii_buffer.append(ii_entry)
    return ii_buffer
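# find_token_from_index is called above but not defined in this snippet. A minimal
# sketch, assuming ii_buffer is a list of InvertedIndex.InvertedIndexEntry objects
# that each carry a token_id attribute; an assumption, not the original helper.
def find_token_from_index(token_id, ii_buffer):
    for ii_entry in ii_buffer:
        if ii_entry.token_id == token_id:
            return ii_entry
    return None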
def driverInvertedobject():
    # Load stopwords
    stopwords = StopWordManger("stopwords")
    # Load data
    Invertobj = InvertedIndex("data/ap89_collection.html", stopwords.list_stopwords)
    # Load query list
    Qlist = QueryList("querys/query_list.txt", stopwords.list_stopwords, 'results_file')
    # Run queries
    Qlist.runQuerylist(Invertobj.dict_terms, Invertobj.dict_docs)
def __init__(self):
    # Configure logging and databases (alienvault + osvdb)
    self.logger = Logger.logger
    _CONF = OssimConf()
    _OSVDB = "osvdb"
    self._DB = OssimDB(_CONF[VAR_DB_HOST], _CONF[VAR_DB_SCHEMA],
                       _CONF[VAR_DB_USER], _CONF[VAR_DB_PASSWORD])
    self._osvdb = OssimDB(_CONF[VAR_DB_HOST], _OSVDB,
                          _CONF[VAR_DB_USER], _CONF[VAR_DB_PASSWORD])
    # Connect to both databases
    avbool = self._DB.connect()
    osvbool = self._osvdb.connect()
    if not avbool:
        self.logger.error("[vcad][x] error connecting to database alienvault")
    if not osvbool:
        self.logger.error("[vcad][x] error connecting to database osvdb")
    self.invertedIndex = InvertedIndex.InvertedIndex()
    self.logger.info("[vcad][+] init complete.")
    return
from collections import defaultdict
from math import log2

def calculateCosSim(documents, query, inverted, maxtf, doclength):
    top50 = defaultdict(list)
    querywords = InvertedIndex.processText(query)
    qf = dict()
    # Get query term frequencies
    for index, word in querywords:
        qf[word] = qf.get(word, 0) + 1
    # Find the maximum query term frequency
    maxqf = 0
    for word, f in qf.items():
        if maxqf < f:
            maxqf = f
    # For documents containing at least one query word, accumulate the CosSim
    # weight between the document and the query
    cosSimWeight = dict()
    cosSim = dict()
    for index, word in querywords:
        docs = dict()
        if word in inverted:
            docs = inverted[word]
        df = len(docs)
        for docid, tf in docs.items():
            docweight = (tf / maxtf[docid]) * log2(len(documents) / df)
            queryweight = (qf[word] / maxqf) * log2(len(documents) / df)
            totalweight = docweight * queryweight
            cosSimWeight[docid] = cosSimWeight.get(docid, 0) + totalweight
    # Normalise: divide each accumulated weight by the document length
    for docid, weight in cosSimWeight.items():
        cosSim[docid] = cosSimWeight[docid] / doclength[docid]
    # Map the query to its top-ranked documents (the slice keeps the top 20)
    for docid in sorted(cosSim, key=cosSim.get, reverse=True)[:20]:
        top50[query].append(docid)
    return top50
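# A small, self-contained illustration (not from the original) of the weighting
# scheme used above: each shared term contributes (tf / maxtf) * log2(N / df) on the
# document side and (qf / maxqf) * log2(N / df) on the query side, and the product
# of the two accumulates into cosSimWeight. The sample numbers are made up.
from math import log2

def term_weight(tf, max_tf, n_docs, df):
    return (tf / max_tf) * log2(n_docs / df)

doc_side = term_weight(tf=3, max_tf=5, n_docs=1000, df=10)    # 0.6 * log2(100)
query_side = term_weight(tf=1, max_tf=1, n_docs=1000, df=10)  # 1.0 * log2(100)
print(doc_side * query_side)  # contribution of one term to cosSimWeight[docid]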
def score_calculator(invertedIndex: InvertedIndex, docs, query, k):
    query_doc = Document.Document(query, 1)
    query_score = query_score_calculator(query_doc, docs)
    query_words = query_doc.words
    query_words = sorted(query_words)
    index_res = []
    for i in range(len(query_words)):
        loc = invertedIndex.find_word(query_words[i])
        if loc == -1:
            print("word " + query_words[i] + " can't be found in List!")
        else:
            index_res.append(invertedIndex.index_array[loc])
    docs_score = docs_score_calculator(index_res, docs)
    max_heap = CosinusCalculator.cosinus_max_heap_creator(query_score, docs_score)
    # Pop up to k of the best documents from the max-heap
    res = []
    if len(max_heap.heap) > 1 and len(max_heap.heap) > k:
        for i in range(k):
            res.append(max_heap.pop())
    elif 1 < len(max_heap.heap) < k + 1:
        for i in range(len(max_heap.heap)):
            res.append(max_heap.pop())
    return res
myLastElements = LastEleCol["LastElements"]  # Get the last value added to the database
GetLastElements = myLastElements.find({})
keywordsValue = GetLastElements[0]["keywords"]
postingValue = GetLastElements[0]["posting"]
# Reset the last value of the keywords
myquery = {"keywords": keywordsValue, "posting": postingValue}
newvalues = {"$set": {"keywords": 1, "posting": 1}}
myLastElements.update_one(myquery, newvalues)

def remove_dataBases():
    myclient.drop_database('Links')
    myclient.drop_database('DataA')

start = time.time()
remove_dataBases()
reset_last()
visited_unvisited()
ii.InvertedIndex()
end = time.time()
print(end - start)
import FetchDocument
import InvertedIndex
import ChampionList
import ScoreCalculator

if __name__ == '__main__':
    docs = FetchDocument.getAllDocuments2()
    inverted_index = InvertedIndex.InvertedIndex()
    for i in range(len(docs)):
        for j in range(len(docs[i].words)):
            inverted_index.add_id(docs[i].words[j], docs[i].doc_id)
    # inverted_index.print_all()
    # k = input("Please Enter K Value:\n")
    k = 10
    # r = input("Please Enter R Value:\n")
    r = 20
    # query = input("Please Enter your Query:\n")
    query = 'تراکتور'
    champion_list = ChampionList.champion_list_creator(inverted_index, docs, r)
    # champion_list.print_all()
    results = ScoreCalculator.score_calculator(champion_list, docs, query, k)
    print(results)
# We are processing in alphabetical document-category order
docs = list()
docs.append(FetchDocument.getHealthDocuments())
docs.append(FetchDocument.getHistoryDocuments())
docs.append(FetchDocument.getMathDocuments())
docs.append(FetchDocument.getPhysicsDocuments())
docs.append(FetchDocument.getTechDocuments())
print("docs appended")
inverted_indexes = list()
# full_inverted_index = InvertedIndex.InvertedIndex()
full_docs = list()
for i in range(5):
    inverted_indexes.append(InvertedIndex.InvertedIndex())
print("Inverted Indexes Created")
for i in range(len(docs)):
    for j in range(len(docs[i])):
        for k in range(len(docs[i][j].words)):
            inverted_indexes[i].add_id(docs[i][j].words[k], docs[i][j].doc_id)
# inverted_indexes[0].print_all()
# input("***")
print("Full Inverted Index for All Categories")
# for i in range(len(inverted_indexes)):
#     full_inverted_index.merge(inverted_indexes[i].index_array)
tweetTokens = tknzr.tokenize(tweet)  # tokenize tweets
tweetTokens = nltk.word_tokenize(tweet)
tweetTokensCopy = []
for word in tweetTokens:
    # word = re.sub("http(.*)", "a", word)  # remove links
    # word = re.sub("[0-9]*", "a", word)    # remove numbers
    # word = re.sub("\W+", "a", word)       # remove non-alphabet characters
    if word not in stopWordsList.values:
        # only add non-stopwords to the output
        tweetTokensCopy.append(word)
tokenArray.append(tweetTokensCopy)  # add tweet tokens to output

# add all tweetIDs and tweets to the inverted index
print("adding to inverted index")
corpusInvertedIndex = InvertedIndex.InvertedIndex()
for i in range(len(tweetID)):
    corpusInvertedIndex.insertTokenList(tokenArray[i], tweetID[i])
print("vocabulary of Inverted Index is " + str(corpusInvertedIndex.vocabSize()))
print("Here is a sample size of words in the Inverted Index")
corpusInvertedIndex.tokenSample(100)
print("\n")
print("Testing queries")

##########
# STEP 4 #
##########
# write the top 1000 results
print('Crawling web pages')
print('..............................................................................')
Crawler.crawler(3000, 'https://www.cs.uic.edu')
print('..............................................................................')
print('Crawling web pages completed successfully')
print('..............................................................................')

# Get document details from the collection
documents = InvertedIndex.documentsList(htmlpagesdirectory, crawledlistdirectory)

# Build the inverted index for the documents
if (input_option == '1' or input_option == '2'):
    print('Calculating inverted index')
    print('..............................................................................')
    inverted = InvertedIndex.getInvertedIndex(documents)
    inverted_index_path = pathlib.Path.cwd().joinpath('Inverted', 'inverted.csv')
    dir = pathlib.Path.cwd().joinpath("Inverted").is_dir()
    if dir != True:
        os.mkdir(str(pathlib.Path.cwd().joinpath("Inverted")))
    else:
import InvertedIndex
import InvertedIndexQuery

i = InvertedIndex.Index()
filename = '/home/mimi/Desktop/RI tp/D1.txt'
file_to_index = open(filename).read()
document_key = filename
# index the document, using document_key as the document's id
i.index(file_to_index, document_key)
'''
filename = 'document2.txt'
file_to_index = open(filename).read()
document_key = filename
i.index(file_to_index, document_key)
search_results = InvertedIndexQuery.query('Python and spam', i)
search_results.sort()
cnt = 0
for document in search_results:
    cnt = cnt + 1
    print '%d) %s' % (cnt, document[1])
'''
import InvertedIndex
from cosineSim import cosineSim

fulldoc = path('C:/Users/admin/Documents/575/parser/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/S08/data/set1/a1.txt.clean').bytes()
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']
print(len(docArray))
# print(docArray[4].split(' '))
print(stemText(docArray[4]))
tester = InvertedIndex.InvertedIndex()
for step in docArray:
    stemmed = stemText(step)
    tester.indexDocument(stemmed)
docTermMatrix = tester.createTermDocMatrix()
pd.set_option('display.max_columns', 150)
# print(termDocMatrix.head())
print('running query')
queryObj = Query(tester)
queryText = "james cook"
        print >>sys.stderr, "for help use --help"
        return 2

    # Index construction
    index = ii.InvertedIndex()
    indexer = ri.ReuterIndexer(folder=docfolder, blockSize=blocksize)
    tokeniser = tk.Tokeniser(stopList=stopwords,
                             useNumberFilter=usenumberfilter,
                             useCaseFolding=usecasefolding,
                             useStemming=usestemming)
    if reconstruct:
        print "Building inverted index..."
        indexer.spimi(index, tokeniser)
        ri.save("index/doclength.csv", indexer.docL)
    else:
        print "Loading inverted index..."
        ii.load("index/fullindex.csv", index)
        ri.load("index/doclength.csv", indexer.docL)
    if not query:
        return 0
    bm25 = ok.OkapiRanking(index, indexer)
    rsvK = 1.2
    rsvB = 0.75
    topN = 10
    # Start of query loop
    helpString = """
    Enter a search query below:
    <digit> : Display document with given ID
from Dijkstra import Dijkstra
from Graph import Graph
from JasonReader import *
from InvertedIndex import *

# Read the JSON file
json = loadDataFromFile('data/ini.json')
# Initialise the graph
graph = Graph()
graph.iniGraph(json)
# Initialise Dijkstra
dijk = Dijkstra()
# Test the shortest-distance query: returns the shortest distance and its path
traversal_path, distince = dijk.minPath(graph, "2", "5")
# print(traversal_path, distince)
print("the shortest path is : %s distince = %s \n" % (','.join(traversal_path), str(distince)))
# Initialise the inverted index
invert = InvertedIndex()
# Query by conditions separated by ',', returning the set of vertices that match
print("sql res : ")
print(invert.excSQL("age=27,_type=person", graph))