# NOTE(review): the statements down to ``update_one`` read like the tail of a
# function whose ``def`` line is outside this view (the script below calls
# ``reset_last()``) -- confirm their indentation against the full file.
myLastElements = LastEleCol["LastElements"]

# Get the last value added to the database.
# The cursor is indexed only once: with PyMongo each ``cursor[0]`` runs its
# own query, so both fields are read from a single fetched document.
# An empty collection still raises IndexError here, as before.
GetLastElements = myLastElements.find({})
lastElement = GetLastElements[0]
keywordsValue = lastElement["keywords"]
postingValue = lastElement["posting"]

# Reset the last value of the keywords (and of the posting) to 1 on the
# document that currently holds the values read above.
myquery = {"keywords": keywordsValue, "posting": postingValue}
newvalues = {"$set": {"keywords": 1, "posting": 1}}
myLastElements.update_one(myquery, newvalues)


def remove_dataBases():
    """Drop the 'Links' and 'DataA' databases through ``myclient``."""
    myclient.drop_database('Links')
    myclient.drop_database('DataA')


# Run the drop / reset / build steps in order and print the elapsed seconds.
start = time.time()
remove_dataBases()
reset_last()
visited_unvisited()
ii.InvertedIndex()
end = time.time()
print(end - start)
import FetchDocument
import InvertedIndex
import ChampionList
import ScoreCalculator

if __name__ == '__main__':
    # Script entry point: index all documents, build the champion list and
    # print the scored results for one fixed query.
    docs = FetchDocument.getAllDocuments2()

    # Register every word of every document under that document's id.
    inverted_index = InvertedIndex.InvertedIndex()
    for doc in docs:
        for word in doc.words:
            inverted_index.add_id(word, doc.doc_id)
    # inverted_index.print_all()

    # k, r and the query are fixed below; the commented prompts are the
    # interactive alternatives.
    # k = input("Please Enter K Value:\n")
    k = 10
    # r = input("Please Enter R Value:\n")
    r = 20
    # query = input("Please Enter your Query:\n")
    query = 'تراکتور'

    champion_list = ChampionList.champion_list_creator(inverted_index, docs, r)
    # champion_list.print_all()
    results = ScoreCalculator.score_calculator(champion_list, docs, query, k)
    print(results)
# We are processing in alphabetical document-category order.
docs = list()
docs.append(FetchDocument.getHealthDocuments())
docs.append(FetchDocument.getHistoryDocuments())
docs.append(FetchDocument.getMathDocuments())
docs.append(FetchDocument.getPhysicsDocuments())
docs.append(FetchDocument.getTechDocuments())
print("docs appended")

# One inverted index per category. The count is taken from ``docs`` instead
# of a hard-coded 5, so adding or removing a category above cannot leave the
# two lists out of step.
inverted_indexes = [InvertedIndex.InvertedIndex() for _ in docs]
# full_inverted_index = InvertedIndex.InvertedIndex()
full_docs = list()
print("Inverted Indexes Created")

# Fill each category's index with the words of that category's documents.
for category_index, category_docs in zip(inverted_indexes, docs):
    for doc in category_docs:
        for word in doc.words:
            category_index.add_id(word, doc.doc_id)
# inverted_indexes[0].print_all()
# input("***")

print("Full Inverted Index for All Categories")
# for i in range(len(inverted_indexes)):
#     full_inverted_index.merge(inverted_indexes[i].index_array)
import InvertedIndex
from cosineSim import cosineSim

# Raw string literals: in a plain literal ``\U`` starts an 8-hex-digit unicode
# escape, which Python 3 rejects at compile time. The raw form yields exactly
# the same characters (backslash included) on both Python 2 and Python 3.
fulldoc = path(
    r'C:\Users/admin/Documents/575/parser/'
    r'Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/'
    r'S08/data/set1/a1.txt.clean'
).bytes()

# Keep one entry per non-blank line of the file.
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']
print(len(docArray))
#print(docArray[4].split(' '))
print(stemText(docArray[4]))

# Index the stemmed form of every retained line, then build the matrix.
tester = InvertedIndex.InvertedIndex()
for step in docArray:
    stemmed = stemText(step)
    tester.indexDocument(stemmed)
docTermMatrix = tester.createTermDocMatrix()
pd.set_option('display.max_columns', 150)
#print(termDocMatrix.head())

print('running query')
queryObj = Query(tester)
queryText = "james cook"
# NOTE(review): the statements down to ``tokenArray.append`` use ``tweet`` and
# look like the body of a per-tweet loop whose header is outside this view --
# confirm their indentation against the full file.
tweetTokens = tknzr.tokenize(tweet) # tokenize tweets
# NOTE(review): the result above is replaced straight away by the NLTK
# tokenisation below; confirm which tokenizer is intended and drop the other.
tweetTokens = nltk.word_tokenize(tweet)

# Disabled per-word normalisation steps, kept for reference:
# word = re.sub("http(.*)","a",word) # remove links
# word = re.sub("[0-9]*","a",word) #remove numbers
# word = re.sub("\W+","a",word) #remove non-alphabet characters

# Only add non-stopwords to the output.
tweetTokensCopy = [
    word for word in tweetTokens if word not in stopWordsList.values
]
tokenArray.append(tweetTokensCopy) #add tweet tokens to output

#add all tweetID and tweets to the Inverted Index
print("adding to inverted index")
corpusInvertedIndex = InvertedIndex.InvertedIndex()
# Index-based on purpose: ``tweetID``'s type is not visible here, and for a
# label-indexed container ``tweetID[i]`` and plain iteration could differ --
# TODO confirm before switching to zip().
for i in range(len(tweetID)):
    corpusInvertedIndex.insertTokenList(tokenArray[i], tweetID[i])

print("vocabulary of Inverted Index is " + str(corpusInvertedIndex.vocabSize()))
print("Here is a sample size of words in the Inverted Index")
corpusInvertedIndex.tokenSample(100)
print("\n")
print("Testing queries")

##########
# STEP 4 #
##########
#write the top 1000 results
from Dijkstra import Dijkstra
from Graph import Graph
from JasonReader import *
from InvertedIndex import *

# Read the JSON file.
json = loadDataFromFile('data/ini.json')

# Initialise the graph from the loaded data.
graph = Graph()
graph.iniGraph(json)

# Initialise Dijkstra.
dijk = Dijkstra()

# Test the shortest-distance query; it returns the path and its distance.
traversal_path, distince = dijk.minPath(graph, "2", "5")
#print(traversal_path,distince)
print(
    "the shortest path is : %s distince = %s \n"
    % (','.join(traversal_path), distince)
)

# Initialise the inverted index.
invert = InvertedIndex()

# Search by conditions (separated by ','); prints the set of vertices that
# satisfy them.
print("sql res : ")
print(invert.excSQL("age=27,_type=person", graph))