예제 #1
0
        myLastElements = LastEleCol["LastElements"]

        #Get the last value added to the database
        GetLastElements = myLastElements.find({})
        keywordsValue = GetLastElements[0]["keywords"]
        postingValue = GetLastElements[0]["posting"]

        #Reset the last Value of the keywords
        myquery = {"keywords": keywordsValue, "posting": postingValue}
        newvalues = {"$set": {"keywords": 1, "posting": 1}}
        myLastElements.update_one(myquery, newvalues)


def remove_dataBases():
    """Drop the 'Links' and 'DataA' databases through the module-level client."""
    for database_name in ('Links', 'DataA'):
        myclient.drop_database(database_name)


# Script body: run the pipeline steps once, in order, and print how long
# the whole run took.
start = time.time()

# Drops the 'Links' and 'DataA' databases.
remove_dataBases()

# NOTE(review): reset_last() and visited_unvisited() are not fully visible
# in this excerpt; their exact effects should be confirmed at their
# definitions.
reset_last()

visited_unvisited()

# NOTE(review): the return value is discarded, so this call is made purely
# for its side effects (presumably building the inverted index) -- confirm.
ii.InvertedIndex()

end = time.time()
# Elapsed wall-clock time in seconds.
print(end - start)
예제 #2
0
import FetchDocument
import InvertedIndex
import ChampionList
import ScoreCalculator

if __name__ == '__main__':
    # Register every (word, doc_id) pair of every fetched document with the
    # inverted index.
    docs = FetchDocument.getAllDocuments2()
    inverted_index = InvertedIndex.InvertedIndex()
    for doc in docs:
        for word in doc.words:
            inverted_index.add_id(word, doc.doc_id)

    # K, R and the query were once read interactively with input(); they
    # are fixed constants here. r is handed to the champion-list builder
    # and k to the score calculator.
    k = 10
    r = 20
    query = 'تراکتور'

    champion_list = ChampionList.champion_list_creator(inverted_index, docs, r)
    results = ScoreCalculator.score_calculator(champion_list, docs, query, k)
    print(results)
예제 #3
0
    # We are Processing in Alphabet Doc Category Order
    # (health, history, math, physics, tech): docs[i] holds the documents
    # returned for the i-th category.
    docs = list()
    docs.append(FetchDocument.getHealthDocuments())
    docs.append(FetchDocument.getHistoryDocuments())
    docs.append(FetchDocument.getMathDocuments())
    docs.append(FetchDocument.getPhysicsDocuments())
    docs.append(FetchDocument.getTechDocuments())

    print("docs appended")

    # One inverted index per category; inverted_indexes[i] pairs with docs[i].
    inverted_indexes = list()
    # full_inverted_index = InvertedIndex.InvertedIndex()
    # NOTE(review): full_docs is not used anywhere in the visible lines.
    full_docs = list()

    # 5 matches the number of categories appended to docs above.
    for i in range(5):
        inverted_indexes.append(InvertedIndex.InvertedIndex())

    print("Inverted Indexes Created")

    # Register every word of every document with the index of its category.
    for i in range(len(docs)):
        for j in range(len(docs[i])):
            for k in range(len(docs[i][j].words)):
                inverted_indexes[i].add_id(docs[i][j].words[k],
                                           docs[i][j].doc_id)

    # inverted_indexes[0].print_all()
    # input("***")
    # NOTE(review): this message is printed even though the merge into a
    # single full index (below) is commented out.
    print("Full Inverted Index for All Categories")

    # for i in range(len(inverted_indexes)):
    #     full_inverted_index.merge(inverted_indexes[i].index_array)
예제 #4
0
import InvertedIndex
from cosineSim import cosineSim

# Load the source document and split it into non-blank paragraphs, one per
# line of the file.
# The path is a raw string on purpose: in an ordinary literal "\U" starts an
# eight-digit Unicode escape, which makes the file a SyntaxError on Python 3.
# The raw string keeps the backslash literally, so the path is unchanged.
# NOTE(review): .bytes() presumably returns the raw file contents; if that is
# a bytes object on Python 3, splitting on the str '\n' below will fail --
# confirm which interpreter / path library version this targets.
fulldoc = path(r'C:\Users/admin/Documents/575/parser/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/S08/data/set1/a1.txt.clean').bytes()
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']


# Number of non-blank paragraphs found.
print(len(docArray))

#print(docArray[4].split(' '))

# Sanity check: show the stemmed form of one paragraph (index 4).
print(stemText(docArray[4]))


tester = InvertedIndex.InvertedIndex()

# Stem each paragraph and index it; every paragraph is passed to
# indexDocument() separately.
for step in docArray:

    stemmed = stemText(step)

    tester.indexDocument(stemmed)

docTermMatrix = tester.createTermDocMatrix()

# Let pandas display up to 150 columns when the matrix is printed.
pd.set_option('display.max_columns', 150)
# NOTE(review): the disabled line below refers to termDocMatrix, but the
# variable defined above is docTermMatrix.
#print(termDocMatrix.head())

print('running query')
queryObj = Query(tester)
queryText = "james cook"
예제 #5
0
    # NOTE(review): the result of tknzr.tokenize() is overwritten by the very
    # next assignment, so only the nltk.word_tokenize() output is used --
    # confirm which tokenizer is intended and drop the other call.
    tweetTokens = tknzr.tokenize(tweet)  # tokenize tweets
    tweetTokens = nltk.word_tokenize(tweet)

    # Collect the tokens of this tweet that are not stop words.
    tweetTokensCopy = []
    for word in tweetTokens:
        # Disabled normalisation steps (links, numbers, non-word characters):
        # word = re.sub("http(.*)","a",word) # remove links
        # word = re.sub("[0-9]*","a",word) #remove numbers
        # word = re.sub("\W+","a",word) #remove non-alphabet characters

        if word not in stopWordsList.values:  # only add to output non-stopwords
            tweetTokensCopy.append(word)
    tokenArray.append(tweetTokensCopy)  #add tweet tokens to output

# Add all tweet IDs and their token lists to the inverted index.
# tokenArray[i] and tweetID[i] are paired through the same index i.
print("adding to inverted index")
corpusInvertedIndex = InvertedIndex.InvertedIndex()
for i in range(len(tweetID)):
    corpusInvertedIndex.insertTokenList(tokenArray[i], tweetID[i])
# Report the vocabulary size returned by the index.
print("vocabulary of Inverted Index is " +
      str(corpusInvertedIndex.vocabSize()))
print("Here is a sample size of words in the Inverted Index")
# NOTE(review): the return value is ignored, so tokenSample(100) presumably
# prints the sample itself -- confirm against its definition.
corpusInvertedIndex.tokenSample(100)
print("\n")

print("Testing queries")
##########
# STEP 4 #
##########
#write the top 1000 results

예제 #6
0
from Dijkstra import Dijkstra
from Graph import Graph
from JasonReader import *
from InvertedIndex import *

# Read the JSON data file.
json = loadDataFromFile('data/ini.json')

# Initialise the graph from the loaded data.
graph = Graph()
graph.iniGraph(json)

# Initialise Dijkstra and test a shortest-distance query between vertices
# "2" and "5"; minPath returns the path and the shortest distance.
dijk = Dijkstra()
traversal_path, distince = dijk.minPath(graph, "2", "5")
joined_path = ','.join(traversal_path)
distance_text = str(distince)
print("the shortest path is : %s distince = %s \n" % (joined_path, distance_text))

# Initialise the inverted index, then search by conditions. Conditions are
# separated by ','; the result is the set of vertices matching all of them.
invert = InvertedIndex()
print("sql res : ")
matching_vertices = invert.excSQL("age=27,_type=person", graph)
print(matching_vertices)