def remove_common_words_with_known_words(index_array: InvertedIndex):
    known_words = [
        "از", "با", "به", "برای", "تا", "در", "که", "ازای", "یا", "پس", "اگر",
        "اما", "زیرا", "لکن", "لیکن", "را", "نیز", "ولی", "هم", "بی", "بدون"
    ]
    for x in known_words:
        index_array.remove_word(x)
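
For orientation, the list above is a set of Persian stop words that get stripped from the index. The InvertedIndex class and its remove_word method belong to the surrounding project and are not shown here, so the sketch below uses a hypothetical dict-based index of its own just to illustrate what such a stop-word pass amounts to.

# Minimal sketch (hypothetical SimpleInvertedIndex, not the project's class):
# an index mapping each term to the set of documents containing it, where
# remove_word simply drops a term's postings -- which is all the pass above needs.
class SimpleInvertedIndex:
    def __init__(self):
        self.postings = {}                      # term -> set of document ids

    def add_id(self, word, doc_id):
        self.postings.setdefault(word, set()).add(doc_id)

    def remove_word(self, word):
        self.postings.pop(word, None)           # ignore words never indexed


idx = SimpleInvertedIndex()
idx.add_id("از", 1)
idx.add_id("کتاب", 1)
for stop_word in ["از", "با", "به"]:
    idx.remove_word(stop_word)
print(sorted(idx.postings))                     # ['کتاب'] -- stop words removed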
Example #2
 def spimi(self, index, tokeniser=None):
     """ Implements SPIMI index construction algorithm """
     if tokeniser is None:
         tokeniser = tk.Tokeniser()
     numberofblocks = ( len(self.fileList) + self.block - 1 ) // self.block
     if numberofblocks < 1:
         numberofblocks = 1
     for n in range( numberofblocks ):
         index.clear()
         for doc in self.fileList[n*self.block: (n * self.block) + self.block]:
             self.parse(doc, index, tokeniser)
         ii.save("index/index"+str(n)+".csv", index)
     ii.mergeFile( "index/fullindex.csv", [ "index/index"+str(n)+".csv" for n in range(numberofblocks) ] )
     self.save()
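
The spimi method above splits self.fileList into fixed-size blocks, builds an in-memory index per block, saves each block as index/index<n>.csv, and then has ii.mergeFile combine the block files into index/fullindex.csv. The exact CSV layout used by ii.save and ii.mergeFile is not shown, so the following is only a sketch of that merge step, under the assumption that each row holds a term followed by its space-separated document ids.

import csv
from collections import defaultdict

def merge_block_indexes(out_path, block_paths):
    # Sketch of an SPIMI-style merge (assumed row layout: "term,doc1 doc2 ...");
    # the real ii.mergeFile may stream sorted blocks instead of loading them all.
    merged = defaultdict(list)
    for block_path in block_paths:
        with open(block_path, newline='') as f:
            for term, postings in csv.reader(f):
                merged[term].extend(postings.split())
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for term in sorted(merged):
            writer.writerow([term, ' '.join(sorted(set(merged[term])))])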
Example #3
def loadFiles(fileName):
    invertedIndex = InvertedIndex.InvertedIndex()

    currentDir = os.getcwd()

    workingDir = os.getcwd()

    questionsDir = workingDir + "/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/"

    os.chdir(questionsDir)
    for sDir in glob.glob("S*"):
        dataDir = questionsDir+sDir+"/data/"
        os.chdir(dataDir)
        print(sDir)
        for set in glob.glob("set*"):
            os.chdir(dataDir+set)
            print(set)
            for file in glob.glob("*.clean"):
                fullFileName = dataDir+set+"/"+file
                print(fullFileName)
                stemmedFile = stemText(path(file).text(encoding="utf8"))
                print("File stemmed")
                invertedIndex.indexDocument(stemmedFile, fullFileName)
                print("File added to index")

    os.chdir(currentDir)

    invertedIndex.save(fileName)
def DocIndex():
    '''Imports the inverted index file and finds the number of documents each word occurs in'''

    global TotalFiles
    global FeatureDict
    TempDict = {}

    try:
        # write the feature words out to FeatureWords.txt
        FeatureWordPath = os.getcwd()
        FeatureWordPath += '\\FeatureWords.txt'
        featurefile = open(FeatureWordPath, 'w')

        for wrd in FeatureDict:
            wrd = wrd.strip()
            featurefile.write(wrd + '\n')
        featurefile.close()

        # build the inverted index for the collection given on the command line
        try:
            InvertedIndex.main(sys.argv[1])
        except:
            print('Error in the Inverted Index File')
            raise

        # read back the per-word document frequencies
        path = os.getcwd()
        path += '\\IndexDocument.txt'
        files = open(path, 'r').readlines()

        for Line in files:
            temp = re.split(' ', Line)
            Word = str(temp[0])
            Freq = float(temp[1].strip())
            TempDict[Word] = Freq

        # idf = log(TotalFiles / document frequency) for every feature word
        for word in TempDict:
            if word in FeatureDict:
                try:
                    idf = float(
                        math.log(float(TotalFiles) / float(TempDict[word])))
                    FeatureDict[word][19] = str(idf)
                except:
                    print(word)
    except:
        print('File Not Found')
        raise
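
The last loop in DocIndex stores each feature word's inverse document frequency, idf = log(TotalFiles / df), where df is the number of documents containing the word, in position 19 of the word's feature vector. A small worked example of that line with made-up numbers:

import math

TotalFiles = 1000      # made-up collection size
df = 50                # made-up document frequency for one word
idf = math.log(float(TotalFiles) / float(df))   # natural log, as in the code above
print(round(idf, 4))   # 2.9957 -- ln(1000 / 50) = ln(20)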
Example #6
def champion_list_creator(invertedIndex: InvertedIndex, docs, r):
    result = InvertedIndex.InvertedIndex()
    for i in range(len(invertedIndex.index_array)):
        word = invertedIndex.index_array[i].word
        arr = invertedIndex.index_array[i].doc_ids
        temp_res = []
        for j in range(len(arr)):
            id_location = Document.documents_binary_search(
                docs, 0,
                len(docs) - 1, arr[j])
            weight = WeightCalculator.weight_calculator_doc(
                word, docs, docs[id_location])
            temp_res.append([arr[j], weight])
        res = sorter(temp_res, r)
        index = InvertedIndex.Index(word)
        index.set_docs_id(res)
        result.index_array.append(index)
    return result
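
champion_list_creator keeps, for every term in the index, only the r highest-weighted documents (sorter(temp_res, r) does the truncation), so query-time scoring only ever visits those postings. Document, WeightCalculator and sorter are the project's own helpers; the sketch below shows the same idea over a plain dict, with weights assumed to be precomputed.

def build_champion_lists(postings_weights, r):
    # postings_weights: term -> list of (doc_id, weight) pairs, weights assumed
    # precomputed (e.g. tf-idf).  Keep only the r heaviest documents per term.
    champions = {}
    for term, weighted_docs in postings_weights.items():
        top_r = sorted(weighted_docs, key=lambda dw: dw[1], reverse=True)[:r]
        champions[term] = [doc_id for doc_id, _ in top_r]
    return champions


example = {"تراکتور": [(1, 0.9), (4, 0.2), (7, 0.6), (2, 0.1)]}
print(build_champion_lists(example, r=2))       # {'تراکتور': [1, 7]}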
Example #7
 def __init__(self):
     self.tmp_ii = InvertedIndex.tmp_ii()
     self.titleout = open('title.txt', 'w')
     self.textout = open('text.txt', 'w')
     self.in_page = False
     self.in_title = False
     self.in_revision = False
     self.in_text = False
     self.title = None
     self.text = None
Example #8
    def showDocumentText(self, documentsByTermProximity, distanceFromTerm):
        """Show text that may answer the query from the top 3 most relevant documents determined with cosine similarity to query"""

        #for termPairProximity in documentsByTermProximity:
        totalSections = 0

        print("Documents Retrived: " + str(len(documentsByTermProximity)))

        blurbInvertedIndex = InvertedIndex.InvertedIndex()
        blurbList = []

        for doc in documentsByTermProximity:  #termPairProximity:

            docId = doc[0]

            sectionsFound = (len(doc[1]) / 2)

            documentFile = self.invertedIndex.listOfFiles[docId - 1]
            rawDocumentText = Path(documentFile).text(encoding='utf8')
            documentText = [
                word.lower().replace('\n', '')
                for word in rawDocumentText.split(' ') if word.strip() != ''
            ]

            i = 0

            while i < len(doc[1]) - 1:
                positionA = doc[1][i]
                positionB = doc[1][i + 1]

                if positionA - distanceFromTerm > 0:
                    positionA = positionA - distanceFromTerm
                else:
                    positionA = 0

                termsInDoc = ' '.join(
                    documentText[positionA:(positionB + distanceFromTerm)])

                blurbList.append(termsInDoc)

                blurbInvertedIndex.indexDocument(stemText(termsInDoc), '')

                i += 2

            totalSections += sectionsFound

        print("Sections Retrieved: " + str(totalSections))

        blurbMatrix = blurbInvertedIndex.createTermDocMatrix()
        bestBlurbs = self.getKNearestDocs(self.query, blurbMatrix, 3)

        print("Showing Best 3 Results:")
        for blurbNum in bestBlurbs:
            print(blurbNum)
            print(blurbList[blurbNum - 1])
Example #9
def token_to_postings_list(document_id, token, position, ii_buffer):
    # Get the token ID from the DB
    (token_id, docs_count) = wiserdb.db_get_token_id(token)

    # An existing temporary inverted-index buffer is available
    if ii_buffer is not None:
        ii_entry = find_token_from_index(token_id, ii_buffer)
    else:
        ii_entry = None

    if ii_entry is not None:
        # There is an existing postings list tied to this token
        pl = ii_entry.postings_list
        exist = False
        for pl_entry in pl:
            # This document already contains the token
            if pl_entry.document_id == document_id:
                # copied = copy.copy(pl_entry)
                pl_entry.positions_count += 1
                pl_entry.positions.append(position)
                exist = True
                break
        # The token appears in this document for the first time
        if not exist:
            pl_entry = InvertedIndex.PostingsListEntry(document_id, 1,
                                                       position)
            pl.append(pl_entry)
            ii_entry.docs_count += 1
        ii_entry.positions_count += 1
        ii_entry.postings_list = pl
    else:
        # No existing postings list is tied to this token
        pl_entry = InvertedIndex.PostingsListEntry(document_id, 1)
        pl_entry.positions.append(position)
        ii_entry = InvertedIndex.InvertedIndexEntry(token_id, docs_count, 1)
        ii_entry.postings_list.append(pl_entry)
        ii_buffer.append(ii_entry)

    return ii_buffer
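
token_to_postings_list maintains, per token, an index entry whose postings list records each document id together with the token's positions inside that document; PostingsListEntry and InvertedIndexEntry are that project's classes. To make the data shape explicit, here is a minimal sketch with plain dataclasses (names hypothetical):

from dataclasses import dataclass, field

@dataclass
class PostingsEntry:                    # one document in a token's postings list
    document_id: int
    positions: list = field(default_factory=list)

@dataclass
class IndexEntry:                       # one token in the inverted-index buffer
    token_id: int
    postings_list: list = field(default_factory=list)

def add_occurrence(entry, document_id, position):
    # Same flow as above: extend the posting for this document if one exists,
    # otherwise start a new posting for it.
    for posting in entry.postings_list:
        if posting.document_id == document_id:
            posting.positions.append(position)
            return
    entry.postings_list.append(PostingsEntry(document_id, [position]))

entry = IndexEntry(token_id=42)
add_occurrence(entry, document_id=7, position=3)
add_occurrence(entry, document_id=7, position=11)
print(entry)                            # document 7 with positions [3, 11]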
def driverInvertedobject():

    #load stopwords
    stopwords = StopWordManger("stopwords")

    #load data
    Invertobj = InvertedIndex("data/ap89_collection.html",
                              stopwords.list_stopwords)

    # load the query list
    Qlist = QueryList("querys/query_list.txt", stopwords.list_stopwords,
                      'results_file')

    # Run the queries
    Qlist.runQuerylist(Invertobj.dict_terms, Invertobj.dict_docs)
Example #11
    def __init__(self):
        # configure logging, databases (alienvault + osvdb)
        self.logger = Logger.logger
        _CONF = OssimConf()
        _OSVDB = "osvdb"
        self._DB = OssimDB(_CONF[VAR_DB_HOST], _CONF[VAR_DB_SCHEMA],
                           _CONF[VAR_DB_USER], _CONF[VAR_DB_PASSWORD])

        self._osvdb = OssimDB(_CONF[VAR_DB_HOST], _OSVDB, _CONF[VAR_DB_USER],
                              _CONF[VAR_DB_PASSWORD])
        # connect databases
        avbool = self._DB.connect()
        osvbool = self._osvdb.connect()
        if not avbool:
            self.logger.error(
                "[vcad][x] error connecting to database alienvault")
        if not osvbool:
            self.logger.error("[vcad][x] error connecting to database osvdb")
        self.invertedIndex = InvertedIndex.InvertedIndex()
        self.logger.info("[vcad][+] init complete.")
        return
Example #12
def calculateCosSim(documents, query, inverted, maxtf, doclength):
    top50 = defaultdict(list)

    querywords = InvertedIndex.processText(query)
    qf = dict()
    # get query frequencies
    for index, word in querywords:
        qf[word] = qf.get(word, 0) + 1

    # find the maximum query frequency for each query
    maxqf = 0
    for word, f in qf.items():
        if maxqf < f:
            maxqf = f

    # get documents that contain at least one query word and compute the CosSim weight between each document and the query
    cosSimWeight = dict()
    cosSim = dict()
    for index, word in querywords:
        docs = dict()
        if word in inverted:
            docs = inverted[word]
            df = len(docs)
            for docid, tf in docs.items():
                docweight = (tf / maxtf[docid]) * log2(len(documents) / df)
                queryweight = (qf[word] / maxqf) * log2(len(documents) / df)
                totalweight = docweight * queryweight
                cosSimWeight[docid] = cosSimWeight.get(docid, 0) + totalweight

    # calculate CosSim by dividing CosSim weight by doclength
    for docid, weight in cosSimWeight.items():
        cosSim[docid] = cosSimWeight[docid] / doclength[docid]

    # map the query to its retrieved documents, ranked by descending CosSim (top 20 kept)
    for docid in sorted(cosSim, key=cosSim.get, reverse=True)[:20]:
        top50[query].append(docid)

    return top50
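
For each matching term, calculateCosSim weights the document side as (tf / maxtf) * log2(N / df) and the query side as (qf / maxqf) * log2(N / df), multiplies the two, sums the products over terms, and finally divides by the document length. A one-term worked example with made-up numbers:

from math import log2

N, df = 8, 2          # made-up collection size and document frequency
tf, maxtf = 3, 6      # term frequency / max term frequency in the document
qf, maxqf = 1, 1      # term frequency / max term frequency in the query

docweight = (tf / maxtf) * log2(N / df)      # 0.5 * 2 = 1.0
queryweight = (qf / maxqf) * log2(N / df)    # 1.0 * 2 = 2.0
print(docweight * queryweight)               # 2.0 added to this doc's CosSim weight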
def score_calculator(invertedIndex: InvertedIndex, docs, query, k):
    query_doc = Document.Document(query, 1)
    query_score = query_score_calculator(query_doc, docs)
    query_words = query_doc.words
    query_words = sorted(query_words)
    index_res = []
    for i in range(len(query_words)):
        loc = invertedIndex.find_word(query_words[i])
        if loc == -1:
            print("word " + query_words[i] + " can't be found in List!")
        else:
            index_res.append(invertedIndex.index_array[loc])
    docs_score = docs_score_calculator(index_res, docs)
    max_heap = CosinusCalculator.cosinus_max_heap_creator(
        query_score, docs_score)
    res = []
    if len(max_heap.heap) > 1 and len(max_heap.heap) > k:
        for i in range(k):
            res.append(max_heap.pop())
    elif 1 < len(max_heap.heap) < k + 1:
        for i in range(len(max_heap.heap)):
            res.append(max_heap.pop())
    return res
Example #14
        myLastElements = LastEleCol["LastElements"]

        #Get the last value added to the database
        GetLastElements = myLastElements.find({})
        keywordsValue = GetLastElements[0]["keywords"]
        postingValue = GetLastElements[0]["posting"]

        #Reset the last Value of the keywords
        myquery = {"keywords": keywordsValue, "posting": postingValue}
        newvalues = {"$set": {"keywords": 1, "posting": 1}}
        myLastElements.update_one(myquery, newvalues)


def remove_dataBases():
    myclient.drop_database('Links')
    myclient.drop_database('DataA')


start = time.time()

remove_dataBases()

reset_last()

visited_unvisited()

ii.InvertedIndex()

end = time.time()
print(end - start)
Example #15
import FetchDocument
import InvertedIndex
import ChampionList
import ScoreCalculator

if __name__ == '__main__':
    docs = FetchDocument.getAllDocuments2()
    inverted_index = InvertedIndex.InvertedIndex()
    for i in range(len(docs)):
        for j in range(len(docs[i].words)):
            inverted_index.add_id(docs[i].words[j], docs[i].doc_id)
#    inverted_index.print_all()
#    k = input("Please Enter K Value:\n")
    k = 10
#    r = input("Please Enter R Value:\n")
    r = 20
#    query = input("Please Enter your Query:\n")
    query = 'تراکتور'
    champion_list = ChampionList.champion_list_creator(inverted_index, docs, r)
#    champion_list.print_all()
    results = ScoreCalculator.score_calculator(champion_list, docs, query, k)
    print(results)
Example #16
    # We are Processing in Alphabet Doc Category Order
    docs = list()
    docs.append(FetchDocument.getHealthDocuments())
    docs.append(FetchDocument.getHistoryDocuments())
    docs.append(FetchDocument.getMathDocuments())
    docs.append(FetchDocument.getPhysicsDocuments())
    docs.append(FetchDocument.getTechDocuments())

    print("docs appended")

    inverted_indexes = list()
    # full_inverted_index = InvertedIndex.InvertedIndex()
    full_docs = list()

    for i in range(5):
        inverted_indexes.append(InvertedIndex.InvertedIndex())

    print("Inverted Indexes Created")

    for i in range(len(docs)):
        for j in range(len(docs[i])):
            for k in range(len(docs[i][j].words)):
                inverted_indexes[i].add_id(docs[i][j].words[k],
                                           docs[i][j].doc_id)

    # inverted_indexes[0].print_all()
    # input("***")
    print("Full Inverted Index for All Categories")

    # for i in range(len(inverted_indexes)):
    #     full_inverted_index.merge(inverted_indexes[i].index_array)
    tweetTokens = tknzr.tokenize(tweet)  # tokenize tweets
    tweetTokens = nltk.word_tokenize(tweet)

    tweetTokensCopy = []
    for word in tweetTokens:
        # word = re.sub("http(.*)","a",word) # remove links
        # word = re.sub("[0-9]*","a",word) #remove numbers
        # word = re.sub("\W+","a",word) #remove non-alphabet characters

        if word not in stopWordsList.values:  # only add to output non-stopwords
            tweetTokensCopy.append(word)
    tokenArray.append(tweetTokensCopy)  #add tweet tokens to output

#add all tweetID and tweets to the Inverted Index
print("adding to inverted index")
corpusInvertedIndex = InvertedIndex.InvertedIndex()
for i in range(len(tweetID)):
    corpusInvertedIndex.insertTokenList(tokenArray[i], tweetID[i])
print("vocabulary of Inverted Index is " +
      str(corpusInvertedIndex.vocabSize()))
print("Here is a sample size of words in the Inverted Index")
corpusInvertedIndex.tokenSample(100)
print("\n")

print("Testing queries")
##########
# STEP 4 #
##########
#write the top 1000 results

Example #18
        print('Crawling web pages')
        print(
            '..............................................................................'
        )
        Crawler.crawler(3000, 'https://www.cs.uic.edu')
        print(
            '..............................................................................'
        )
        print('Crawling web pages completed successfully')
        print(
            '..............................................................................'
        )

    # get document details from collection
    documents = InvertedIndex.documentsList(htmlpagesdirectory,
                                            crawledlistdirectory)

    # Build Inverted-Index for documents
    if (input_option == '1' or input_option == '2'):
        print('Calculating inverted index')
        print(
            '..............................................................................'
        )
        inverted = InvertedIndex.getInvertedIndex(documents)
        inverted_index_path = pathlib.Path.cwd().joinpath(
            'Inverted', 'inverted.csv')

        inverted_dir_exists = pathlib.Path.cwd().joinpath("Inverted").is_dir()
        if not inverted_dir_exists:
            os.mkdir(str(pathlib.Path.cwd().joinpath("Inverted")))
        else:
Example #19
import InvertedIndex
import InvertedIndexQuery

i = InvertedIndex.Index()

filename = '/home/mimi/Desktop/RI tp/D1.txt'
file_to_index = open(filename).read()
document_key = filename

# index the document, using document_key as the document's
# id.
i.index(file_to_index, document_key)
'''
    filename = 'document2.txt'
    file_to_index = open(filename).read()
    document_key = filename

    i.index(file_to_index, document_key)

    search_results = InvertedIndexQuery.query('Python and spam', i)
    search_results.sort()

    cnt = 0
    for document in search_results:
      cnt = cnt + 1
      print '%d) %s' % (cnt, document[1])
      '''
Example #20
import InvertedIndex
import pandas as pd
from cosineSim import cosineSim

fulldoc = path(r'C:\Users/admin/Documents/575/parser/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/S08/data/set1/a1.txt.clean').bytes()
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']


print(len(docArray))

#print(docArray[4].split(' '))

print(stemText(docArray[4]))


tester = InvertedIndex.InvertedIndex()

for step in docArray:

    stemmed = stemText(step)

    tester.indexDocument(stemmed)

docTermMatrix = tester.createTermDocMatrix()

pd.set_option('display.max_columns', 150)
#print(termDocMatrix.head())

print('running query')
queryObj = Query(tester)
queryText = "james cook"
Example #21
        print >>sys.stderr, "for help use --help"
        return 2

    # Index construction
    index       = ii.InvertedIndex()
    indexer     = ri.ReuterIndexer(folder=docfolder, blockSize=blocksize)
    tokeniser   = tk.Tokeniser(stopList=stopwords, useNumberFilter=usenumberfilter, 
                               useCaseFolding=usecasefolding, useStemming=usestemming)
    
    if reconstruct:
        print "Building inverted index..."
        indexer.spimi(index, tokeniser)
        ri.save("index/doclength.csv", indexer.docL)
    else:
        print "Loading inverted index..."
        ii.load("index/fullindex.csv", index)
        ri.load("index/doclength.csv", indexer.docL)

    if not query:
        return 0
    
    bm25        = ok.OkapiRanking(index, indexer)
    rsvK = 1.2
    rsvB = 0.75
    topN = 10

    # Start of query loop
    helpString = """
    Enter a search query below:
                    
    <digit> : Display document with given ID
Example #22
from Dijkstra import Dijkstra
from Graph import Graph
from JasonReader import *
from InvertedIndex import *

# read the JSON file
json = loadDataFromFile('data/ini.json')

# initialize the graph
graph = Graph()
graph.iniGraph(json)

# initialize Dijkstra
dijk = Dijkstra()
# test the shortest-path query; returns the shortest distance and its path
traversal_path, distance = dijk.minPath(graph, "2", "5")
#print(traversal_path, distance)
print("the shortest path is : %s distance = %s \n" %
      (','.join(traversal_path), str(distance)))

# initialize the inverted index
invert = InvertedIndex()
# query by conditions separated by ','; returns the set of vertices that satisfy them
print("sql res : ")
print(invert.excSQL("age=27,_type=person", graph))
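
invert.excSQL takes comma-separated attribute=value conditions and returns the vertices that satisfy all of them; InvertedIndex, Graph and excSQL are this project's own classes. The sketch below shows the underlying idea, intersecting per-condition postings in a plain dict keyed by 'attribute=value' strings:

def query_conditions(index, conditions):
    # index: 'attr=value' -> set of vertex ids; conditions: e.g. "age=27,_type=person".
    # Returns the vertices matching every condition (intersection of postings).
    result = None
    for cond in conditions.split(','):
        postings = index.get(cond.strip(), set())
        result = postings if result is None else result & postings
    return result or set()


example_index = {
    "age=27": {"1", "4", "5"},
    "_type=person": {"2", "4", "5", "9"},
}
print(sorted(query_conditions(example_index, "age=27,_type=person")))   # ['4', '5']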