Example #1
# Transformation into Postings
# Input: sorted list of pairs < token , document id >
# Output: dictionary mapping each token to (document frequency, postings list)

from CONST import *
from Directory_Listing import ListFiles
from File_Reading import GetFileContents
from Tokenization import GenerateTokens
from Linguistic_Modules import LingModule
# SortTokens is defined in the sorting component (Example #2); its module file
# name is not shown in these excerpts, so this import is an assumption.
from Sorting import SortTokens


def TransformationIntoPostings(sorted_token_pairs):
    dictionary_ = {}
    previous_key = ''
    for key, value in sorted_token_pairs:

        if previous_key != key:
            previous_key = key
            dictionary_[previous_key] = [value]
        else:
            if value not in dictionary_[previous_key]:
                dictionary_[previous_key].append(value)
    for key in dictionary_:
        value = dictionary_[key]
        dictionary_[key] = (len(value), value)

    return dictionary_


if __name__ == "__main__":
    # Standalone Test
    fileList = ListFiles(rootDir)
    tokenPair = []
    for i in range(2):
        tokenPair = tokenPair + GenerateTokens(GetFileContents(fileList[i]),
                                               fileList[i])

    original_list = LingModule(tokenPair)
    sorted_list = SortTokens(original_list)
    # print(sorted_list)
    transformed_postings = TransformationIntoPostings(sorted_list)
    print(transformed_postings)
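
    # A tiny worked illustration with hypothetical pairs (not from the corpus):
    # each token ends up mapped to (document frequency, postings list).
    demo_pairs = [("apple", "1.txt"), ("apple", "2.txt"), ("tree", "1.txt")]
    print(TransformationIntoPostings(demo_pairs))
    # {'apple': (2, ['1.txt', '2.txt']), 'tree': (1, ['1.txt'])}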
Example #2
# Input to this component will be a concatenated list of tokens from all documents.
# Input: list of pairs < token , document id >
# Output: sorted list of pairs < token , document id >

from CONST import *
from Directory_Listing import ListFiles
from File_Reading import GetFileContents
from Tokenization import GenerateTokens
from Linguistic_Modules import LingModule


def SortTokens(token_pairs_list):
    sortedTokens = sorted(token_pairs_list,
                          key=lambda element: (element[0], element[1]))

    return sortedTokens


if __name__ == "__main__":
    # Standalone Test
    fileList = ListFiles(rootDir)
    fileContent_1 = GetFileContents(fileList[0])
    tokenPair_1 = GenerateTokens(fileContent_1, fileList[0])
    fileContent_2 = GetFileContents(fileList[1])
    tokenPair_2 = GenerateTokens(fileContent_2, fileList[1])

    tokenPair = tokenPair_1 + tokenPair_2
    original_list = LingModule(tokenPair)
    print(original_list)
    sorted_list = SortTokens(original_list)
    print(sorted_list)
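
    # Quick inline check with hypothetical pairs: sorting orders by token first,
    # then by document id, so duplicate tokens group together for the next stage.
    demo_pairs = [("tree", "2.txt"), ("apple", "2.txt"), ("tree", "1.txt")]
    print(SortTokens(demo_pairs))
    # [('apple', '2.txt'), ('tree', '1.txt'), ('tree', '2.txt')]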
Example #3
# End-to-end indexing driver built on a dictionary tree.

import os
import time

from CONST import *
from Directory_Listing import ListFiles
from File_Reading import GetFileContents
from Linguistic_Modules import LingStr
# Dic_Tree comes from a module not included in these excerpts;
# a minimal sketch of a plausible implementation follows below.
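
# Dic_Tree's real implementation is not shown here. The class below is a
# minimal sketch under that caveat: a character trie whose terminal nodes hold
# the set of documents containing the term; the driver below only relies on
# insert_many(terms, doc_id).
class Dic_Tree:
    def __init__(self):
        self.root = {}

    def insert(self, term, doc_id):
        # Walk (creating as needed) one child node per character.
        node = self.root
        for ch in term:
            node = node.setdefault(ch, {})
        # '$postings' cannot collide with single-character keys.
        node.setdefault('$postings', set()).add(doc_id)

    def insert_many(self, terms, doc_id):
        for term in terms:
            if term:  # LingStr can return '', which we skip
                self.insert(term, doc_id)
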
if __name__ == "__main__":

    pid = os.getpid()
    pyname = os.path.basename(__file__)

    start = time.time()

    # Directory Listing
    all_files = ListFiles(rootDir)
    #m_s = GetMemory(pid,pyname)
    dic_tree = Dic_Tree()

    for file in all_files:
        # File Reading
        file_text = GetFileContents(file)
        tokens = file_text.split()
        dic_tree.insert_many([LingStr(item) for item in tokens], file)

    del tokens
    del file_text

    #m_e = GetMemory(pid,pyname)
    time_index = (time.time() - start) * 1000

    print("Time for creating index:\t", time_index, "ms")
    #print("Memory for the index:\t", m_e - m_s, "KB")
    while True:
        query = input()
        q_start = time.time()
        queries = query.split()
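        # The source is truncated here; looking the query terms up in dic_tree
        # would continue at this point (that code is not shown in the excerpt).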
Example #4
# Linguistic Modules
# Input: list of pairs < token , document id >
# Output: list of pairs < normalized token , document id >; tokens that
# normalize to the empty string are dropped.

import re

from nltk.stem import PorterStemmer, SnowballStemmer

from CONST import *
from Directory_Listing import ListFiles
from File_Reading import GetFileContents
from Tokenization import GenerateTokens

portStemmer = PorterStemmer()
snowStemmer = SnowballStemmer("english")


def LingStr(token):
    tmp = re.sub(r'[\^\[\]\-\\!@#$%&*()_=+`~":;|/.,?{}<>\']', '',
                 str(token).lower())
    tmp = portStemmer.stem(str(tmp))
    tmp = snowStemmer.stem(str(tmp))
    return tmp


def LingModule(tokenPair):
    pairList = []
    for token, docID in tokenPair:
        tmp = LingStr(token)
        if tmp != "":
            pairList.append((tmp, docID))

    return pairList


if __name__ == "__main__":
    # Standalone test
    fileList = ListFiles(rootDir)
    fileContent = GetFileContents(fileList[0])
    tokenPair = GenerateTokens(fileContent, fileList[0])

    outputPair = LingModule(tokenPair)
    print(outputPair)
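
    # Small illustration with hypothetical tokens: LingStr lower-cases, strips
    # punctuation, then applies the Porter and Snowball stemmers in sequence.
    for demo_token in ["Running!", "trees,", "Better?"]:
        print(demo_token, "->", LingStr(demo_token))
    # "Running!" -> "run", "trees," -> "tree", "Better?" -> "better"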
Example #5
# Tokenization
# Input: text (file contents), string (document id = path to file)
# Output: list of pairs < string (token) , string (document id) >
import itertools

from CONST import *
from File_Reading import GetFileContents


def GenerateTokens(contents, filePath):
    tokens = contents.split()
    return list(zip(tokens, itertools.repeat(filePath)))


if __name__ == '__main__':
    # Standalone Test
    filePath = rootDir + "1.txt"
    contents = GetFileContents(filePath)
    tokens = GenerateTokens(contents, filePath)
    print(contents)
    print("tokens")
    print(tokens)
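
    # Inline illustration with a hypothetical snippet of text: each whitespace-
    # separated token is paired with the document id via zip + itertools.repeat.
    print(GenerateTokens("the quick fox", "demo.txt"))
    # [('the', 'demo.txt'), ('quick', 'demo.txt'), ('fox', 'demo.txt')]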
Example #6
# End-to-end indexing driver for the list-based pipeline.

import os
import time

from CONST import *
from Directory_Listing import ListFiles
from File_Reading import GetFileContents
from Tokenization import GenerateTokens
from Linguistic_Modules import LingModule
# The sorting and postings modules' file names are not shown in these
# excerpts; the two imports below are assumptions.
from Sorting import SortTokens
from Transformation_Into_Postings import TransformationIntoPostings
#from Get_Memory_Req import GetMemory

if __name__ == "__main__":

    pid = os.getpid()
    pyname = os.path.basename(__file__)

    start = time.time()

    # Directory Listing
    all_files = ListFiles(rootDir)
    all_token_pairs = []

    for file in all_files:
        # File Reading
        file_text = GetFileContents(file)
        # Tokenization
        token_pairs = GenerateTokens(file_text, file)
        # Linguistic Modules
        modified_token_pairs = LingModule(token_pairs)
        all_token_pairs += modified_token_pairs

    del file_text
    del token_pairs
    del modified_token_pairs
    # Sorting the Tokens
    sorted_tokens = SortTokens(all_token_pairs)
    del all_token_pairs
    # Transformation into Postings
    #m_s = GetMemory(pid,pyname)
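    # The source is truncated here. A hedged sketch of the remaining step,
    # mirroring Example #1's standalone test and Example #3's timing report:
    transformed_postings = TransformationIntoPostings(sorted_tokens)
    time_index = (time.time() - start) * 1000
    print("Time for creating index:\t", time_index, "ms")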