Esempio n. 1
0
def createDocumentVector(fileObj, tfRawD):
    """Tokenise one document and add its word counts to ``tfRawD``.

    The text comes from ``FileReader(fileObj).content``. Before splitting
    on whitespace, every character whose code point is below 48 (``'0'``),
    other than ``'.'``, or lies strictly between 57 (``'9'``) and
    65 (``'A'``) is replaced by a single space; all other characters are
    kept. Each resulting token is passed through ``modify_word`` and
    tokens for which it returns a falsy value are ignored.

    Args:
        fileObj: Passed unchanged to ``FileReader``.
        tfRawD: Mapping of word -> running count; updated in place.

    Returns:
        A 4-tuple ``(unique_words, tokens, token_count, tfRawD)``:
        the set of accepted words (as returned by ``modify_word``), the
        list of raw whitespace-separated tokens, the length of that list,
        and the same ``tfRawD`` object that was passed in.
    """
    raw_text = FileReader(fileObj).content
    cleaned = ''.join(
        ' ' if (ord(ch) < 48 and ch != '.') or 57 < ord(ch) < 65 else ch
        for ch in raw_text)
    tokens = cleaned.strip().split()

    seen = set()
    for token in tokens:
        normalised = modify_word(token)
        if not normalised:
            # modify_word rejected this token; it is not counted.
            continue
        seen.add(normalised)
        if normalised in tfRawD:
            tfRawD[normalised] += 1
        else:
            tfRawD[normalised] = 1
    return seen, tokens, len(tokens), tfRawD
Esempio n. 2
0
def createTopicVector(fileObj, T, tfRaw):
    """Tokenise one file, append its tokens to ``T`` and count words in ``tfRaw``.

    The text comes from ``FileReader(fileObj).content``. Before splitting
    on whitespace, every character whose code point is below 48 (``'0'``),
    other than ``'.'``, or lies strictly between 57 (``'9'``) and
    65 (``'A'``) is replaced by a single space; all other characters are
    kept. Each token is then passed through ``modify_word`` and tokens for
    which it returns a falsy value are ignored when counting.

    Args:
        fileObj: Passed unchanged to ``FileReader``.
        T: List-like object; extended in place with the raw tokens (the
            values before ``modify_word`` is applied).
        tfRaw: Mapping of word -> running count; updated in place.

    Returns:
        A 2-tuple ``(unique_words, token_count)``: the set of accepted
        words (as returned by ``modify_word``) and the number of raw
        whitespace-separated tokens.
    """
    raw_text = FileReader(fileObj).content
    cleaned = ''.join(
        ' ' if (ord(ch) < 48 and ch != '.') or 57 < ord(ch) < 65 else ch
        for ch in raw_text)
    tokens = cleaned.strip().split()
    token_count = len(tokens)

    # The raw tokens are recorded before any per-word processing happens.
    T.extend(tokens)

    accepted = set()
    for token in tokens:
        normalised = modify_word(token)
        if not normalised:
            continue
        accepted.add(normalised)
        if normalised in tfRaw:
            tfRaw[normalised] += 1
        else:
            tfRaw[normalised] = 1
    return accepted, token_count
Esempio n. 3
0
def createDocumentVector(fileObj, tfRawD):
    """Split a document into tokens and accumulate word counts.

    Reads ``FileReader(fileObj).content``, blanks out separator
    characters, splits the result on whitespace and runs every token
    through ``modify_word``. Tokens for which ``modify_word`` returns a
    falsy value are skipped; the rest are added to the result set and
    counted in ``tfRawD``.

    Args:
        fileObj: Passed unchanged to ``FileReader``.
        tfRawD: Mapping of word -> running count; mutated in place.

    Returns:
        ``(unique_words, tokens, token_count, tfRawD)`` where ``tokens``
        is the list of raw tokens (before ``modify_word``),
        ``token_count`` is its length and ``tfRawD`` is the very object
        that was passed in.
    """

    def blank_out(ch):
        # Characters below '0' (code 48) other than '.', and those between
        # '9' (57) and 'A' (65), become a space; everything else is kept.
        code = ord(ch)
        below_digits = code < 48 and ch != '.'
        between_digits_and_upper = 57 < code < 65
        return ' ' if below_digits or between_digits_and_upper else ch

    text = ''.join(map(blank_out, FileReader(fileObj).content))
    tokens = text.strip().split()
    token_count = len(tokens)

    unique_words = set()
    for raw_token in tokens:
        term = modify_word(raw_token)
        if term:
            unique_words.add(term)
            if term in tfRawD:
                tfRawD[term] += 1
            else:
                tfRawD[term] = 1
    return unique_words, tokens, token_count, tfRawD
Esempio n. 4
0
def createTopicVector(fileObj, T, tfRaw):
    """Split a file into tokens, extend ``T`` with them and count words.

    Reads ``FileReader(fileObj).content``, blanks out separator
    characters and splits the result on whitespace. The raw tokens are
    appended to ``T``; each token is then run through ``modify_word`` and,
    unless the result is falsy, added to the returned set and counted in
    ``tfRaw``.

    Args:
        fileObj: Passed unchanged to ``FileReader``.
        T: List-like object; extended in place with the raw tokens.
        tfRaw: Mapping of word -> running count; mutated in place.

    Returns:
        ``(unique_words, token_count)`` where ``token_count`` is the
        number of raw whitespace-separated tokens.
    """

    def blank_out(ch):
        # Characters below '0' (code 48) other than '.', and those between
        # '9' (57) and 'A' (65), become a space; everything else is kept.
        code = ord(ch)
        below_digits = code < 48 and ch != '.'
        between_digits_and_upper = 57 < code < 65
        return ' ' if below_digits or between_digits_and_upper else ch

    text = ''.join(map(blank_out, FileReader(fileObj).content))
    tokens = text.strip().split()
    token_count = len(tokens)

    # Raw tokens go into T before the per-word processing below.
    T.extend(tokens)

    unique_words = set()
    for raw_token in tokens:
        term = modify_word(raw_token)
        if term:
            unique_words.add(term)
            if term in tfRaw:
                tfRaw[term] += 1
            else:
                tfRaw[term] = 1
    return unique_words, token_count