Example #1
 def __init__(self, sentenceStr, position):
     self.string = sentenceStr
     # Lowercase the first character so sentence-initial
     # capitalisation doesn't produce a distinct term.
     if sentenceStr and sentenceStr[0].isupper():
         letters = list(sentenceStr)
         letters[0] = letters[0].lower()
         sentenceStr = "".join(letters)
     tokens = word_tokenise(sentenceStr)
     # Build a bag-of-words keyed on the hash of each standardised
     # term. Python's built-in hash() is randomised per process, so
     # these keys only line up with weights hashed in the same run.
     bow = defaultdict(int)
     for token in tokens:
         term = standardise(token)
         if term:
             hashed = hash(term)
             bow[hashed] += 1
     self.bagOfWords = bow
     self.position = position
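The constructor above depends on word_tokenise and standardise, which live elsewhere in the project and aren't shown. A minimal, self-contained sketch of the same bag-of-words construction, with stand-in versions of both helpers (hypothetical stubs, not the project's real implementations):

 from collections import defaultdict

 def word_tokenise(text):
     # Stand-in: plain whitespace split; the real tokeniser
     # presumably handles punctuation and contractions properly.
     return text.split()

 def standardise(token):
     # Stand-in: strip surrounding punctuation and lowercase.
     # Tokens that normalise to nothing come back as "" and are
     # skipped by the caller's truthiness check.
     return token.strip(".,;:!?\"'").lower()

 bow = defaultdict(int)
 for token in word_tokenise("the cat sat on the mat ."):
     term = standardise(token)
     if term:
         bow[hash(term)] += 1
 # bow now maps hash("the") -> 2, hash("cat") -> 1, and so on.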
Example #2
 def loadWeights(n):
     dfLoc = localPath("wiki_doc_freqs_trim.dat")
     # Read in the document frequencies first: standardisation
     # collapses some terms together, so their freqs must be summed
     # before any weighting.
     weights = defaultdict(int)
     for line in utf8open(dfLoc):
         term, freq = line.split("\t")
         term = standardise(term)
         if term:
             weights[hash(term)] += int(freq)
     # Turn the frequencies into IDF weights: idf = log10(n / df).
     # The key set doesn't change, so rewriting values while
     # iterating items() is safe.
     for term, freq in weights.items():
         idf = log(n / freq, 10)
         # Truncated to an int, matching the defaultdict(int) store.
         weights[term] = int(idf)
     IDFWeightedDocument.weights = weights
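To make the weighting concrete: with n = 1000000 documents, a term found in 1000 of them gets log10(1000000 / 1000) = 3, while a term found in every document gets 0 and carries no discriminative value. A quick check with illustrative numbers (not taken from the actual wiki frequency file):

 from math import log

 n = 1000000
 for df in (1000000, 10000, 1000, 10):
     # idf = log10(n / df): rarer terms get larger weights.
     print(df, log(n / df, 10))
 # 1000000 0.0
 # 10000 2.0
 # 1000 3.0
 # 10 5.0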
Example #3
 def _loadIDFs(self, n):
     dfLoc = localPath('wiki_doc_freqs_trim.dat')
     dfs = collections.defaultdict(int)
     # utf8open is a convenience wrapper around codecs.open.
     lines = utf8open(dfLoc).read().strip().split('\n')
     # Read in the document frequencies first: standardisation
     # collapses some tokens together, so their freqs must be
     # summed before any weighting.
     for line in lines:
         token, freq = line.split('\t')
         token = standardise(token)
         if token:
             dfs[token] += int(freq)
     # Turn the frequencies into IDF weights: idf = log10(n / df).
     # Unlike Example #2, this version keys on the raw token and
     # keeps the full float value.
     idfs = collections.defaultdict(float)
     for token, freq in dfs.items():
         idf = log(n / freq, 10)
         idfs[token] = idf
     return idfs
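Once weights are loaded, a document score is typically a weighted sum over its bag-of-words. The scoring helper below is my own illustration of how the pieces from these examples could fit together, not code from the source project; it assumes the bag and the IDF table are keyed the same way (both on raw tokens, or both on hash() values produced in the same process, since Python randomises string hashing per run):

 import collections

 def idf_score(bow, idfs):
     # Sum each term's IDF weight times its in-document count.
     # Unknown terms fall back to the defaultdict's 0.0.
     return sum(count * idfs[term] for term, count in bow.items())

 idfs = collections.defaultdict(float, {"cat": 3.0, "mat": 2.5})
 bow = {"the": 2, "cat": 1, "mat": 1}
 print(idf_score(bow, idfs))  # 5.5 -- "the" contributes nothing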