def __init__(self, sentenceStr, position):
    self.string = sentenceStr
    # Lowercase the first character, so the sentence-initial word
    # matches its mid-sentence form.
    if sentenceStr[0].isupper():
        letters = list(sentenceStr)
        letters[0] = letters[0].lower()
        sentenceStr = "".join(letters)
    tokens = word_tokenise(sentenceStr)
    bow = defaultdict(int)
    for token in tokens:
        term = standardise(token)
        if term:
            hashed = hash(term)
            bow[hashed] += 1
    self.bagOfWords = bow
    self.position = position
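# A minimal sketch of the helpers these methods call (word_tokenise,
# standardise, utf8open, localPath) plus the imports they need. The
# original definitions are not shown in this section, so everything
# below is an assumption written only so the snippets can run in
# isolation; the real implementations may differ.
import codecs
import collections
import os
import re
from collections import defaultdict
from math import log

def localPath(filename):
    # Assumed: resolve a data file relative to this module.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)

def utf8open(loc):
    # "Convenience for codecs.open", per the comment in _loadIDFs.
    return codecs.open(loc, 'r', encoding='utf8')

def word_tokenise(text):
    # Assumed: crude tokeniser splitting words from punctuation.
    return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)

def standardise(token):
    # Assumed: lowercase and strip non-alphanumerics; returns '' for
    # tokens that standardise to nothing, hence the `if term:` guards.
    return re.sub(r'\W', '', token.lower())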
def loadWeights(n):
    dfLoc = localPath("wiki_doc_freqs_trim.dat")
    # Read in the document freqs.
    # Have to do this first because we collapse some freqs through
    # standardisation.
    weights = defaultdict(int)
    for line in utf8open(dfLoc):
        term, freq = line.split("\t")
        term = standardise(term)
        if term:
            weights[hash(term)] += int(freq)
    # Turn the frequencies into IDF weights, truncated to ints.
    for term, freq in weights.items():
        idf = log(n / freq, 10)
        weights[term] = int(idf)
    IDFWeightedDocument.weights = weights
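# Hypothetical usage of loadWeights, assuming it is exposed as a
# staticmethod of IDFWeightedDocument and that the __init__ above
# belongs to the same class. The corpus size is an illustrative guess,
# not a value from the source.
n_docs = 4_000_000
IDFWeightedDocument.loadWeights(n_docs)
doc = IDFWeightedDocument("The cat sat on the mat.", position=0)
# Both bagOfWords and weights are keyed by hash(term), so a simple
# TF-IDF-style score is a dot product over the shared hashed keys.
score = sum(count * IDFWeightedDocument.weights[h]
            for h, count in doc.bagOfWords.items())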
def _loadIDFs(self, n):
    dfLoc = localPath('wiki_doc_freqs_trim.dat')
    dfs = collections.defaultdict(int)
    # utf8open is a convenience for codecs.open.
    lines = utf8open(dfLoc).read().strip().split('\n')
    # Read in the document freqs.
    # Have to do this first because we collapse some freqs
    # through standardisation.
    for line in lines:
        token, freq = line.split('\t')
        token = standardise(token)
        if token:
            dfs[token] += int(freq)
    # Turn the frequencies into IDF weights.
    idfs = collections.defaultdict(float)
    for token, freq in dfs.items():
        idf = log(n / freq, 10)
        idfs[token] = idf
    return idfs
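# Worked example of the IDF formula both loaders share,
# idf(t) = log10(n / df(t)): with n = 1,000,000 documents and a term
# appearing in 100 of them, idf = log10(10000) = 4.0.
from math import log
idf = log(1_000_000 / 100, 10)  # 4.0
# The two loaders differ only in representation: loadWeights stores
# int(idf) == 4 under hash(term) as a class attribute, while _loadIDFs
# returns the float 4.0 keyed by the raw token string.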