def avgWordLengthBucketed(text):
    """Bucket the average word length of *text* into a coarse label.

    Tokenizes *text* with Tokenize.byWordAlphaOnly, averages the token
    lengths, and maps the average to "Long" (> 6), "Medium" (> 4) or
    "Short" (otherwise).

    Returns:
        dict: {"AVG word Length": label}

    Fixes vs. original: no longer shadows the builtin ``sum``; empty input
    yields "Short" instead of raising ZeroDivisionError.
    """
    # Materialize in case the tokenizer returns a generator — we need len().
    tokens = list(Tokenize.byWordAlphaOnly(text))
    total = sum(len(token) for token in tokens)
    count = len(tokens)
    # Guard empty input: the original raised ZeroDivisionError here.
    numericValue = int(total / count) if count else 0
    bucketLabel = "Long" if numericValue > 6 else "Medium" if numericValue > 4 else "Short"
    return {"AVG word Length": bucketLabel}
def avgWordLengthBucketed(text):
    """Bucket the average word length of *text* into "Short"/"Medium"/"Long".

    Average token length > 6 maps to "Long", > 4 to "Medium", else "Short".

    Returns:
        dict: {"AVG word Length": label}

    Fixes vs. original: the accumulator no longer shadows the builtin
    ``sum``, and empty input returns "Short" instead of raising
    ZeroDivisionError.
    """
    # list() so we can take len() even if the tokenizer yields lazily.
    tokens = list(Tokenize.byWordAlphaOnly(text))
    count = len(tokens)
    # Guard division by zero on empty input (original crashed here).
    average = int(sum(len(t) for t in tokens) / count) if count else 0
    if average > 6:
        bucketLabel = "Long"
    elif average > 4:
        bucketLabel = "Medium"
    else:
        bucketLabel = "Short"
    return {"AVG word Length": bucketLabel}
def avgWordLength(text):
    """Average length of the *unique* alphabetic words in *text*.

    Args:
        text: an iterable of strings; joined with spaces before tokenizing
              (so callers pass a list of lines/sentences).

    Returns:
        dict: {"AVG word Length": int average over unique alphabetic tokens}

    Fixes vs. original: no longer shadows the builtin ``sum``; empty input
    yields 0 instead of raising ZeroDivisionError.
    """
    text = " ".join(text)
    tokens = Tokenize.byWordAlphaOnly(text)
    # Deduplicate so repeated words do not skew the average (original intent).
    unique_words = [t for t in set(tokens) if t.isalpha()]
    count = len(unique_words)
    total = sum(len(w) for w in unique_words)
    # Guard empty input: the original raised ZeroDivisionError here.
    return {"AVG word Length": int(total / count) if count else 0}
def avgWordLength(text):
    """Compute the mean length of unique alphabetic words in *text*.

    Args:
        text: iterable of strings, joined with spaces before tokenization.

    Returns:
        dict: {"AVG word Length": truncated integer average; 0 if no
              alphabetic tokens are found}

    Fixes vs. original: drops the shadowing of builtin ``sum`` and the
    ZeroDivisionError on empty input.
    """
    joined = " ".join(text)
    # set() dedupes; keep only purely alphabetic tokens, as the original did.
    candidates = {t for t in Tokenize.byWordAlphaOnly(joined) if t.isalpha()}
    if not candidates:
        # Original crashed with ZeroDivisionError; return a neutral value.
        return {"AVG word Length": 0}
    total_chars = sum(len(word) for word in candidates)
    return {"AVG word Length": int(total_chars / len(candidates))}
def feature(text):
    """Return the top-m most frequent word n-grams of *text* as a feature dict.

    Relies on module-level names ``stem``, ``n`` and ``m`` (configuration
    globals, defined elsewhere in this file — TODO confirm).

    Args:
        text: iterable of strings, joined with spaces before tokenization.

    Returns:
        dict: {"word#<i> <n>gramW": i-th most frequent n-gram} for i < m.
    """
    text = " ".join(text)
    # NOTE(review): the original also called Tokenize.byWord(text) and
    # discarded the result; removed as dead code (assumed side-effect free).
    if stem:
        words = Tokenize.byWordStem(text)
    else:
        words = Tokenize.byWordAlphaOnly(text)
    fd = Ngrams.getNgramFreqDist(words, n)
    # sorted() accepts the items view directly — no intermediate list needed.
    topM = sorted(fd.items(), key=lambda kv: kv[1], reverse=True)[:m]
    vector = {}
    for i, (gram, _freq) in enumerate(topM):
        vector["word#" + str(i) + " " + str(n) + "gramW"] = gram
    return vector
def feature(text):
    """Build a feature dict of the m most frequent word n-grams in *text*.

    Uses module-level configuration globals ``stem``, ``n`` and ``m``
    (presumably set elsewhere in this file — verify against callers).

    Args:
        text: iterable of strings, joined with spaces before tokenization.

    Returns:
        dict: keys "word#<i> <n>gramW" mapped to the i-th most frequent
        n-gram, for i in [0, m).
    """
    joined = " ".join(text)
    # NOTE(review): dropped the original's unused Tokenize.byWord(joined)
    # call — its result was never read (assumed side-effect free; confirm).
    words = Tokenize.byWordStem(joined) if stem else Tokenize.byWordAlphaOnly(joined)
    freq_dist = Ngrams.getNgramFreqDist(words, n)
    ranked = sorted(freq_dist.items(), key=lambda item: item[1], reverse=True)
    vector = {}
    for rank, (gram, _count) in enumerate(ranked[:m]):
        vector["word#" + str(rank) + " " + str(n) + "gramW"] = gram
    return vector
def wordLengthDist(text):
    """Percentage distribution of unique-word lengths over buckets 1..10.

    Words of length >= 10 all fall into the "10long" bucket. Duplicates are
    removed before counting, so this is a distribution over unique words.

    Args:
        text: iterable of strings, joined with spaces before tokenization.

    Returns:
        dict: {"%ofwords<i>long": integer percentage} for i in 1..10.

    Fixes vs. original: removes the unused ``count`` local and returns
    all-zero buckets for empty input instead of raising ZeroDivisionError.
    """
    text = " ".join(text)
    # set() dedupes, matching the original's list(set(...)).
    words = set(Tokenize.byWordAlphaOnly(text))
    vector = {"%ofwords" + str(i) + "long": 0 for i in range(1, 11)}
    total = len(words)
    for word in words:
        # Lengths 1..9 get their own bucket; everything longer shares 10.
        bucket = len(word) if len(word) < 10 else 10
        vector["%ofwords" + str(bucket) + "long"] += 1
    if total:  # guard empty input — original raised ZeroDivisionError
        for i in range(1, 11):
            key = "%ofwords" + str(i) + "long"
            vector[key] = int(100 * vector[key] / total)
    return vector
def wordLengthDist(text):
    """Histogram (as integer percentages) of unique-word lengths, 1..10.

    Each unique alphabetic token is assigned to the bucket matching its
    length; lengths of 10 or more share the final "10long" bucket.

    Args:
        text: iterable of strings, joined with spaces before tokenization.

    Returns:
        dict: {"%ofwords<i>long": int percentage} for i in 1..10; all zeros
        when the input produces no tokens.

    Fixes vs. original: drops the dead ``count`` variable and avoids
    ZeroDivisionError on empty input.
    """
    joined = " ".join(text)
    unique_words = set(Tokenize.byWordAlphaOnly(joined))
    buckets = {i: 0 for i in range(1, 11)}
    for word in unique_words:
        # min() clamps long words into the shared top bucket.
        buckets[min(len(word), 10)] += 1
    total = len(unique_words)
    if total:  # empty input previously crashed with ZeroDivisionError
        for i in buckets:
            buckets[i] = int(100 * buckets[i] / total)
    return {"%ofwords" + str(i) + "long": buckets[i] for i in range(1, 11)}