def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return {"vocabSizeb" :len(types)}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return {"vocabSize" :"HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return {"type/token" : int(100*len(types)/len(tokens))}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return {"vocabSizeb": len(types)}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return {"type/token": int(100 * len(types) / len(tokens))}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return {"type/tokenb" :"HIGH" if len(types)/len(tokens) > .5 else "MEDIUM" if len(types)/len(tokens) > .2 else "LOW"}
# NOTE: `Tokenize`, `Ngrams`, and `TaggingTools` are assumed to be
# project-local helper modules (they are referenced but never imported in the
# original source); `PerceptronTagger` comes from NLTK.
from nltk.tag import PerceptronTagger

import Ngrams
import TaggingTools
import Tokenize


def avgWordLength(text):
    tokens = Tokenize.byWord(text)
    total = 0  # renamed from `sum`, which shadowed the builtin
    count = 0
    for token in tokens:
        if token.isalpha():
            total += len(token)
            count += 1
    # Guard against texts with no alphabetic tokens.
    return {"AVG word Length": int(total / count) if count else 0}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) fd = Ngrams.getNgramFreqDist(text,n) topM = sorted([item for item in fd.items()],key=lambda x:x[1], reverse=True)[:m] vector = {} for i in range(len(topM)): vector["char#"+str(i)+" "+str(n)+"gramC"] = topM[i][0] return vector
def vocabSizeBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": "HIGH" if len(types) > 50
            else "MEDIUM" if len(types) > 20 else "LOW"}
def typeTokenRatioBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    # Guard against empty token lists.
    ratio = len(set(tokens)) / len(tokens) if tokens else 0
    return {"type/token": "HIGH" if ratio > .5
            else "MEDIUM" if ratio > .2 else "LOW"}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) POStags = [tag for word, tag in TaggingTools.tagPOS(text)] fd = Ngrams.getNgramFreqDist(POStags,n) topM = sorted([item for item in fd.items()],key=lambda x:x[1], reverse=True)[:m] vector = {} for i in range(len(topM)): vector["pos#"+str(i)+" "+str(n)+"gram"] = topM[i][0] return vector
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) fd = Ngrams.getNgramFreqDist(text, n) topM = sorted([item for item in fd.items()], key=lambda x: x[1], reverse=True)[:m] vector = {} for i in range(len(topM)): vector["char#" + str(i) + " " + str(n) + "gramC"] = topM[i][0] return vector
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return { "type/tokenb": "HIGH" if len(types) / len(tokens) > .5 else "MEDIUM" if len(types) / len(tokens) > .2 else "LOW" }
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) if lengthFilter != None: tokens = [token for token in tokens if len(token) >= lengthFilter] types = set(tokens) return { "vocabSize": "HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW" }
def percentOfUpperLetters(text):
    text = " ".join(text)
    uppers = 0
    total = 0
    for c in text:
        if c.isupper():
            uppers += 1
        total += 1
    # Guard against empty input.
    percent = int(100 * uppers / total) if total else 0
    return {"percentUpperCase": percent}
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) POStags = [tag for word, tag in TaggingTools.tagPOS(text)] fd = Ngrams.getNgramFreqDist(POStags, n) topM = sorted([item for item in fd.items()], key=lambda x: x[1], reverse=True)[:m] vector = {} for i in range(len(topM)): vector["pos#" + str(i) + " " + str(n) + "gram"] = topM[i][0] return vector
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) fd = Ngrams.getNgramFreqDist(text,n) topM = sorted([item for item in fd.items()],key=lambda x:x[1],reverse=True)[:m] #print(topM) total = 0 for p in topM: total += p[1] PDF = [] for p in topM: PDF.append((p[0],p[1]/total)) return dict(PDF[:m])
def feature(text): text = " ".join(text) tokens = Tokenize.byWord(text) words=[] if stem: words = Tokenize.byWordStem(text) else: words = Tokenize.byWordAlphaOnly(text) fd = Ngrams.getNgramFreqDist(words,n) topM = sorted([item for item in fd.items()],key=lambda x:x[1], reverse=True)[:m] vector = {} for i in range(len(topM)): vector["word#"+str(i)+" "+str(n)+"gramW"] = topM[i][0] return vector
def percentOfLetters(text):
    text = " ".join(text)
    vector = {}
    total = 0
    for i in range(26):
        vector["pL" + chr(i + ord('a'))] = 0
    for c in text.lower():
        if "pL" + c in vector:
            vector["pL" + c] += 1
        total += 1
    for i in range(26):
        key = "pL" + chr(i + ord('a'))
        # Percentages are relative to all characters; guard against empty input.
        vector[key] = int(100 * vector[key] / total) if total else 0
    return vector
def posDist(text): text = " ".join(text) tokens = Tokenize.byWord(text) POStags = [tag for word, tag in TaggingTools.tagPOS(text)] possibleTags = PerceptronTagger().model.classes vector = {} total = 0 for tag in possibleTags: vector[tag] = 0 for tag in POStags: vector[tag] += 1 total +=1 for tag in possibleTags: vector[tag] = int(100*vector[tag]/total) return vector
# `loadHSWords` is assumed to be defined elsewhere in the project; from its
# usage here it yields entries like {"word": ..., "score": ...}.
def featureNumericScore(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount += 1
    # Average the score over the sentiment words hit (avoid dividing by zero).
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    return {"HS raw score": score}


def featureBinaryScore(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount += 1
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    rating = "+" if score > 0 else "-"
    return {"HS rating": rating}
def featureHitCountBucketed(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    # Only the hit count matters here; the original also accumulated a score
    # that was never used, which has been dropped.
    sentimentWordCount = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                sentimentWordCount += 1
    return {"HS hit count": "HIGH" if sentimentWordCount > 8
            else "MEDIUM" if sentimentWordCount > 4 else "LOW"}
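# A usage sketch for the sentiment features. The lexicon entry shown is
# hypothetical; loadHSWords() is assumed to yield dicts like
# {"word": "good", "score": 3}:
#
#   featureNumericScore("a very good day")      # -> {"HS raw score": 3}
#   featureBinaryScore("a very good day")       # -> {"HS rating": "+"}
#   featureHitCountBucketed("a very good day")  # -> {"HS hit count": "LOW"}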
def vocabSize(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": len(types)}
def textLength(text): text = " ".join(text) tokens = Tokenize.byWord(text) return {"text Length" : len(Tokenize.byWord(text))}
def typeTokenRatio(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    # Guard against empty token lists.
    return {"type/token": len(set(tokens)) / len(tokens) if tokens else 0}
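# A minimal sketch, assuming `Tokenize.byWord` accepts a raw string, of how
# the string-based extractors can be merged into a single feature vector.
# `extractAll` is not part of the original source.
def extractAll(sample):
    vector = {}
    for f in (textLength, vocabSize, typeTokenRatio, avgWordLength):
        vector.update(f(sample))
    return vector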