def feature(text):
    # lengthFilter is a free variable bound in the enclosing scope.
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSizeb": len(types)}
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    # Type/token ratio as an integer percentage; assumes tokens is non-empty.
    return {"type/token": int(100 * len(types) / len(tokens))}
def avgWordLength(text):
    tokens = Tokenize.byWord(text)
    total = 0  # renamed from `sum` to avoid shadowing the built-in
    count = 0
    for token in tokens:
        if token.isalpha():
            total += len(token)
            count += 1
    # Guard against division by zero when there are no alphabetic tokens.
    return {"AVG word Length": int(total / count) if count else 0}
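These snippets all lean on helper modules (Tokenize, Ngrams, TaggingTools) and on free variables (lengthFilter, n, m, stem) whose definitions are not part of the excerpts. As a rough guide to what they must provide, here is a minimal, hypothetical stand-in; the names match the calls in the examples, but the behavior is an assumption, not the original implementation:

# Hypothetical stand-ins for the helpers the snippets assume; the real
# Tokenize/Ngrams modules are not shown in these examples.
import re
from collections import Counter

class Tokenize:
    @staticmethod
    def byWord(text):
        # Naive word/punctuation tokenizer; the original may use NLTK.
        return re.findall(r"\w+|[^\w\s]", text)

    @staticmethod
    def byWordAlphaOnly(text):
        return [t for t in Tokenize.byWord(text) if t.isalpha()]

    @staticmethod
    def byWordStem(text):
        # Crude suffix stripping as a stand-in for a real stemmer.
        return [re.sub(r"(ing|ed|s)$", "", t) for t in Tokenize.byWordAlphaOnly(text)]

class Ngrams:
    @staticmethod
    def getNgramFreqDist(seq, n):
        # Frequency distribution of n-grams over any sequence (str or list).
        return Counter(tuple(seq[i:i + n]) for i in range(len(seq) - n + 1))

# Free variables the feature() closures expect.
lengthFilter, n, m, stem = None, 2, 10, False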
Example #10
def vocabSizeBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": "HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"}
Example #11
def typeTokenRatioBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    ratio = len(types) / len(tokens)  # assumes tokens is non-empty
    return {"type/token": "HIGH" if ratio > .5 else "MEDIUM" if ratio > .2 else "LOW"}
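Bucketing continuous values into HIGH/MEDIUM/LOW, as the functions above do, keeps the features categorical. The same cutoff pattern can be factored into a small helper (a sketch; the thresholds shown are the ones used in these examples):

def bucket(value, high, medium):
    # Generic form of the HIGH/MEDIUM/LOW cutoffs used in these snippets.
    return "HIGH" if value > high else "MEDIUM" if value > medium else "LOW"

# e.g. bucket(len(types), 50, 20) or bucket(len(types) / len(tokens), .5, .2)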
Example #12
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    # n and m are free variables bound in the enclosing scope.
    fd = Ngrams.getNgramFreqDist(POStags, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["pos#" + str(i) + " " + str(n) + "gram"] = topM[i][0]
    return vector
Example #13
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    # Character-level n-grams are taken over the raw text, not the tokens.
    fd = Ngrams.getNgramFreqDist(text, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["char#" + str(i) + " " + str(n) + "gramC"] = topM[i][0]
    return vector
Example #14
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    ratio = len(types) / len(tokens)
    return {"type/tokenb": "HIGH" if ratio > .5 else "MEDIUM" if ratio > .2 else "LOW"}
Example #15
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": "HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"}
Example #16
def percentOfUpperLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    uppers = 0
    total = 0
    for c in text:
        if c.isupper():
            uppers += 1
        total += 1
    # Guard against division by zero on empty input.
    percent = int(100 * uppers / total) if total else 0
    return {"percentUpperCase": percent}
Example #17
def percentOfUpperLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    uppers = 0
    total = 0
    for c in text:
        if c.isupper():
            uppers += 1
        total += 1
    # Guard against division by zero on empty input.
    percent = int(100 * uppers / total) if total else 0
    return {"percentUpperCase": percent}
Example #18
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    # n and m are free variables bound in the enclosing scope.
    fd = Ngrams.getNgramFreqDist(POStags, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["pos#" + str(i) + " " + str(n) + "gram"] = topM[i][0]
    return vector
Example #19
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    fd = Ngrams.getNgramFreqDist(text, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    # Normalize the top-m counts so they form a probability distribution.
    total = 0
    for p in topM:
        total += p[1]
    PDF = []
    for p in topM:
        PDF.append((p[0], p[1] / total))
    return dict(PDF)
Example #20
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    # stem, n, and m are free variables bound in the enclosing scope.
    if stem:
        words = Tokenize.byWordStem(text)
    else:
        words = Tokenize.byWordAlphaOnly(text)
    fd = Ngrams.getNgramFreqDist(words, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["word#" + str(i) + " " + str(n) + "gramW"] = topM[i][0]
    return vector
Example #21
def percentOfLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    vector = {}
    total = 0
    for i in range(26):
        vector["pL" + chr(i + ord('a'))] = 0
    for c in text.lower():
        if "pL" + c in vector:
            vector["pL" + c] += 1
            total += 1
    # Convert counts to integer percentages; guard against empty input.
    for i in range(26):
        key = "pL" + chr(i + ord('a'))
        vector[key] = int(100 * vector[key] / total) if total else 0
    return vector
Example #22
def posDist(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    # NLTK's PerceptronTagger exposes its full tagset via model.classes.
    possibleTags = PerceptronTagger().model.classes
    vector = {}
    total = 0
    for tag in possibleTags:
        vector[tag] = 0
    for tag in POStags:
        vector[tag] += 1
        total += 1
    for tag in possibleTags:
        vector[tag] = int(100 * vector[tag] / total)
    return vector
Example #23
def featureNumericScore(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount += 1
    # Average the raw score over the number of lexicon hits (avoid /0).
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    return {"HS raw score": score}
Example #24
def featureBinaryScore(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount += 1
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    rating = "+" if score > 0 else "-"
    return {"HS rating": rating}
Example #25
def posDist(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    # NLTK's PerceptronTagger exposes its full tagset via model.classes.
    possibleTags = PerceptronTagger().model.classes
    vector = {}
    total = 0
    for tag in possibleTags:
        vector[tag] = 0
    for tag in POStags:
        vector[tag] += 1
        total += 1
    for tag in possibleTags:
        vector[tag] = int(100 * vector[tag] / total)
    return vector
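PerceptronTagger here looks like NLTK's averaged perceptron tagger, whose trained model exposes the tagset as model.classes; a quick check of that assumption (requires the averaged_perceptron_tagger data):

from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()
print(sorted(tagger.model.classes))  # Penn Treebank tags: 'CC', 'CD', 'DT', ...
print(tagger.tag(["A", "quick", "brown", "fox"]))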
Example #26
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    fd = Ngrams.getNgramFreqDist(text, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    # Normalize the top-m counts so they form a probability distribution.
    total = 0
    for p in topM:
        total += p[1]
    PDF = []
    for p in topM:
        PDF.append((p[0], p[1] / total))
    return dict(PDF)
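The loop above simply renormalizes the top-m n-gram counts into relative frequencies. For comparison, a self-contained sketch of the same computation with collections.Counter (character bigrams assumed):

from collections import Counter

def top_m_char_ngram_pdf(text, n=2, m=5):
    # Count character n-grams, keep the m most frequent, and rescale
    # their counts so they sum to 1.
    counts = Counter(text[i:i + n] for i in range(len(text) - n + 1))
    top = counts.most_common(m)
    total = sum(c for _, c in top)
    return {gram: c / total for gram, c in top}

print(top_m_char_ngram_pdf("the theremin thesis", n=2, m=3))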
Example #27
def featureHitCountBucketed(sample):
    words = Tokenize.byWord(sample)
    HSWords = loadHSWords()
    sentimentWordCount = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                sentimentWordCount += 1
    # Only the hit count matters here; the summed score was never used.
    return {"HS hit count": "HIGH" if sentimentWordCount > 8 else "MEDIUM" if sentimentWordCount > 4 else "LOW"}
Example #28
def percentOfLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    vector = {}
    total = 0
    for i in range(26):
        vector["pL" + chr(i + ord('a'))] = 0
    for c in text.lower():
        if "pL" + c in vector:
            vector["pL" + c] += 1
            total += 1
    # Convert counts to integer percentages; guard against empty input.
    for i in range(26):
        key = "pL" + chr(i + ord('a'))
        vector[key] = int(100 * vector[key] / total) if total else 0
    return vector
Example #29
def feature(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    # stem, n, and m are free variables bound in the enclosing scope.
    if stem:
        words = Tokenize.byWordStem(text)
    else:
        words = Tokenize.byWordAlphaOnly(text)
    fd = Ngrams.getNgramFreqDist(words, n)
    topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
    vector = {}
    for i in range(len(topM)):
        vector["word#" + str(i) + " " + str(n) + "gramW"] = topM[i][0]
    return vector
Example #30
def vocabSize(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": len(types)}
Example #31
def vocabSizeBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": "HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"}
Example #32
def typeTokenRatioBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    ratio = len(types) / len(tokens)  # assumes tokens is non-empty
    return {"type/token": "HIGH" if ratio > .5 else "MEDIUM" if ratio > .2 else "LOW"}
Example #33
def textLength(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    return {"text Length": len(tokens)}
Example #34
def vocabSize(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {"vocabSize": len(types)}
Example #35
def typeTokenRatio(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    # Assumes tokens is non-empty after filtering.
    return {"type/token": len(types) / len(tokens)}
Example #36
def textLength(text):
    return {"text Length": len(Tokenize.byWord(text))}
Example #37
def textLength(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    return {"text Length": len(tokens)}
Example #38
def textLength(text):
    return {"text Length": len(Tokenize.byWord(text))}
Example #39
def typeTokenRatio(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter is not None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    # Assumes tokens is non-empty after filtering.
    return {"type/token": len(types) / len(tokens)}