Example #1
    def test_tokenize(self):
        text = \
"""("hi, I am Wenjing"  #t  #f result cond 123
"""
        results = list(Tokenize.tokenize(text, 0))
        self.assertEqual(len(results), 7)
        self.assertTokenEqual(results[0], Token(Tokens.LPAREN, 0))
        self.assertTokenEqual(results[1], Token(Tokens.STRING, 1, "hi, I am Wenjing"))
        self.assertTokenEqual(results[2], Token(Tokens.TRUE, 21))
        self.assertTokenEqual(results[3], Token(Tokens.FALSE, 25))
        self.assertTokenEqual(results[4], Token(Tokens.VARIABLE, 28, "result"))
        self.assertTokenEqual(results[5], Token(Tokens.COND, 35))
        self.assertTokenEqual(results[6], Token(Tokens.NUMBER, 40, 123))

        text = "(('))"
        results = list(Tokenize.tokenize(text, 0))
        self.assertEqual(len(results), 5)
        self.assertTokenEqual(results[0], Token(Tokens.LPAREN, 0))
        self.assertTokenEqual(results[1], Token(Tokens.LPAREN, 1))
        self.assertTokenEqual(results[2], Token(Tokens.QUOTE, 2))
        self.assertTokenEqual(results[3], Token(Tokens.RPAREN, 3))
        self.assertTokenEqual(results[4], Token(Tokens.RPAREN, 4))

        text = """(cond "hello" 12)"""
        results = list(Tokenize.tokenize(text, 0))
        self.assertEqual(len(results), 5)
        self.assertTokenEqual(results[0], Token(Tokens.LPAREN, 0))
        self.assertTokenEqual(results[1], Token(Tokens.COND, 1))
        self.assertTokenEqual(results[2], Token(Tokens.STRING, 6, "hello"))
        self.assertTokenEqual(results[3], Token(Tokens.NUMBER, 14, 12))
        self.assertTokenEqual(results[4], Token(Tokens.RPAREN, 16))
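The assertions above construct Token objects from a token type, a source position, and an optional value. A minimal stand-in for that record, with field names inferred from the test rather than taken from the project, might look like this:

from collections import namedtuple

# Hypothetical Token record; the value field defaults to None for tokens such as
# LPAREN that carry no payload (an assumption based on the two- and three-argument
# calls in the test above).
Token = namedtuple("Token", ["type", "pos", "value"], defaults=(None,))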
Example #2
    def test_skip(self):
        text = "\n  ;This line is leave blank\n  ;code will begin\n  (cons 1 2)"
        pos = Tokenize._skip(text, 0)
        self.assertEqual(text[pos:], "(cons 1 2)")

        #another test case
        text = "Nil"
        pos = Tokenize._skip(text, 0)
        self.assertEqual(text[pos:], "Nil")
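The two cases show that Tokenize._skip jumps over whitespace and ;-comments but leaves ordinary content such as "Nil" untouched. A sketch of a helper with that behaviour, inferred from the test rather than copied from the project:

def skip(text, pos):
    # Advance past whitespace and ;-comment lines until real content or end of text.
    while pos < len(text):
        if text[pos].isspace():
            pos += 1
        elif text[pos] == ";":
            newline = text.find("\n", pos)
            pos = len(text) if newline == -1 else newline + 1
        else:
            break
    return pos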
Example #3
def isCreditCard(text):
    credit = Tokenize.nGram(text, "credit card")
    ticket = Tokenize.nGram(text, "bank ticket")
    if credit > ticket:
        return True
    elif ticket > credit:
        return False
    else:
        return None
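A hypothetical call site for this helper; the sample sentence and the handling of the tie case are assumptions, not part of the original code:

choice = isCreditCard("I would like to pay with my credit card")
if choice is None:
    print("Could not decide between credit card and bank ticket.")
elif choice:
    print("Customer chose credit card.")
else:
    print("Customer chose bank ticket.")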
Example #4
    def test_skipToNextLine(self):
        textWith3Lines = "line1\nline2\nline3"
        pos = Tokenize._skipToNextLine(textWith3Lines, 0)
        self.assertEqual(textWith3Lines[pos:], "line2\nline3")

        pos = Tokenize._skipToNextLine(textWith3Lines, pos)
        self.assertEqual(textWith3Lines[pos:], "line3")

        pos = Tokenize._skipToNextLine(textWith3Lines, pos)
        self.assertEqual(textWith3Lines[pos:], "")
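A minimal implementation consistent with these assertions (a sketch, not the project's actual helper):

def skip_to_next_line(text, pos):
    # Return the index just past the next newline, or len(text) if there is none.
    newline = text.find("\n", pos)
    return len(text) if newline == -1 else newline + 1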
Example #5
    def test_skipWhitespaces(self):
        # text with leading spaces
        text = "    \n\r\t    \n\r\t  text"
        pos = Tokenize._skipWhitespaces(text, 0)
        self.assertEqual("text", text[pos:])

        # text with no leading space
        test = "text"
        pos = Tokenize._skipWhitespaces(text, 0)
        self.assertEqual("text", text[pos:])
Example #6
def lemmatize_sentence(sentence):
    stop_words = stopwords.words('english')
    tokens = Tokenize.tokenize_word(sentence)
    without_stopwords = [word.lower() for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(without_stopwords):
        mapped_tag = Tokenize.tag_map(tag[0])
        lemma = lemmatizer.lemmatize(token, mapped_tag)
        lemmas.append(lemma)
    return lemmas
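WordNetLemmatizer expects WordNet POS constants, so Tokenize.tag_map presumably maps the first letter of an NLTK Penn Treebank tag to one of them. A common way to do that, shown here only as an assumption about what tag_map does:

from nltk.corpus import wordnet

def tag_map(tag_initial):
    # Map J/V/R to adjective/verb/adverb and fall back to noun for everything else.
    return {"J": wordnet.ADJ, "V": wordnet.VERB, "R": wordnet.ADV}.get(tag_initial, wordnet.NOUN)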
Example #7
def searchGame(text, corpus):

    text = Tokenize.tratarTexto(text)
    gameList = {}
    searchList = corpus
    for key in searchList:
        searchList[key]["coe"] = Tokenize.coeficienteSimilaridade(
            text, searchList[key]["text"], searchList)
        if searchList[key]["coe"] > 0:
            gameList[key] = searchList[key]["coe"]
    return gameList
Example #8
    def test_literal_expression(self):
        tokens = Tokenize.tokenize("'(cons a b) 'c", 0)
        result = list(parse(tokens))

        partialTokens = Tokenize.tokenize("(cons a b)", 0)
        partialResult = list(parse(partialTokens))

        self.assertEqual(result[0][1].valueType, Values.LITERAL)
        self.assertEqual(result[0][1].val, partialResult[0][1])

        self.assertEqual(result[1][1].valueType, Values.LITERAL)
        self.assertEqual(result[1][1].val, makeSymbol("c"))
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     words=[]
     if stem:
         words = Tokenize.byWordStem(text)
     else:
         words = Tokenize.byWordAlphaOnly(text)
     fd = Ngrams.getNgramFreqDist(words,n)
     topM = sorted([item for item in fd.items()],key=lambda x:x[1], reverse=True)[:m]
     vector = {}
     for i in range(len(topM)):
         vector["word#"+str(i)+" "+str(n)+"gramW"] = topM[i][0]
     return vector
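This and the similar feature functions below reference free variables (stem, n, m, and lengthFilter in later variants) that the snippets never define, which suggests they are built inside a factory and close over its arguments. A hypothetical sketch of that pattern:

def make_word_ngram_feature(n, m, stem=False):
    # Hypothetical factory; the real project presumably closes over n, m and stem in
    # a similar way before handing feature() to the feature-extraction pipeline.
    def feature(text):
        text = " ".join(text)
        words = Tokenize.byWordStem(text) if stem else Tokenize.byWordAlphaOnly(text)
        fd = Ngrams.getNgramFreqDist(words, n)
        topM = sorted(fd.items(), key=lambda x: x[1], reverse=True)[:m]
        return {"word#%d %dgramW" % (i, n): gram for i, (gram, _) in enumerate(topM)}
    return feature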
Example #10
 def answer(self, text):
     if not Tokenize.valid_exp_date(text):
         print("Enter a valid expiration date. Format mm/yy")
         return CardExpirationTimeState()
     order.creditCard.expirationDate = text
     print("What is the name of the holder of the Credit Card?")
     return CardholderState()
Example #11
 def answer(self, text):
     if not Tokenize.valid_card_number(text):
         print("Enter a valid credit card number.")
         return CreditCardNumberState()
     order.creditCard.number = text
     print("What is the expiration date of the Credit Card? Format mm/yy")
     return CardExpirationTimeState()
Example #12
 def answer(self, text):
     if not Tokenize.valid_cpf(text):
         print("Invalid CPF. Please enter again with only number.")
         return CPFState()
     order.cpf = text
     print("How do you want to pay? Credit card or bank ticket?")
     return PaymentMethodState()
Example #13
 def answer(self, text):
     if not Tokenize.valid_email(text):
         print("Invalid email. Please enter a valid email.")
         return UserEmailState()
     order.email = text
     print("What's your CPF? Enter only numbers.")
     return CPFState()
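The answer() methods in this group of checkout states each validate the incoming message, store it on the shared order, and return the state that should handle the next message. A hypothetical driver loop for that pattern (not part of the original snippets):

def run_conversation(initial_state):
    # Feed each user message to the current state and follow the state it returns;
    # assumes a terminal state eventually returns None.
    state = initial_state
    while state is not None:
        state = state.answer(input("> "))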
Example #14
 def execute(self, sourceCode):
     tokens = Tokenize.tokenize(sourceCode, 0)
     for status, exp in parse(tokens):
         if status != ParseError.OK:
             print "Error occurs:", status
             return
         return Eval.eval(exp, self.glob)[1]
Example #15
 def test_TokenizeKeyWords(self):
     wrong = ["some","of","these","words","are","not","keywords"]
     keywords = ["make","if","else","return","class","method"]
     input = wrong + keywords
     random.shuffle(input)
     output = [ word for word in input if Tokenize.TokenizeKeywords(word)]
     self.assertItemsEqual(output,keywords,"find the keywords")
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]    
     types = set(tokens)
     return {"vocabSize" :"HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"}
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]    
     types = set(tokens)
     return {"vocabSizeb" :len(types)}
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]    
     types = set(tokens)
     return {"type/tokenb" :"HIGH" if len(types)/len(tokens) > .5 else "MEDIUM" if len(types)/len(tokens) > .2 else "LOW"}
Example #19
 def test_TokenizeOperators(self):
     wrong = ["these","are","not","operators","#","$","_"]
     opperators = ["+","-","*","=","+=","-=",">","<","!=",">=","<="]
     input = wrong + opperators
     random.shuffle(input)
     output = [word for word in input if Tokenize.TokenizeOperators(word)]
     self.assertItemsEqual(output,opperators,"find the operators")
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     words = []
     if stem:
         words = Tokenize.byWordStem(text)
     else:
         words = Tokenize.byWordAlphaOnly(text)
     fd = Ngrams.getNgramFreqDist(words, n)
     topM = sorted([item for item in fd.items()],
                   key=lambda x: x[1],
                   reverse=True)[:m]
     vector = {}
     for i in range(len(topM)):
         vector["word#" + str(i) + " " + str(n) + "gramW"] = topM[i][0]
     return vector
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]
     types = set(tokens)
     return {"vocabSizeb": len(types)}
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]
     types = set(tokens)
     return {"type/token": int(100 * len(types) / len(tokens))}
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]    
     types = set(tokens)
     return {"type/token" : int(100*len(types)/len(tokens))}
Example #24
 def test_Digits(self):
     wrong = ["these","are","not","digits","0.0.0"]
     digits = ["0","-1","3","9.0",".9","100000000"]
     input = wrong + digits
     random.shuffle(input)
     output = [ word for word in input if Tokenize.TokenizeDigits(word) ]
     self.assertItemsEqual(output,digits,"find digits")
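Judging from the strings the test expects to accept ("-1", "9.0", ".9") and reject ("0.0.0"), TokenizeDigits allows an optional sign and at most one decimal point. A regular expression with that behaviour, offered as a sketch rather than the tested implementation:

import re

DIGIT_RE = re.compile(r"^-?(\d+(\.\d+)?|\.\d+)$")

def tokenize_digits(word):
    # Accept integers and decimals, optionally signed, with a single decimal point.
    return bool(DIGIT_RE.match(word))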
def avgWordLength(text):
    tokens = Tokenize.byWord(text)
    sum = 0
    count = 0
    for token in tokens:
        if token.isalpha():
            sum += len(token)
            count +=1
    return {"AVG word Length" : int(sum/count)}
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     fd = Ngrams.getNgramFreqDist(text,n)
     topM = sorted([item for item in fd.items()],key=lambda x:x[1], reverse=True)[:m]
     vector = {}
     for i in range(len(topM)):
         vector["char#"+str(i)+" "+str(n)+"gramC"] = topM[i][0]
     return vector
Example #27
 def test_TokenizeStrings(self):
     strings = ["\"string with spaces\"","\"stringWithNoSpaces\""]
     opperators = ["+","-","*","=","+=","-=",">","<","!=",">=","<="]
     keywords = ["make","if","else","return","class","method"]
     invalids = ["2er4",",sdf","@sd"]
     input = strings + opperators + keywords + invalids
     random.shuffle(input)
     output = [word for word in input if Tokenize.TokenizeStrings(word)]
     self.assertItemsEqual(output,strings,"find strings")
def vocabSizeBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter != None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {
        "vocabSize":
        "HIGH" if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"
    }
Example #29
    def test_getToken(self):
        text = "(cons 1 2)"
        result = Tokenize._extractToken(text, 0)

        self.assertIsNotNone(result)
        self.assertEqual(result[1], 1)
        self.assertTokenEqual(result[0], Token(Tokens.LPAREN, 0))

        """
def avgWordLength(text):
    tokens = Tokenize.byWord(text)
    sum = 0
    count = 0
    for token in tokens:
        if token.isalpha():
            sum += len(token)
            count += 1
    return {"AVG word Length": int(sum / count)}
Example #31
    def test_TokenizeIdentifiers(self):
        opperators = ["+","-","*","=","+=","-=",">","<","!=",">=","<="]
        keywords = ["make","if","else","return","class","method"]
        invalids = ["2er4",",sdf","@sd"]
        identifiers = ["x","y","count","total","r3","R2","totalMoney","i"]

        input = opperators + keywords + invalids + identifiers
        random.shuffle(input)
        output = [ word for word in input if Tokenize.TokenizeIdentifiers(word)]
        self.assertItemsEqual(output,identifiers,"find the indentifiers")
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
     fd = Ngrams.getNgramFreqDist(POStags,n)
     topM = sorted([item for item in fd.items()],key=lambda x:x[1], reverse=True)[:m]
     vector = {}
     for i in range(len(topM)):
         vector["pos#"+str(i)+" "+str(n)+"gram"] = topM[i][0]
     return vector
def avgWordLengthBucketed(text):
    tokens = Tokenize.byWordAlphaOnly(text)
    sum = 0
    count = 0
    for token in tokens:
        sum += len(token)
        count += 1
    numericValue = int(sum / count)
    bucketLabel = "Long" if numericValue > 6 else "Medium" if numericValue > 4 else "Short"
    return {"AVG word Length": bucketLabel}
def typeTokenRatioBucketed(text, lengthFilter=None):
    tokens = Tokenize.byWord(text)
    if lengthFilter != None:
        tokens = [token for token in tokens if len(token) >= lengthFilter]
    types = set(tokens)
    return {
        "type/token":
        "HIGH" if len(types) / len(tokens) > .5 else
        "MEDIUM" if len(types) / len(tokens) > .2 else "LOW"
    }
def avgWordLengthBucketed(text):
    tokens = Tokenize.byWordAlphaOnly(text)
    sum = 0
    count = 0
    for token in tokens:
        sum += len(token)
        count +=1
    numericValue = int(sum/count)
    bucketLabel = "Long" if numericValue > 6 else "Medium" if numericValue > 4 else "Short"
    return {"AVG word Length" : bucketLabel}
def avgWordLength(text):
    text = " ".join(text)
    tokens = Tokenize.byWordAlphaOnly(text)
    sum = 0
    count = 0
    tokens = list(set(tokens))
    for token in tokens:
        if token.isalpha():
            sum += len(token)
            count +=1
    return {"AVG word Length" : int(sum/count)}
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     fd = Ngrams.getNgramFreqDist(text, n)
     topM = sorted([item for item in fd.items()],
                   key=lambda x: x[1],
                   reverse=True)[:m]
     vector = {}
     for i in range(len(topM)):
         vector["char#" + str(i) + " " + str(n) + "gramC"] = topM[i][0]
     return vector
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]
     types = set(tokens)
     return {
         "vocabSize":
         "HIGH"
         if len(types) > 50 else "MEDIUM" if len(types) > 20 else "LOW"
     }
def percentOfUpperLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    uppers = 0
    total = 0    
    for c in text:
        if c.isupper():
            uppers +=1
        total += 1    
    percent = int(100*uppers/total)
    return {"percentUpperCase" : percent}
Example #40
    def test_single_expression(self):
        tokens = Tokenize.tokenize("(define size 4)", 0)
        expList = [makeKeyword(Tokens.DEFINE), makeSymbol("size"), makeNumber(4)]
        expected = makeList(expList)

        result = list(parse(tokens))

        self.assertTrue(all(code == ParseError.OK for code, _ in result))
        actual = [item for _, item in result]
        self.assertEqual(1, len(actual))
        self.assertEqual(expected, actual[0])
def avgWordLength(text):
    text = " ".join(text)
    tokens = Tokenize.byWordAlphaOnly(text)
    sum = 0
    count = 0
    tokens = list(set(tokens))
    for token in tokens:
        if token.isalpha():
            sum += len(token)
            count += 1
    return {"AVG word Length": int(sum / count)}
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     if lengthFilter != None:
         tokens = [token for token in tokens if len(token) >= lengthFilter]
     types = set(tokens)
     return {
         "type/tokenb":
         "HIGH" if len(types) / len(tokens) > .5 else
         "MEDIUM" if len(types) / len(tokens) > .2 else "LOW"
     }
    def __init__(self, inputFile, alpha, beta, KTopics, vocab):
        self.corpus = self.loadCorpus(inputFile)
        self.vocab = vocab
        self.alpha = alpha
        self.beta = beta
        self.K = KTopics

        for i, doc in enumerate(self.corpus):
            docTerms = Tokenize.tokenizeText(doc)
            self.vocab = self.vocab + docTerms
            self.corpus[i] = docTerms

        self.vocab = list(set(self.vocab))

        self.D = len(self.corpus)
        self.theta = np.zeros((self.D, self.K))  # count of words assigned to topic k in doc D / n_m_z / n_dk

        self.N = len(self.vocab)
        self.phi = np.zeros((self.K, self.N))  # word count of each word in topic K / n_zt / n_kw

        self.z_dn = []  # topics of every word in doc D
        self.wordCount_k = np.zeros(self.K)  # word count of each topic K
        '''
        Initialize a random topic to every word in every document and appropriately change the phi and theta matrices.
        '''
        for i, doc in enumerate(self.corpus):
            z_d = []
            for word in doc:
                z = np.random.randint(0, self.K)  # a random topic drawn from the list of K topics
                z_d.append(z)
                '''
                A word has been assigned to topic z in the current doc i. Update the Doc-topic distribution
                '''
                self.theta[i][z] += 1
                '''
                The word 'word' has been assigned to topic z. So update the topic-word dist 
                '''
                self.phi[z][(self.vocab).index(word)] += 1
                '''
                The word count of topic z has to be incremented.
                '''
                self.wordCount_k[z] += 1

            z_d = np.array(z_d)
            self.z_dn.append(z_d)

        self.Phi = np.zeros(self.phi.shape)
        self.Theta = np.zeros(self.theta.shape)
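After this random initialisation, collapsed Gibbs sampling for LDA normally turns the count matrices into smoothed posterior estimates. The standard formulas are sketched below; whether this class computes Theta and Phi in exactly this way is an assumption:

import numpy as np

def estimate_distributions(theta, phi, wordCount_k, alpha, beta):
    # theta: D x K document-topic counts, phi: K x N topic-word counts.
    doc_lengths = theta.sum(axis=1, keepdims=True)  # words per document
    Theta = (theta + alpha) / (doc_lengths + theta.shape[1] * alpha)
    Phi = (phi + beta) / (wordCount_k[:, None] + phi.shape[1] * beta)
    return Theta, Phi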
def percentOfUpperLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    uppers = 0
    total = 0
    for c in text:
        if c.isupper():
            uppers += 1
        total += 1
    percent = int(100 * uppers / total)
    return {"percentUpperCase": percent}
Example #45
    def test_keywords_expression(self):
        tokens = Tokenize.tokenize("define cond if else set! lambda", 0)

        expected = [makeKeyword(Tokens.DEFINE), makeKeyword(Tokens.COND),
                    makeKeyword(Tokens.IF), makeKeyword(Tokens.ELSE),
                    makeKeyword(Tokens.ASSIGNMENT), makeKeyword(Tokens.LAMBDA)]

        result = list(parse(tokens))

        self.assertTrue(all(code == ParseError.OK for code, _ in result))
        actual = [item for _, item in result]
        self.assertEqual(expected, actual)
Example #46
    def test_primitive_expression(self):
        tokens = Tokenize.tokenize('"I am a string" 1234 #t #f symbol null', 0)

        expected = [makeString('I am a string'), makeNumber(1234),
                    makeBoolean(True), makeBoolean(False),
                    makeSymbol("symbol"), makeNULL()]

        result = list(parse(tokens))

        self.assertTrue(all(code == ParseError.OK for code, _ in result))
        actual = [item for _, item in result]
        self.assertEqual(expected, actual)
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
     fd = Ngrams.getNgramFreqDist(POStags, n)
     topM = sorted([item for item in fd.items()],
                   key=lambda x: x[1],
                   reverse=True)[:m]
     vector = {}
     for i in range(len(topM)):
         vector["pos#" + str(i) + " " + str(n) + "gram"] = topM[i][0]
     return vector
 def feature(text):
     text = " ".join(text)
     tokens = Tokenize.byWord(text)
     fd = Ngrams.getNgramFreqDist(text,n)
     topM = sorted([item for item in fd.items()],key=lambda x:x[1],reverse=True)[:m]
     #print(topM)
     total = 0
     for p in topM:
         total += p[1]
     PDF = []
     for p in topM:
         PDF.append((p[0],p[1]/total))
     return dict(PDF[:m])
Example #49
 def answer(self, text):
     if not Tokenize.valid_conf_code(text):
         print("Enter a valid verification code")
         return VerificationCodeState()
     print("We are verifying your Credit Card information. Wait a moment.")
     time.sleep(3)
     print("We are all set. Your Credit Card is valid.")
     print("Let's see your order: ")
     for game in order.games:
         print(game['name'] + " price: " + str(game['price']))
     print("Total: " + str(order.total))
     print("Do you want to proceed with the order?")
     return ConfirmCreditCardState()
Example #50
    def test_multiple_expression(self):
        tokens = Tokenize.tokenize("(define size 4)(+ size 5)", 0)
        expListOne = [makeKeyword(Tokens.DEFINE), makeSymbol("size"), makeNumber(4)]
        expListTwo = [makeSymbol("+"), makeSymbol("size"), makeNumber(5)]
        expectedOne = makeList(expListOne)
        expectedTwo = makeList(expListTwo)

        result = list(parse(tokens))

        self.assertTrue(all(code == ParseError.OK for code, _ in result))
        actual = [item for _, item in result]
        self.assertEqual(2, len(actual))
        self.assertEqual(expectedOne, actual[0])
        self.assertEqual(expectedTwo, actual[1])
def percentOfLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    vector = {}
    total = 0
    for i in range(26):
        vector["pL"+chr(i + ord('a'))] = 0
    for c in text.lower():
        if "pL"+c in vector.keys():
            vector["pL"+c] +=1
            total += 1
    for i in range(26):
        vector["pL"+chr(i + ord('a'))] = int(100*(vector["pL"+chr(i + ord('a'))]/total))
    return vector
Example #52
def train_models(languages, pretrained_head=5000):
    pretrained_models = {}
    for language in languages:
        corpus = ''
        for i in range(1, 4):
            with open('Texts/{}/{}{}.txt'.format(language, language, i),
                      encoding='utf-8') as f:
                corpus += f.read().rstrip()
        ngrams = Tokenize.get_ngrams(corpus, ngram_size=3)
        pretrained = get_ngram_frequency(ngrams).head(pretrained_head)
        pretrained.to_csv('{}_{}.csv'.format(language, pretrained_head),
                          header=False)
        pretrained_models[language] = pretrained
    return pretrained_models
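get_ngram_frequency is not shown in this example, but the .head() and .to_csv(header=False) calls imply it returns a pandas object. A plausible implementation under that assumption:

import pandas as pd

def get_ngram_frequency(ngrams):
    # Count how often each n-gram occurs, most frequent first.
    return pd.Series(ngrams).value_counts()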
Example #53
def featureNumericScore(sample):
    words = Tokenize.byWord(sample)  
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount +=1
    #print("Raw score",score)
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    #rating = 5 if score > 2 else 4 if score > 1 else 3 if score > -2 else 2 if score > -3 else 1
    #print("Ours:", rating, "Score", score)
    return {"HS raw score" : score}
def posDist(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    possibleTags = PerceptronTagger().model.classes
    vector = {}
    total = 0
    for tag in possibleTags:
        vector[tag] = 0
    for tag in POStags:
        vector[tag] += 1
        total += 1
    for tag in possibleTags:
        vector[tag] = int(100 * vector[tag] / total)
    return vector
def percentOfLetters(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    vector = {}
    total = 0
    for i in range(26):
        vector["pL" + chr(i + ord('a'))] = 0
    for c in text.lower():
        if "pL" + c in vector.keys():
            vector["pL" + c] += 1
            total += 1
    for i in range(26):
        vector["pL" + chr(i + ord('a'))] = int(
            100 * (vector["pL" + chr(i + ord('a'))] / total))
    return vector
def posDist(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    possibleTags = PerceptronTagger().model.classes
    vector = {}
    total = 0
    for tag in possibleTags:
        vector[tag] = 0
    for tag in POStags:
        vector[tag] += 1
        total +=1
    for tag in possibleTags:
        vector[tag] = int(100*vector[tag]/total)
    return vector
Example #57
def featureBinaryScore(sample):
    words = Tokenize.byWord(sample)  
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount +=1
    #print("Raw score",score)
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    rating = "+" if score > 0 else "-"
    #print("Ours:", rating, "Score", score)
    return {"HS rating" : rating}
Example #58
def featureHitCountBucketed(sample):
    words = Tokenize.byWord(sample)  
    HSWords = loadHSWords()
    sentimentWordCount = 0
    score = 0
    for w in words:
        for s in HSWords:
            if w == s["word"]:
                score += s["score"]
                sentimentWordCount +=1
    #print("Raw score",score)
    score = int(score / (sentimentWordCount if sentimentWordCount > 0 else 1))
    #rating = 5 if score > 2 else 4 if score > 1 else 3 if score > -2 else 2 if score > -3 else 1
    #print("Ours:", rating, "Score", score)
    return {"HS hit count" : "HIGH" if sentimentWordCount > 8 else "MEDIUM" if sentimentWordCount > 4 else "LOW"}
def wordLengthDist(text):
    text = " ".join(text)
    words = Tokenize.byWordAlphaOnly(text)
    vector = {}
    total = 0
    for i in range(1,11):
        vector["%ofwords"+str(i)+"long"] = 0
    count = 0
    words = list(set(words))
    for word in words:
        if len(word) < 10:
            vector["%ofwords"+str(len(word))+"long"] += 1 
        else:
            vector["%ofwords"+str(10)+"long"] += 1
        total +=1
    for i in range(1,11):
        vector["%ofwords"+str(i)+"long"] = int(100*vector["%ofwords"+str(i)+"long"]/total)
    return vector
def textLength(text):
    return {"text Length" : len(Tokenize.byWord(text))}