import codecs
import math

import langlib

# `languages` (language name -> langlib.Language) and SMOOTHING_PARAM are
# module-level globals defined elsewhere in this file.


def train():
    """Training mode. Assumes all training texts have never been seen before."""
    for lang in languages:
        # Read in documents so that UTF-8 encoding is handled properly
        f = codecs.open('trainingTexts/' + languages[lang].fileName, encoding='utf-8')
        texts = f.read().split('====')  # Individual documents are separated by ====
        f.close()
        for text in texts:
            processed = langlib.processText(text)
            # Convert processed into a Document representing a bag of words,
            # counting every occurrence of each word (works whether or not
            # the token list is sorted)
            document = langlib.Document()
            for word in processed:
                if word in document.words:
                    document.words[word] += 1
                else:
                    document.words[word] = 1
            # Update the language's vocabulary and write to disk
            languages[lang].vocabUpdate(document)
            languages[lang].write()
            languages[lang].textCount += 1
            langlib.totalTextCount += 1
    # Update the language list file
    langlib.updateLangList(languages)
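# A minimal sketch, for reference only: the bag-of-words step above is
# equivalent to collections.Counter. The _bag_of_words name is hypothetical
# (not part of langlib), and it assumes Document.words is a plain dict.
def _bag_of_words(processed):
    from collections import Counter
    document = langlib.Document()
    document.words = dict(Counter(processed))  # word -> occurrence count
    return document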
def classify(fileName):
    """Classify mode. Identifies the language of the text specified in fileName."""
    f = codecs.open(fileName, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    processed = langlib.processText(text)
    # Convert processed into a Document representing a bag of words
    document = langlib.Document()
    for word in processed:
        if word in document.words:
            document.words[word] += 1
        else:
            document.words[word] = 1
    # Calculate the posterior probability of each language given the input:
    # P(lang|document) is proportional to P(lang) times the product over the
    # document's words of P(word|lang), with Laplace smoothing. The evidence
    # P(document) is the same constant divided out of every language's score,
    # so it is omitted; the unnormalized posteriors below are enough to
    # compare languages.
    langProbs = {}
    for lang in languages:
        # Prior P(lang), kept in scientific notation [significand, exponent]
        # because the products below get far too small for a plain float
        prior = languages[lang].textCount / float(langlib.totalTextCount)
        priorExponent = math.floor(math.log10(prior))
        priorSignificand = prior / (10 ** priorExponent)
        prior = [priorSignificand, priorExponent]
        # Product over the document's words of P(word|lang) using Laplace smoothing
        likelihood = [1, 0]  # Scientific notation [significand, exponent]
        # The smoothed denominator is the same for every word in a given language
        denominator = float(languages[lang].wordCount +
                            SMOOTHING_PARAM * languages[lang].uniqueWordCount)
        denomExponent = math.floor(math.log10(denominator))
        denomSignificand = denominator / (10 ** denomExponent)
        # document.words is a dict, so each distinct word in the input
        # contributes exactly once, regardless of how often it occurs
        for word in document.words:
            if word in languages[lang].vocab:
                probability = float(languages[lang].vocab[word] + SMOOTHING_PARAM)
            else:
                probability = float(SMOOTHING_PARAM)
            probExponent = math.floor(math.log10(probability))
            probSignificand = probability / (10 ** probExponent)
            # Divide by the denominator shared by all words in this language
            probSignificand /= denomSignificand
            probExponent -= denomExponent
            likelihood[0] *= probSignificand
            likelihood[1] += probExponent
        # Unnormalized posterior in scientific notation
        langProbs[lang] = [prior[0] * likelihood[0], prior[1] + likelihood[1]]
        # Renormalize so that 1 <= significand < 10
        normalizerExponent = math.floor(math.log10(langProbs[lang][0]))
        langProbs[lang][0] /= 10 ** normalizerExponent
        langProbs[lang][1] += normalizerExponent
    for lang in langProbs:
        print(lang, langProbs[lang])
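# For comparison, a sketch of the same computation in log space, the more
# common way to avoid underflow: since log10(s * 10**e) = log10(s) + e, the
# significand/exponent bookkeeping in classify() and a running log10 sum are
# equivalent. _log_posteriors is a hypothetical helper, assuming the same
# `languages`, `langlib`, and SMOOTHING_PARAM globals used above and that
# each language's vocab is a plain dict.
def _log_posteriors(document):
    logProbs = {}
    for lang in languages:
        # log10 of the prior P(lang)
        logProb = math.log10(languages[lang].textCount / float(langlib.totalTextCount))
        denominator = float(languages[lang].wordCount +
                            SMOOTHING_PARAM * languages[lang].uniqueWordCount)
        # As in classify(), each distinct word contributes once
        for word in document.words:
            count = languages[lang].vocab.get(word, 0)
            logProb += math.log10((count + SMOOTHING_PARAM) / denominator)
        logProbs[lang] = logProb  # log10 of the unnormalized posterior
    return logProbs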