def sentiment_analysis(text, output_file=None):
    """Classify *text* as 'neg' or 'pos' with a Naive Bayes classifier
    trained on the NLTK movie_reviews corpus.

    The classifier is retrained from scratch on every call (first 3/4 of
    each class used for training); callers classifying many texts may
    want to cache it externally.

    :param text: string to classify
    :param output_file: optional destination passed through to write_tak
    :return: the predicted label, 'neg' or 'pos'
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    ## Floor division keeps the cutoffs integers under Python 3 as well as
    ## Python 2; true division would yield floats and break the slices below.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    ## NOTE(review): the original also built a held-out test split
    ## (negfeats[negcutoff:] + posfeats[poscutoff:]) but never used it,
    ## so it is omitted here.
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    classifier = NaiveBayesClassifier.train(trainfeats)

    feats = word_feats(word_tokenize(text))
    sentiment = classifier.classify(feats)
    write_tak(sentiment, output_file)
    return sentiment
## Example #2
## 0
def translate(txt, from_language, to_language, output_file=None):
    """Translate *txt* from one language to another by scraping Google
    Translate's unofficial ``translate_a/t`` endpoint.

    Language names are mapped to Google's abbreviations via the
    module-level ``languages`` dict; the process exits with an error
    message if either name is not found there.  The translated text is
    printed, written via write_tak, and returned as a UTF-8 encoded
    byte string.

    NOTE(review): only spaces are percent-encoded before the text is put
    into the URL, so other reserved characters in *txt* may break the
    request — confirm against real inputs.
    """
    from_language_abbrv = None
    to_language_abbrv = None
    sentences_unfiltered = list()
    sentences_list = list()
    ## Punctuation counts are compared later to decide how much of
    ## Google's response actually corresponds to the input text.
    original_punctuation = 0
    translated_punctuation = 0
    original_punctuation += txt.count('.')
    original_punctuation += txt.count('!')
    original_punctuation += txt.count('?')

    ## Google won't return results in our expected form without at least one punctuation mark. Strange...
    if original_punctuation == 0:
        txt += "."
        original_punctuation += 1

    ## Convert language name to Google's abbreviations
    for language, abbrv in languages.iteritems():
        if language == to_language:
            to_language_abbrv = abbrv
        if language == from_language:
            from_language_abbrv = abbrv
    if to_language_abbrv is None or from_language_abbrv is None:
        exit("Please enter a valid language. Check spelling!")

    ## Prepare the URL
    txt = txt.replace(" ", "%20")
    url = "http://translate.google.com/translate_a/t?client=t&sl=" + from_language_abbrv + "&tl=" + to_language_abbrv + "&hl=en&sc=1&ie=UTF-8&oe=UTF-8&ssel=0&tsel=0&q="
    url = url + txt
    ## A browser User-Agent is sent; presumably Google blocks the default
    ## requests UA — verify if the endpoint changes behavior.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5)'
    }
    ## Query Google and get back the unformatted JSON-like packets
    response = requests.get(url, headers=headers)
    unfiltered_tr = response.text

    ## Pull every ["..." , prefixed string out of the JSON-like payload;
    ## the first entries are the translated sentences, later matches are
    ## other metadata fields that the punctuation check below filters out.
    pattern = re.compile(r'(?<=\[\")(.*?)(?=\",)')
    for (sentence) in re.findall(pattern, unfiltered_tr):
        sentences_unfiltered.append(sentence)

    for i in range(0, len(sentences_unfiltered)):
        ## Get an easier variable to work with
        sentence = sentences_unfiltered[i]
        ## Google has a tendency to add leading spaces between punctuation marks
        ## This is a lazy fix. NLTK fix for the future possibly?
        sentence = sentence.replace(" .", ".")
        sentence = sentence.replace(" !", "!")
        sentence = sentence.replace(" ?", "?")
        ######################################################################
        ## Stop once we have collected as many sentence-ending marks as the
        ## original text had: everything after that is response metadata,
        ## not translation.
        if translated_punctuation >= original_punctuation:
            break
        else:
            ## Count appearances of punctuation marks in the translated text
            translated_punctuation += sentence.count('.')
            translated_punctuation += sentence.count('!')
            translated_punctuation += sentence.count('?')
            sentences_list.append(sentence)

    sentences = ''.join(sentences_list).encode('utf-8')
    print sentences
    write_tak(sentences, output_file)
    return sentences
def lower_case(txt, output_file=None):
    """Return *txt* converted to lower case (also written via write_tak)."""
    result = txt.lower()
    write_tak(result, output_file)
    return result
def format_html(txt, output_file=None):
    """Pretty-print the HTML in *txt*; returns a UTF-8 encoded byte string."""
    soup = BeautifulSoup(txt)
    pretty = soup.prettify().encode('utf-8')
    write_tak(pretty, output_file)
    return pretty
def character_count(txt, output_file=None):
    """Return the number of characters in *txt* (also written via write_tak)."""
    total = len(txt)
    write_tak(total, output_file)
    return total
def compress_text(txt, output_file=None):
    """Strip every newline out of *txt* and return the compressed text."""
    compressed = txt.replace('\n', '')
    write_tak(compressed, output_file)
    return compressed
def upper_case(txt, output_file=None):
    """Return *txt* converted to UPPER case (also written via write_tak).

    Bug fix: the original body called str.lower(), which made this
    function an exact duplicate of lower_case(); it now uppercases as
    its name promises.
    """
    uppered = txt.upper()
    write_tak(uppered, output_file)
    return uppered
def strip_tags(html, output_file=None):
    """Strip HTML markup from *html* using MLStripper and return the text.

    :param html: HTML string to clean
    :param output_file: optional destination passed through to write_tak
    :return: the tag-free text
    """
    stripper = MLStripper()
    stripper.feed(html)
    ## Call get_data() once and reuse the result: the original called it
    ## twice, redoing the work (and it would misbehave if get_data ever
    ## consumed its internal buffer).
    text = stripper.get_data()
    write_tak(text, output_file)
    return text