def sentiment_analysis(text, output_file=None):
    """Classify *text* as 'neg' or 'pos' with a Naive Bayes classifier.

    Trains on the first 3/4 of the NLTK movie_reviews corpus (neg and pos
    file ids), classifies the tokenized input text, writes the label via
    write_tak, and returns it.

    :param text: plain-text string to classify
    :param output_file: optional destination passed through to write_tak
    :return: the predicted label, 'neg' or 'pos'
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    # Floor division keeps the cutoff an integer slice index under both
    # Python 2 and Python 3 (plain `/` yields a float in Py3 and would break
    # the slicing below).
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    # NOTE(review): the original also built a held-out test set
    # (negfeats[negcutoff:] + posfeats[poscutoff:]) but never used it, so it
    # is omitted here.
    classifier = NaiveBayesClassifier.train(trainfeats)
    feats = word_feats(word_tokenize(text))
    sentiment = classifier.classify(feats)
    write_tak(sentiment, output_file)
    return sentiment
def translate(txt, from_language, to_language, output_file=None):
    """Translate *txt* between two named languages via Google Translate.

    Scrapes the unofficial translate.google.com endpoint, reassembles the
    translated sentences, prints them, writes them via write_tak, and
    returns the UTF-8 encoded result.

    :param txt: text to translate
    :param from_language: full source-language name (a key of `languages`;
        presumably a module-level name->abbreviation dict — defined elsewhere)
    :param to_language: full target-language name (same convention)
    :param output_file: optional destination passed through to write_tak
    :return: translated text as a UTF-8 byte string
    """
    from_language_abbrv = None
    to_language_abbrv = None
    sentences_unfiltered = list()
    sentences_list = list()
    # Punctuation counts are used to decide how many translated sentences
    # to keep from Google's response.
    original_punctuation = 0
    translated_punctuation = 0
    original_punctuation += txt.count('.')
    original_punctuation += txt.count('!')
    original_punctuation += txt.count('?')
    ## Google won't return results in our expected form without at least one punctuation mark. Strange...
    if original_punctuation == 0:
        txt += "."
        original_punctuation += 1
    ## Convert language name to Google's abbreviations
    # Linear scan of the `languages` mapping (Python 2 iteritems).
    for language, abbrv in languages.iteritems():
        if language == to_language:
            to_language_abbrv = abbrv
        if language == from_language:
            from_language_abbrv = abbrv
    if to_language_abbrv is None or from_language_abbrv is None:
        # NOTE(review): exit() aborts the whole process on a bad language
        # name — harsh for a library function, kept as-is.
        exit("Please enter a valid language. Check spelling!")
    ## Prepare the URL
    # Minimal manual URL encoding: only spaces are escaped — TODO confirm
    # other reserved characters in `txt` never occur in practice.
    txt = txt.replace(" ", "%20")
    url = "http://translate.google.com/translate_a/t?client=t&sl=" + from_language_abbrv + "&tl=" + to_language_abbrv + "&hl=en&sc=1&ie=UTF-8&oe=UTF-8&ssel=0&tsel=0&q="
    url = url + txt
    # Browser-like User-Agent so Google serves the scrapeable response.
    headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5)' }
    ## Query Google and get back the unformatted JSON-like packets
    response = requests.get(url, headers=headers)
    unfiltered_tr = response.text
    # Pull every segment found between a leading `["` and a trailing `",`
    # in the JSON-like payload.
    pattern = re.compile(r'(?<=\[\")(.*?)(?=\",)')
    for (sentence) in re.findall(pattern, unfiltered_tr):
        sentences_unfiltered.append(sentence)
    for i in range(0, len(sentences_unfiltered)):
        ## Get an easier variable to work with
        sentence = sentences_unfiltered[i]
        ## Google has a tendency to add leading spaces between punctuation marks
        ## This is a lazy fix. NLTK fix for the future possibly?
        sentence = sentence.replace(" .", ".")
        sentence = sentence.replace(" !", "!")
        sentence = sentence.replace(" ?", "?")
        ######################################################################
        # Stop once we have gathered as many sentence-ending marks as the
        # original text contained; the remaining matches are presumably
        # echo/metadata segments from the payload.
        if translated_punctuation >= original_punctuation:
            break
        else:
            ## Count appearances of punctuation marks in the translated text
            translated_punctuation += sentence.count('.')
            translated_punctuation += sentence.count('!')
            translated_punctuation += sentence.count('?')
            sentences_list.append(sentence)
    sentences = ''.join(sentences_list).encode('utf-8')
    print sentences
    write_tak(sentences, output_file)
    return sentences
def lower_case(txt, output_file=None):
    """Lower-case *txt*, persist it via write_tak, and return it.

    :param txt: input string
    :param output_file: optional destination passed through to write_tak
    :return: *txt* converted to lower case
    """
    result = txt.lower()
    write_tak(result, output_file)
    return result
def format_html(txt, output_file=None):
    """Pretty-print an HTML string with BeautifulSoup.

    :param txt: raw HTML markup
    :param output_file: optional destination passed through to write_tak
    :return: the prettified markup, UTF-8 encoded
    """
    soup = BeautifulSoup(txt)
    pretty = soup.prettify().encode('utf-8')
    write_tak(pretty, output_file)
    return pretty
def character_count(txt, output_file=None):
    """Count the characters in *txt*.

    :param txt: input string
    :param output_file: optional destination passed through to write_tak
    :return: the length of *txt*
    """
    total = len(txt)
    write_tak(total, output_file)
    return total
def compress_text(txt, output_file=None):
    """Collapse *txt* onto a single line by deleting every newline.

    :param txt: input string
    :param output_file: optional destination passed through to write_tak
    :return: *txt* with all '\\n' characters removed
    """
    flattened = txt.replace('\n', '')
    write_tak(flattened, output_file)
    return flattened
def upper_case(txt, output_file=None):
    """Upper-case *txt*, persist it via write_tak, and return it.

    Bug fix: the original called str.lower(), so upper_case() actually
    returned lower-cased text (a copy-paste slip from lower_case).

    :param txt: input string
    :param output_file: optional destination passed through to write_tak
    :return: *txt* converted to upper case
    """
    uppered = txt.upper()
    write_tak(uppered, output_file)
    return uppered
def strip_tags(html, output_file=None):
    """Remove markup tags from *html*, returning only its text content.

    Feeds the markup to an MLStripper (an HTMLParser subclass defined
    elsewhere in this project), writes the stripped text via write_tak,
    and returns it.

    Improvement: the original called s.get_data() twice (once for
    write_tak, once for the return); the result is now fetched once and
    reused — assumes get_data() is a pure accessor over the parsed data.

    :param html: raw HTML markup
    :param output_file: optional destination passed through to write_tak
    :return: the tag-free text
    """
    stripper = MLStripper()
    stripper.feed(html)
    stripped = stripper.get_data()
    write_tak(stripped, output_file)
    return stripped