def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Preprocess hotel reviews and classify the truthfulness of each test review.
    :param file_contents: training CSV text (IsTruthFul,IsPositive,review rows)
    :param file_contents_test: test CSV text (IsTruthFul,review rows)
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
            continue  # skip malformed rows instead of reusing a stale value
        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])

    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))

    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        # Label as truthful (1) when the truthful bigram model is less "surprised" by the review.
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        #truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1

    import csv
    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count
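# The helpers process_prob() and perplexity() are defined elsewhere in the original
# module. As a rough, hypothetical sketch (not the original code) of the kind of
# add-one-smoothed bigram perplexity the comparison above relies on:
import math

from nltk import bigrams, word_tokenize


def bigram_perplexity(sentence, bigram_counts, unigram_counts, vocab_size):
    """Laplace-smoothed bigram perplexity; names and smoothing are assumptions."""
    tokens = word_tokenize(sentence.lower())
    log_prob, n = 0.0, 0
    for w1, w2 in bigrams(tokens):
        # P(w2 | w1) with add-one smoothing so unseen bigrams still get non-zero mass.
        p = (bigram_counts.get((w1, w2), 0) + 1.0) / (unigram_counts.get(w1, 0) + vocab_size)
        log_prob += math.log(p)
        n += 1
    # Perplexity is the exponential of the negative average log-probability.
    return math.exp(-log_prob / n) if n else float("inf")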
def compare(request):
    errors = []
    statistics = []
    stats = []
    for x in range(1, 3):
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        # note: statistics is never reset inside the loop, so the entries for both
        # cantos accumulate in the same list
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)
    return render_to_response('compare.html', {'stats': statistics})
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
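# The two Django views above do the corpus counting inline. A minimal standalone
# sketch of the same NLTK calls, outside Django ("canto1.txt" is a placeholder
# file name). Note that on Python 2 the views' plain "/" ratios are floored
# integer divisions; the import below makes them true divisions here.
from __future__ import division

from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import line_tokenize

corpus = PlaintextCorpusReader("./", "canto1.txt")
n_words = len(corpus.words())
n_unique = len(set(corpus.words()))
n_sents = len(corpus.sents())
n_lines = len(line_tokenize(corpus.raw()))
n_paras = len(corpus.paras())

print("Words:", n_words)
print("Unique words:", n_unique)
print("Sentences:", n_sents)
print("Lines:", n_lines)
print("Paragraphs:", n_paras)
print("Lexical density (total/unique):", n_words / n_unique)
print("Words per sentence:", n_words / n_sents)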
def parse_corpus(self, corpus_type):
    self.corpus_type = '%s.txt' % corpus_type
    self.corpus = line_tokenize(PlaintextCorpusReader(CORPUS_PATH, self.corpus_type).raw().lower())
    if corpus_type == 'institution':
        for line in range(len(self.corpus)):
            self.corpus[line] = self.corpus[line].split(',')
    return self.corpus
def entries(self, fileids=mwa_ppdb_xxxl_file):
    """
    :return: a list of synonym word pairs (as tuples).
    """
    return [
        tuple(line.split('\t')) for line in line_tokenize(self.raw(fileids))
    ]
def __init__(self, doc_dir): convertion_style = "" parse = Parser(join(ROOT, 'templates', 'event.xml')) self._template_metadata = parse.xml_template_metadata() page = self._template_metadata['page'] self._preparator = Preparator(doc_dir) self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style) self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc) self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ') self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')
def __init__(self, doc_dir): convertion_style = "-raw" self._eventextractor = EventExtractor(doc_dir) parse = Parser(join(ROOT, 'templates', 'periodic.xml')) self._template_metadata = parse.xml_template_metadata() page = self._template_metadata['page'] self._preparator = Preparator(doc_dir) self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style) self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc) self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
def load_topic_words(topic_file):
    """
    given a path to a .ts file, returns a dictionary of type { string : float }
    mapping topic words to their chi square scores
    """
    topic_words_dict = dict()
    with open(topic_file) as f:
        lines = line_tokenize(f.read())
    for line in lines:
        # no cutoff outside of TopicS 0.1
        pair = line.split(" ")
        topic_words_dict[pair[0]] = float(pair[1])
    return topic_words_dict
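# Hypothetical usage sketch for load_topic_words(): "docset.ts" is an assumed TopicS
# output file with one "word score" pair per line (e.g. "budget 45.2"), and the 10.0
# chi-square cutoff is an arbitrary illustration, not a recommended value.
topic_scores = load_topic_words("docset.ts")
strong_topic_words = sorted(w for w, score in topic_scores.items() if score > 10.0)
print(strong_topic_words)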
def __init__(self, doc_dir): convertion_style = "" parse = Parser(join(ROOT, 'templates', 'tcc.xml')) self._template_metadata = parse.xml_template_metadata() page = self._template_metadata['page'] pages = self._template_metadata['pages'] self._preparator = Preparator(doc_dir) self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style) self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1], convertion_style) self._linetokenized_onepage_raw_doc = open('%s.txt' %self._preparator.doc_dir).readlines() self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ') self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc) self._wordtokenized_onepage_doc = self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc) self.linebreak = "\n"
def words(self, lang=None, fileids=None, ignore_lines_startswith='#'):
    """
    Return a list of nonbreaking prefixes for the specified language(s).

    >>> from nltk.corpus import nonbreaking_prefixes as nbp
    >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
    True
    >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
    True

    :return: a list of words for the specified language(s).
    """
    # If *lang* is in the list of available languages, allocate the apt fileid.
    # Otherwise, the function returns non-breaking prefixes for
    # all languages when fileids==None.
    if lang in self.available_langs:
        lang = self.available_langs[lang]
        fileids = ['nonbreaking_prefix.' + lang]
    return [
        line
        for line in line_tokenize(self.raw(fileids))
        if not line.startswith(ignore_lines_startswith)
    ]
def main():
    reader = WordListCorpusReader(path, ['banbagsfb.txt'])
    pages = line_tokenize(reader.raw())
    thispage = pages[4]
    # thispage = thispage.raw()  # pages[] entries are already plain strings; str has no .raw()

    # The easiest way to deal with strings in Python that contain escape characters
    # and quotes is to triple double-quote the string (""") and prefix it with r.
    # For example:
    #
    #     my_str = r"""This string would "really "suck"" to write if I didn't know
    #     how to tell Python to parse it as "raw" text with the 'r' character and
    #     triple " quotes. Especially since I want \n to show up as a backslash
    #     followed by n. I don't want \0 to be the null byte either!"""
    #
    # The r means "take escape characters as literal". The triple double-quotes (""")
    # prevent single-quotes, double-quotes, and double double-quotes from prematurely
    # ending the string.

    m = re.search(r"(\d)", thispage)
    thisitem = m.group(0)
    m = re.search(r"(\d\d\D\d\d)", thispage)
    thisdate = m.group(0)

    starturl = thispage.find('http')
    endurl = thispage.find(' ', starturl) - 2
    thisurl = thispage[starturl:endurl]

    soup = BeautifulSoup(thispage)
    newpage = "".join(soup.findAll(text=True))  # join the text nodes into one string
    html = replace_all(newpage, reps)  # `reps` is a {old: new} replacement dict defined elsewhere
    html = html[11:len(html)]
    postdate = html[0:5]
    posttext = html[5:len(html)]
    print "post date = " + postdate
    print "post text = " + posttext


def replace_all(txt, reps):
    for i, j in reps.iteritems():
        txt = txt.replace(i, j)
    return txt


if __name__ == "__main__":
    main()
def words(self, fileids=None, ignore_lines_startswith='\n'):
    return [
        line
        for line in line_tokenize(self.raw(fileids))
        if not line.startswith(ignore_lines_startswith)
    ]
# print(file_path)
# print(content)
length = len(content) if content is not None else content
fbar.set_description(f'{file_path}: {length}')

# TYPE
if content is None:
    texts = ['']
elif text_type == 'full':
    texts = [content]
elif text_type == 'parablank':
    texts = []
    for p in blankline_tokenize(content):
        texts.append(p)
elif text_type == 'paraline':
    texts = []
    for p in line_tokenize(content):
        texts.append(p)
else:
    raise NotImplementedError(text_type)

# NORM
if norm_type == 'stem':
    texts = [
        ' '.join(
            snow.stem(x) for x in word_tokenize(y)
            if x.isalnum() and x.lower() not in stop)
        for y in texts
    ]
elif norm_type == 'lem':
    texts = [
        ' '.join(
            morph.parse(x)[0].normal_form for x in word_tokenize(y)
            if x.isalnum() and x.lower() not in stop)
        for y in texts
    ]
# Load Libraries
from nltk.tokenize import line_tokenize

sentence = "Peter Piper picked a peck of pickled peppers. A peck of pickled \
peppers, Peter Piper picked !!! If Peter Piper picked a peck of pickled \
peppers, Wheres the peck of pickled peppers Peter Piper picked ?"

sent_list = line_tokenize(sentence)
print "No sentences = %d" % (len(sent_list))
print "Sentences"
for sent in sent_list:
    print sent

# Include new line characters
sentence = "Peter Piper picked a peck of pickled peppers. A peck of pickled\n \
peppers, Peter Piper picked !!! If Peter Piper picked a peck of pickled\n \
peppers, Wheres the peck of pickled peppers Peter Piper picked ?"

sent_list = line_tokenize(sentence)
print "No sentences = %d" % (len(sent_list))
print "Sentences"
for sent in sent_list:
    print sent
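# For reference: line_tokenize() simply splits on newline characters (dropping blank
# lines by default), which is why the first string above comes back as a single
# "sentence" and only the second, with explicit \n characters, is split up.
# A minimal sketch of the blank-line handling, assuming NLTK's LineTokenizer defaults:
from nltk.tokenize import LineTokenizer, line_tokenize

text = "line one\n\nline three"
print(line_tokenize(text))                              # ['line one', 'line three']
print(LineTokenizer(blanklines='keep').tokenize(text))  # ['line one', '', 'line three']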
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()
"""

from nltk.tokenize import sent_tokenize, word_tokenize, line_tokenize

f = open("2012LTinsultsLKML.tsv.txt", "r")
all_text = f.read()
print(all_text)

# for i in word_tokenize(all_text):
#     print(i)
print(word_tokenize(all_text))

t_hash = "#"
for i in line_tokenize(all_text):
    if t_hash not in i:
        print(i)

# preprocessing the text to remove the unwanted part and making the whole text in lowercase
import re

def pre_process(text):
    # lowercase
    text = text.lower()

    # remove tags
    text = re.sub("<!--?.*?-->", "", text)
    """
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
import nltk
from nltk import word_tokenize
from nltk.tokenize import line_tokenize
from nltk.corpus import gutenberg
from nltk.model import build_vocabulary
from nltk.model import count_ngrams
from nltk.model import MLENgramModel
from nltk.model import LidstoneNgramModel

# load doc into memory
raw = open('datasets/WW_Dataset.txt', 'r').read()
print(raw[:75])

tokens = word_tokenize(raw)
print(len(tokens))

lines = line_tokenize(raw)
test_lines = lines[3:5]
test_words = [w for s in test_lines for w in s]
print(test_words[:5])

corpus = [w.lower() for w in tokens]
text = nltk.Text(tokens)
words = [w.lower() for w in tokens]
print(words[:10])

vocab = sorted(set(words))
print(len(vocab))

spl = int(95 * len(corpus) / 100)
train = text[:spl]
test = text[spl:]

vocab = build_vocabulary(2, words)
bigram_counts = count_ngrams(2, vocab, text)
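# The nltk.model imports above (build_vocabulary, count_ngrams, MLENgramModel,
# LidstoneNgramModel) come from an old experimental NLTK module that is not available
# in current NLTK releases. A rough sketch of equivalent bigram counting with the
# present-day nltk.lm API (not a drop-in port of the script above; 'the'/'river' are
# placeholder tokens):
from nltk import sent_tokenize, word_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

raw = open('datasets/WW_Dataset.txt', 'r').read()
tokenized = [word_tokenize(s.lower()) for s in sent_tokenize(raw)]

train_ngrams, vocab = padded_everygram_pipeline(2, tokenized)
lm = MLE(2)                  # maximum-likelihood bigram model
lm.fit(train_ngrams, vocab)

print(lm.counts[['the']]['river'])   # count of the bigram ('the', 'river')
print(lm.score('river', ['the']))    # P(river | the)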
def words(self, fileids=None):
    return line_tokenize(self.raw(fileids))
perplexity(file_contents_test, unigrams_probability_dict, bigrams_probability_dict)


def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Preprocess hotel reviews and classify the truthfulness of each test review.
    :param file_contents: training CSV text (IsTruthFul,IsPositive,review rows)
    :param file_contents_test: test CSV text (IsTruthFul,review rows)
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    #print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
            continue  # skip malformed rows instead of reusing a stale value
        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])