from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
def removeTag(data):
    # Replace any whitespace-delimited token that contains both '<' and '>'
    # (i.e. looks like a markup tag) with a sentence break.
    cleanData = ''
    for word in data.split():
        if '<' in word and '>' in word:
            word = '. '
        cleanData = cleanData + word + ' '
    return cleanData
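# A minimal usage sketch for removeTag (hypothetical input): a token such as
# '<b>world</b>' contains both '<' and '>', so it is replaced by '. '.
cleaned = removeTag('Hello <b>world</b> again')
print(cleaned)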
def process_tokenized(data):
    """Process tokenized data and remove the newlines."""
    newdata = ""
    data = data.split("\n")
    for line in data:
        if LINE_SEPARATOR not in line:
            newdata = "{}{}".format(newdata, line.replace("\n", ""))
        else:
            newdata = "\n\n{}\n{}\n".format(newdata, line)
    return newdata
def gloss_count_ratio(data, get_ratio=0):
    # allkeys is a module-level dict mapping glossary terms to their entries.
    term_count = 0
    word_count = 0
    for key in allkeys:
        if key in data:
            term_count += 1
    if get_ratio:
        word_count += len(data.split())
        return term_count, float(term_count) / word_count
    else:
        return term_count
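# A minimal usage sketch for gloss_count_ratio. The glossary below is
# hypothetical example data standing in for the module-level allkeys dict
# that the function expects:
allkeys = {'recursion': 'see glossary', 'stack': 'see glossary'}
print(gloss_count_ratio('The stack grows with each recursion level.'))
print(gloss_count_ratio('The stack grows with each recursion level.', get_ratio=1))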
def parse_text_to_sentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
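# A minimal usage sketch for parse_text_to_sentences, assuming the Punkt
# imports near the top of this file and an installed nltk. The abbreviation
# list keeps tokens like 'Dr.' and 'e.g.' from ending a sentence:
sample = 'Dr. Smith met Mr. Jones, e.g. for lunch. They talked for an hour.\nA new paragraph starts here.'
for sentence in parse_text_to_sentences(sample):
    print(sentence)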
def extract_labels(self, binary=True):
    """Load the labeled dataframe and extract the list of labeled subtitle ids."""
    data = self.key_to_labels.get_contents_as_string()
    valid_ratings = self.context.broadcast([u'R', u'PG-13', u'PG', u'G', u'NC-17'])
    labels_rdd = self.context.parallelize(data.split('\n')) \
        .filter(lambda line: line != '' and 'IDSubtitle' not in line) \
        .map(lambda line: (line.split(',')[0], line.split(',')[1])) \
        .filter(lambda pair: pair[1] in valid_ratings.value)
    if binary:
        # Collapse ratings into a binary R / NOT_R label.
        labels_rdd = labels_rdd.map(
            lambda pair: (pair[0], (pair[1] != 'R') * 'NOT_' + 'R'))
    return labels_rdd.sortByKey().cache()  # cached for lookups
def gloss_count_ratio(data, get_ratio=0):
    # allkeys and recalldict are module-level dicts: allkeys maps glossary
    # terms to their entries, recalldict counts how often each term is found.
    term_count = 0
    word_count = 0
    for key in allkeys:
        if key in data:
            # Found the term, so increase its count.
            recalldict[key] += 1
            term_count += 1
    if get_ratio:
        word_count += len(data.split())
        return float(term_count) / word_count
    else:
        return term_count
def prepare_text(author=None, data=None):
    # text_column = "text"
    # author_column = "author"
    # train_file = os.path.join(os.getcwd(), "my_train.csv")
    # test_file = os.path.join(os.getcwd(), "my_test.csv")
    # sentences = tokenize_text(data)
    sentences = data.split(" ")
    train_sentences = " ".join(sentences[:int(len(sentences) * 0.50)])
    test_sentences = " ".join(sentences[int(len(sentences) * 0.50):])
    train_res = [author] * 2
    test_res = [author] * 2
    return train_sentences, test_sentences, train_res, test_res
def handleonetxtfile(inpname):
    basename = os.path.basename(inpname).split('.')[0]
    basename_txt = basename + ".txt"
    outname = basename + "_sentence_converted.txt"
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(basename_txt) as fp:
        data = fp.read()
    paragraphs = [p for p in data.split('\n') if p]
    with open(outname, "a") as out:
        for paragraph in paragraphs:
            out.write('\n')
            out.write('\n'.join(tokenizer.tokenize(paragraph)))
            out.write('\n')
def splitIntoSentences2(file_name):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    with open(file_name) as fp:
        data = fp.read()
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    # print('\n-----\n'.join(sentences))
    return sentences
def Tokenization(data, concept, stem, removeStopwords):
    if not concept:
        data = BeautifulSoup(data, "html.parser").get_text()
    data = re.sub("\r\n", " ", data)
    data = re.sub("[^a-zA-Z0-9_]", " ", data)
    data = data.lower()
    words = data.split()
    if stem:
        # Stem token by token; applying the stemmer to the whole string at
        # once would only affect its final word.
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    if removeStopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
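# A minimal usage sketch for Tokenization, assuming BeautifulSoup (bs4),
# nltk.stem.PorterStemmer, and nltk.corpus.stopwords are imported as the
# function expects, and the stopwords corpus has been downloaded:
tokens = Tokenization('<p>The cats were running quickly!</p>',
                      concept=False, stem=True, removeStopwords=True)
print(tokens)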
def split_sentences_nltk(file):
    # Returns the topics separated into sentences, plus the topic titles.
    try:
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        nltk.download('punkt')  # needed only the first time
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    data = file.read()
    data_1 = data.split("changeTopicHere.")
    topics, data_1 = find_topic(data_1)
    topics_data = []
    for x in range(len(data_1)):
        topics_data.append("\n".join(tokenizer.tokenize(data_1[x])))
    return topics, topics_data
def openparsefile(self):
    self.clear()
    self.filename = tkFileDialog.askopenfilename(**self.file_opt)
    if not self.filename:
        return
    data = open(self.filename, 'r').read()
    self.initree = {}
    filtdata = ""
    self.ininame = {}
    for entry in data.split('\n\n'):
        if not entry:
            continue
        for line in entry.split("\n"):
            elements = line.split()
            if len(elements) < 4:
                continue
            if len(elements) == 5:
                lex_list = word_to_features(elements[0])
                fset = lex_search(lex_list, {}, self.alltrees)
                for cata in fset:
                    if elements[4] in fset[cata]:
                        tree = fset[cata][elements[4]]
                        tree.lexicalize()
                        tree._lex = False
                        self.initree[elements[0]] = tree
                        self.ininame[elements[0]] = elements[4]
                self.relmap[elements[0]] = elements[1]
                self.tagmap[elements[0]] = elements[3]
            filtdata += "\t".join(elements[:4])
            filtdata += '\n'
        filtdata += "\n\n"
    self.graphs = [DependencyGraph(entry)
                   for entry in filtdata.split('\n\n')[:-1] if entry]
    trees = [graph.tree() for graph in self.graphs]
    count = 0
    for t in trees:
        key = 'tree ' + str(count)
        self._import_trees[key] = t
        count += 1
    self.populate_tree('', self._import_trees)
    self._import_trees_selected = True
    self.init_selector(trees[0])
date = sys.argv[2]
article = 1
total_avg_polarity = 0
while article <= 5:
    file = stock + '/' + date + '_2016/' + str(article) + '.txt'
    try:
        fp = open(file)
        article += 1
        data = fp.read()
        # print sentences
        title = data.split('\n', 1)[0]
        content = data

        # Create a SummaryTool object
        st = SummaryTool()

        # Build the sentences dictionary
        sentences_dic = st.get_sentences_ranks(content)

        # Build the summary with the sentences dictionary
        sentences = st.get_summary(title, content, sentences_dic)
        # print sentences

        # instantiate senticnet
root = inp.getroot()
candidates = []
for child in root.iter():
    if child.tag == "P":
        child = tokenizer.tokenize(child.text)
        for i in child:
            candidates.append(i)

data = "Discuss conditions on American Indian reservations or among Native American communities. Include the benefits and drawbacks of the reservation system. Include legal privileges and problems."
print("Query Sentence")
print(data)
print("\n")
data = data.split(" ")
queryRel = []
for word in data:
    for i, j in enumerate(wn.synsets(word)):
        for l in j.lemmas():
            queryRel.append(l.name())
        # queryRel.append(l.lemma_names() for l in j.hypernyms())
        for l in j.hypernyms():
            for k in l.lemma_names():
                queryRel.append(k)
        for l in j.hyponyms():
            for k in l.lemma_names():
                queryRel.append(k)


def LLR(e):
def file_to_words(self):
    data = None
    with open(self.corpus_file) as corpus:
        data = corpus.read()
    words = data.split()
    return words
def get_raw_paragraph(fileid):
    # TODO: test if this works with the yahoo! corpus as well (encoding might differ)
    data = corpus.raw(fileid)
    return data.split(u"\r\n \r\n")
# print(x)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = f.read()
result = '\n------\n'.join(tokenizer.tokenize(data))
# print(result)
# data = f.read()
# for e in firstprocess:
#     # print(e)
#     # print(e.index('\n'))
#     s = e.split()
#     print(s)
#     break
allSentences = []
data2 = data.split()
newWord = False
upperCaseWord = False
lowerCaseWord = False
currentSentence = ""
for e in data2:
    if e.isupper():
        upperCaseWord = True
        if lowerCaseWord is True:
            newWord = True
            allSentences.append(currentSentence)
        currentSentence = currentSentence + ' ' + e
    if e.islower():
        if upperCaseWord is True:
            newWord = True
import nltk.data
import nltk
import language_check
import string
import re

tool = language_check.LanguageTool('en-US')
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

lines = []
fp = open("./data/dbt_handouts_pass1.txt")
data = fp.read()
data = data.replace("\n", ' ').replace('•• ', '').replace('*', '').replace('Other:', '')
data = re.sub(r'\d+', ' ', data)
data = re.sub(r'[^\x00-\x7f]', r'', data)
data = ' '.join(data.split())

# Keep only sentences with no grammar issues and at least 8 words.
for x in tokenizer.tokenize(data):
    matches = tool.check(x)
    words = x.split(' ')
    if len(matches) == 0 and len(words) >= 8:
        lines.append(x)

print('\n'.join(lines))

# Note: this writes every tokenized sentence, not just the filtered `lines` above.
f = open("./data/dbt_handouts_pass2.txt", "w")
f.write('\n'.join(tokenizer.tokenize(data)))
f.close()
num = 0
for fileName in list_of_files:
    s = []
    p = []
    r = []
    t = []
    q = []
    a = []
    fin = open(fileName, "r")
    data = fin.read()
    print(fileName, "check3")
    print(data)
    # inp = input("enter your opinion:")
    inp = input("press 1 for sports 2 for education 3 for entertainment 4 for politics 5 for business")
    print(inp)
    for line in data.split('\n'):
        # print(line)
        r.append(line)
        # print(type(s))
        s.append(word_tokenize(line))
        # print(word_tokenize(line))
    fin.close()  # closes file
    # print(s)
    for i in range(len(s)):
        r.append(i)
    # print(r)
    t = word_tokenize(str(r))
    print(t)
    p = nltk.corpus.stopwords.words('english')
    # print(p)
                continue
        except:
            continue
        try:
            if len(note.get_text().split()) == 1:
                footNote = note.attrs['n'] + ' ' + note.get_text()
                dic[note.attrs['n']] = note.get_text()
            if len(note.get_text().split()) > 1:
                footNote = note.attrs['n'] + ' ' + note.get_text()
                for word in footNote.split():
                    # if word[:5] == 'https':
                    if 'htt' in word:
                        data = data + word + ' '
                    if word.isdigit():
                        data = data + word + ' '
                for multi in data.split():
                    if multi.isdigit():
                        key = multi
                        continue
                    dic[key] = multi
            listXMLfile.append(filenameXML)
            outputCSV_writer.writerow([filenameXML, dic[key]])
            print(count)
        except:
            continue
# print(dic)
'''
# // extracting the sentence
cleanSentence = None
sentences = split_into_sentences(contents)
cmu = cmudict.dict()
# webster = loadWebster(webster)

################## open file ##################
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
fp = open("sample.txt")
# fp = open("collier.txt")
# fp = open("frost_woods.txt")
# fp = open("pope_windsor_forest.txt")
# fp = open("paradise_lost.txt")
# fp = open("sonnet_xv.txt")
# fp = open("thomson_seasons.txt")
data = fp.read()
data = data.split('\n')  # line breaking.

# exclude = set(string.punctuation)
exclude = set('!"#$%&()*+,./:;<=>?@[\\]^_`{|}~')
# exclude all string.punctuation except apostrophe (I think?)
# and hyphen (29 may 2013)
# note that i remove all of string.punctuation at the end of
# the replED function

lines = []  # to store the poem
for datum in data:
    '''
    This is really ugly, but, I needed to replace -'d endings with -ed
    endings, and the only place I could think of doing it was when first
    creating the lines. As this loop starts, apostrophes should be the only
    punctuation