def parsing(data, phrases):
    subjective = []
    objective = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    val = sent_detector.tokenize(data.strip())
    for sent in val:
        sub_tags = []
        obj_tags = []
        tok_sent = nltk.word_tokenize(sent)
        pos = nltk.pos_tag(tok_sent)
        sent_status = False
        for p in phrases:
            if p in sent:
                for tags in pos:
                    sub_tags.append(tags[1])
                subjective.append(sub_tags)
                #print p, "--", sent, "--", pos
                sent_status = True
                break
        if not sent_status:
            for tags in pos:
                obj_tags.append(tags[1])
            objective.append(obj_tags)
    return subjective, objective
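# A minimal usage sketch for parsing(); the sample text and phrase list below
# are made up, and it assumes `import nltk` plus the downloaded punkt and
# averaged_perceptron_tagger models.
text = "I really love this phone. The battery lasts two days."
opinion_phrases = ["love", "hate"]
sub_tag_seqs, obj_tag_seqs = parsing(text, opinion_phrases)
# sub_tag_seqs: one POS-tag sequence per sentence containing an opinion phrase
# obj_tag_seqs: one POS-tag sequence per remaining (objective) sentence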
def ToSentences(data, tokenizer, concept, stem, removeStopwords):
    data = tokenizer.tokenize(data.strip())
    sentences = []
    for s in data:
        if len(s) > 0:
            sentences.append(Tokenization(s, concept, stem, removeStopwords))
    return sentences
from collections import Counter
from nltk.util import ngrams


def checkGrams(data, phrases):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    val = sent_detector.tokenize(data.strip())
    matched_counts = []
    for sent in val:
        tokens = nltk.tokenize.word_tokenize(sent)
        tok_cnt = Counter(ngrams(tokens, 2))
        for p in phrases:
            if p in sent:
                # assumed completion: the original snippet is truncated after
                # this check; keep the bigram counts of matching sentences
                matched_counts.append(tok_cnt)
                break
    return matched_counts
def __text_to_sentences(self, data):
    raw_sentences = self.tokenizer.tokenize(data.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(self.__sentence_to_wordlist(raw_sentence))
    return sentences
def get_summary():
    '''
    Calculate degree centrality and thus the summary of the text
    output: valid_sentences, matrix, "sentence_links.txt"
    '''
    global sentenceList, n
    global matrix
    global sentence_similarities
    global valid_sentences
    global sentence_rank
    key_max = max(sentence_similarities.keys(), key=(lambda k: sentence_similarities[k]))
    key_min = min(sentence_similarities.keys(), key=(lambda k: sentence_similarities[k]))
    threshold = float(sentence_similarities[key_max] + sentence_similarities[key_min]) / float(2)
    # select the sentences with weights greater than the given threshold
    for i in sentence_similarities:
        if sentence_similarities[i] > threshold:
            nodes = i.split()
            valid_sentences.append(nodes)
    # fill the matrix with mappings according to similarities
    for i in range(n):
        for lis in valid_sentences:
            if str(i) in lis:
                if str(i) not in matrix:
                    matrix[str(i)] = []
                node = lis[(lis.index(str(i)) + 1) % 2]
                if node not in matrix[str(i)]:
                    matrix[str(i)].append(node)
    # write the matrix in the file 'sentence_links.txt'
    with open('sentence_links.txt', 'w') as f:
        for key in matrix:
            f.write(key)
            for i in matrix[key]:
                f.write(" " + i)
            f.write('\n')
    data = os.popen('python 5_pageRank.py sentence_links.txt 2').read()
    #print "\n","######## SUMMARY #######\n "
    a = data.strip().split('\n')
    d = a[2:]
    d.sort()
    rank_list = []
    summary = ''
    for i in d:
        summary += sentenceList[int(i)].capitalize()
        #print "\n"
    print summary
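# A toy illustration (made-up weights) of the thresholding step inside
# get_summary(): keys are "i j" sentence-index pairs, values are similarities.
toy_similarities = {'0 1': 0.9, '0 2': 0.2, '1 2': 0.6}
key_max = max(toy_similarities, key=toy_similarities.get)
key_min = min(toy_similarities, key=toy_similarities.get)
threshold = (toy_similarities[key_max] + toy_similarities[key_min]) / 2.0  # 0.55
kept = [k.split() for k in toy_similarities if toy_similarities[k] > threshold]
# kept contains ['0', '1'] and ['1', '2']: only edges whose weight exceeds the
# midpoint between the strongest and weakest similarity become graph links.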
def breakTextFile(path_to_file, path_to_test_group, min_len=150):
    # creating the directory
    if not os.path.exists(path_to_test_group):
        os.makedirs(path_to_test_group)
    with open(path_to_file, 'r') as myfile:
        data = myfile.read()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    current_sentence = ''
    test_count = 1
    for sentence in sent_detector.tokenize(data.strip()):
        current_sentence = current_sentence + ' ' + sentence
        if len(current_sentence) > min_len:
            # write to text file
            output_path = path_to_test_group + '/testcaseno' + str(test_count) + '.txt'
            with open(output_path, "w") as text_file:
                text_file.write(current_sentence[1:])
            test_count = test_count + 1
            current_sentence = ''
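# A minimal usage sketch; 'sample.txt' and 'sample_cases' are illustrative
# names, and this assumes os and nltk.data are imported and punkt is available.
with open('sample.txt', 'w') as demo:
    demo.write('This is the first sentence. Here is another one. And a third, '
               'slightly longer sentence that pushes the buffer past the limit.')
breakTextFile('sample.txt', 'sample_cases', min_len=80)
# -> writes sample_cases/testcaseno<N>.txt files, each holding consecutive
#    sentences until at least 80 characters have accumulated.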
def data_to_sentences(data, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    #print data
    raw_sentences = tokenizer.tokenize(data.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call data_to_wordlist to get a list of words
            sentences.append(data_to_wordlist(raw_sentence, remove_stopwords))
    #print sentences
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
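# A minimal usage sketch; data_to_wordlist is not shown above, so a trivial
# stand-in is defined here purely for illustration.
import nltk.data


def data_to_wordlist(raw_sentence, remove_stopwords=False):
    # stand-in for the real helper: lowercase and split on whitespace
    return raw_sentence.lower().split()


tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(data_to_sentences("First sentence here. Second one follows.", tokenizer))
# -> [['first', 'sentence', 'here.'], ['second', 'one', 'follows.']]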
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()


if __name__ == '__main__':
    folder = sys.argv[1]
    output = sys.argv[2]
    sentTotalCnt = 0
    for root, dirnames, filenames in os.walk(folder):
        for filename in filenames:
            fp = open(os.path.join(root, filename))
            data = fp.read()
            data = unicode(data.strip(), errors='ignore')
            data = clean_html(data)
            count = 0
            for sentence in tokenizer.tokenize(data):
                sentence = sentence.strip()
                directory = str(sentTotalCnt/10000).zfill(5)
                pathname = output + '/' + directory + '/' + filename + '.' + str(count)
                if not os.path.exists(os.path.dirname(pathname)):
                    try:
                        os.makedirs(os.path.dirname(pathname))
                    except OSError as exc:
                        # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise
                writeF = open(pathname, 'w')
                # sentence = unicode(sentence.strip(), errors='ignore')
                writeF.write(sentence.strip())
                writeF.close()
                # assumed continuation (the original snippet is truncated here):
                # advance the per-file and global sentence counters
                count += 1
                sentTotalCnt += 1
import nltk.data
import re
import string

with open('C://Users//jbjb//Documents//DATA//weird corpus//extras//poundearly.txt',
          'r', encoding='utf-8') as f:
    data = f.read().replace('\n', ' ')

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print('\n-----\n'.join(sent_detector.tokenize(data.strip())))
def _sentences(data):
    # Filter non-ascii
    data = filter(lambda x: x in printable, data)
    sentences = sent_detector.tokenize(data.strip())
    return sentences
import nltk.data

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

input_file_name = "input.txt"
output_file_name = "output.txt"

with open(input_file_name, 'r') as input_file:
    data = input_file.read().decode("utf-8").replace('\n', '')

# print '\n---\n'.join(sent_detector.tokenize(data.strip()))

output_file = open(output_file_name, 'w')
for item in sent_detector.tokenize(data.strip()):
    output_file.write("%s\n" % item.encode('utf-8'))

output_file.close()