import itertools

import nltk
import numpy as np

import etc  # project-local config module: special tokens and vocabulary size


def load_db():
    """Read the raw training file, tokenize it, and build index<->word mappings."""
    print("Reading raw data...")
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open('db_train', encoding='utf-8') as fp:
        data = fp.read()

    print("Tokenizing...")
    sentences = tokenizer.tokenize(data)
    # Wrap every sentence with explicit start/end tokens
    sentences = ["%s %s %s" % (etc.sentence_start_token, x, etc.sentence_end_token)
                 for x in sentences]
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

    print("Vocabulary building...")
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(etc.voca_size - 1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(etc.unknown_token)
    word_to_index = dict((w, i) for i, w in enumerate(index_to_word))

    # Replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [w if w in word_to_index else etc.unknown_token for w in sent]

    # Create the training data: y is x shifted by one word (next-word prediction)
    x_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
    y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)
    return x_train, y_train, index_to_word, word_to_index
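# Hedged usage sketch (not part of the original snippet): assumes the project-local
# `etc` module and a `db_train` file exist as load_db() above expects.
if __name__ == "__main__":
    x_train, y_train, index_to_word, word_to_index = load_db()
    print("vocabulary size: %d" % len(index_to_word))
    # y_train[i] is x_train[i] shifted left by one word (next-word prediction targets)
    print("first training example: %s" % " ".join(index_to_word[w] for w in x_train[0]))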
import nltk.data


def pull_sentences(filename):
    """Breaks an abstract into sentences."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(filename, encoding='utf-8') as fp:
        data = fp.read()
    return tokenizer.tokenize(data)
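# Hedged usage sketch: "abstract.txt" is a placeholder filename, not part of the
# original snippet; it only illustrates how pull_sentences() might be called.
for sentence in pull_sentences("abstract.txt"):
    print(sentence)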
import nltk.data


def docsplitter(Document):
    """Split a document file into a list of sentences."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(Document, encoding='utf-8') as fp:
        data = fp.read()
    SourceTextArray = tokenizer.tokenize(data)
    return SourceTextArray
from io import BytesIO


def downloadfile(url, debug=False):
    """Download a remote file and return its text; handles plain .txt and .pdf URLs.

    `download()` and `decodepdf()` are helpers defined elsewhere in this module.
    """
    data = download(url)
    try:
        if url.endswith('.txt'):
            return data.decode('utf-8')
        elif url.endswith('.pdf'):
            return decodepdf(BytesIO(data), debug=debug)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Any decoding failure is swallowed; the function falls through and returns None
        return None
import nltk.data


def sentenceSplitNLTK(inFile, outFile):
    """Sentence-split inFile with the NLTK punkt tokenizer, one sentence per output line."""
    print(">>> Tokenizing using NLTK... output: %s" % outFile)
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    with open(inFile, encoding='utf-8') as fin:
        data = fin.read()
    with open(outFile, 'w', encoding='utf-8') as fout:
        for sent in tokenizer.tokenize(data):
            # Flatten each sentence onto a single line and collapse double spaces
            fout.write('%s\n' % sent.replace('\n', ' ').replace('  ', ' '))
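# Hedged usage sketch: the file names are placeholders, not part of the original
# snippet; the call writes one sentence per line to the output file.
sentenceSplitNLTK("corpus_raw.txt", "corpus_sentences.txt")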
import sys

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the whole input file into a single space-joined string
data = ""
with open(sys.argv[1], encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        data = data + " " + line

# Print one sentence per line
final = '\n'.join(tokenizer.tokenize(data))
print(final)
# Sort sentences by probability, highest first
final_prob = prob_matrix[prob_matrix[:, 1].argsort()]
final_prob = final_prob[::-1]

######################################################################################
############################ Find average sentence length ############################
import nltk.data
from nltk.tokenize import RegexpTokenizer

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open("gess310.txt", encoding='utf-8') as fp:
    data = fp.read()

# Join sentences with a marker line, then count the markers to count the sentences
sentences = '\n-----\n'.join(tokenizer.tokenize(data))
tot_sentences = sentences.count("-----")

word_tokenizer = RegexpTokenizer(r'\w+')
words = word_tokenizer.tokenize(data)
tot_words = len(words)
avg_sent_length = tot_words / tot_sentences

#######################################################################################
############################# Find average word length ################################
##Average Sentence Length Calculation Module #BEGIN
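# Hedged sketch of the truncated "average word length" step above, reusing the
# `words` list already computed; the original computation is not shown, so this is
# only one plausible completion.
tot_chars = sum(len(word) for word in words)
avg_word_length = tot_chars / tot_words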
import csv

import nltk.data
from nltk.tokenize import TweetTokenizer
import pymorphy2

twtk = TweetTokenizer(preserve_case=False, strip_handles=True)
morph = pymorphy2.MorphAnalyzer(lang='uk')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

sentences = []
locations = set()

# Read the source text (open() already decodes it, so no extra .decode() is needed)
with open("../data/Боргардт_-_Аналітична_історія_України.txt", "r", encoding='utf-8') as file:
    data = file.read()
    sentences.extend(tokenizer.tokenize(data))

# Load the gazetteer of known location names
with open("../dictiponaries/locations_analytistoriya.txt", "r", encoding='utf-8') as file:
    locations.update([word.strip() for word in file.readlines()])

# For every sentence build a 0/1 string: one character per alphabetic token,
# 1 if its lemma is a known location, 0 otherwise
sent_labels = {}
for sent in sentences:
    label = ""
    for word in [morph.parse(w)[0].normal_form for w in twtk.tokenize(sent) if w.isalpha()]:
        if word.strip() in locations:
            label += "1"
        else:
            label += "0"
    sent_labels[sent] = label
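# Hedged follow-up sketch (not in the original): summarizes how many sentences got
# at least one location label, using the sent_labels mapping built above.
with_location = sum(1 for label in sent_labels.values() if "1" in label)
print("sentences mentioning a known location: %d / %d" % (with_location, len(sent_labels)))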