Example #1
import itertools

import nltk
import nltk.data
import numpy as np

# `etc` is assumed to be the project's settings module; it defines
# sentence_start_token, sentence_end_token, unknown_token and voca_size.
import etc


def load_db():
    print("Reading raw data...")
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open('db_train', encoding='utf-8') as fp:
        data = fp.read()

    print("Tokenizing...")
    sentences = tokenizer.tokenize(data)
    sentences = ["%s %s %s" % (etc.sentence_start_token, x, etc.sentence_end_token) for x in sentences]
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

    print("Building vocabulary...")
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

    # Keep the most common words and build the index_to_word / word_to_index mappings
    vocab = word_freq.most_common(etc.voca_size - 1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(etc.unknown_token)
    word_to_index = dict((w, i) for i, w in enumerate(index_to_word))

    # Replace all words not in the vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [w if w in word_to_index else etc.unknown_token for w in sent]

    # Create the training data: inputs drop the last token of each sentence,
    # targets drop the first, giving next-word prediction pairs
    x_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
    y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

    return x_train, y_train, index_to_word, word_to_index
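
The x_train/y_train construction at the end is the usual next-word language-model setup: for each sentence the input drops the last token and the target drops the first. A minimal illustration with a made-up sentence (the tokens and indices below are for illustration only, not from the data):

# Illustration only: one tokenized sentence with start/end markers.
sent = ["SENTENCE_START", "i", "like", "cats", "SENTENCE_END"]
word_to_index = {w: i for i, w in enumerate(sent)}

x = [word_to_index[w] for w in sent[:-1]]  # [0, 1, 2, 3]  inputs
y = [word_to_index[w] for w in sent[1:]]   # [1, 2, 3, 4]  next-word targets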
Example #2
import nltk.data


def pull_sentences(filename):
    """Break an abstract into sentences."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(filename, encoding='utf-8') as fp:
        data = fp.read()
    return tokenizer.tokenize(data)


def docsplitter(Document):
    # Same idea as pull_sentences: return the document as a list of sentences.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(Document, encoding='utf-8') as fp:
        data = fp.read()
    return tokenizer.tokenize(data)
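
A quick usage sketch; the filename abstract.txt is illustrative, not from the source:

# Hypothetical usage of pull_sentences on a plain-text abstract.
for i, sentence in enumerate(pull_sentences("abstract.txt"), start=1):
    print("%d: %s" % (i, sentence))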
Example #4
from io import BytesIO


def downloadfile(url, debug=False):
    # `download` and `decodepdf` are project-local helpers: `download` is
    # expected to return the raw response bytes, `decodepdf` to extract text
    # from a PDF stream.
    data = download(url)
    try:
        if url.endswith('.txt'):
            return data.decode('utf-8')
        elif url.endswith('.pdf'):
            return decodepdf(BytesIO(data), debug=debug)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Any other failure is swallowed and the caller receives None.
        return None
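
The helpers `download` and `decodepdf` are not shown in the snippet. A minimal sketch of what `download` might look like, assuming it simply returns the raw response bytes (an assumption, not the project's actual helper):

import urllib.request


def download(url):
    # Hypothetical stand-in: fetch the URL and return the raw response bytes.
    # The real project helper may add caching, retries, or custom headers.
    with urllib.request.urlopen(url) as response:
        return response.read()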
Example #5
def sentenceSplitNLTK(inFile, outFile):
    # Sentence-split a text file with NLTK's Punkt tokenizer,
    # writing one sentence per line to outFile.
    import nltk.data
    print(">>> Tokenizing using NLTK... output: %s" % outFile)
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    with open(inFile, encoding='utf-8') as fin:
        data = fin.read()

    with open(outFile, 'w', encoding='utf-8') as fout:
        for sentence in tokenizer.tokenize(data):
            fout.write('%s\n' % sentence.replace('\n', ' ').replace('  ', ' '))
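
A quick call sketch; the filenames are illustrative, not from the source:

# Hypothetical usage: write one sentence of corpus.txt per line to corpus.sent.
sentenceSplitNLTK('corpus.txt', 'corpus.sent')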
Example #7
# Closing lines of the preceding module: sort the probability matrix by its
# second column, highest first (prob_matrix is defined earlier in the full script).
final_prob = prob_matrix[prob_matrix[:, 1].argsort()]
final_prob = final_prob[::-1]

######################################################################################

############################ Find average sentence length ############################

import nltk.data
from nltk.tokenize import RegexpTokenizer

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open("gess310.txt", encoding='utf-8') as fp:
    data = fp.read()

sentence_list = tokenizer.tokenize(data)
sentences = '\n-----\n'.join(sentence_list)

# Joining N sentences yields only N-1 "-----" separators, so count the
# sentences directly instead of counting separators.
tot_sentences = len(sentence_list)

word_tokenizer = RegexpTokenizer(r'\w+')
words = word_tokenizer.tokenize(data)
tot_words = len(words)

avg_sent_length = tot_words / tot_sentences

#######################################################################################

############################# Find average word length ################################
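
The snippet breaks off at the average-word-length banner. A hedged sketch of how that step could continue, reusing the words list built above (this continuation is not in the source):

# Hypothetical continuation: average word length over the same RegexpTokenizer tokens.
tot_chars = sum(len(word) for word in words)
avg_word_length = tot_chars / tot_words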
Example #8
import sys

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the input file named on the command line into one string,
# replacing newlines with spaces.
data = ""
with open(sys.argv[1], encoding='utf-8') as f:
    for line in f:
        data = data + " " + line.rstrip('\n')

# Print one sentence per line.
print('\n'.join(tokenizer.tokenize(data)))
Example #9
import csv

import nltk.data
import pymorphy2
from nltk.tokenize import TweetTokenizer

twtk = TweetTokenizer(preserve_case=False, strip_handles=True)
morph = pymorphy2.MorphAnalyzer(lang='uk')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = []
locations = set()

# The file is opened as UTF-8 text, so no extra decode step is needed.
with open("../data/Боргардт_-_Аналітична_історія_України.txt", "r", encoding='utf-8') as file:
    data = file.read()
    sentences.extend(tokenizer.tokenize(data))

with open("../dictiponaries/locations_analytistoriya.txt", "r", encoding='utf-8') as file:
    locations.update([word.strip() for word in file.readlines()])

# For each sentence, build a 0/1 string marking which normalized tokens
# are known location names.
labels = {}

for sent in sentences:
    label = ""
    for word in [morph.parse(w)[0].normal_form for w in twtk.tokenize(sent) if w.isalpha()]:
        if word.strip() in locations:
            label += "1"
        else:
            label += "0"
    labels[sent] = label
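
csv is imported but never used in the snippet, so the labels are presumably written out afterwards. A minimal sketch of such a step (the output filename is an assumption, not from the source):

# Hypothetical continuation: dump the sentence/label pairs to CSV.
# "sentence_labels.csv" is an assumed output path.
with open("sentence_labels.csv", "w", newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    writer.writerow(["sentence", "label"])
    for sent, label in labels.items():
        writer.writerow([sent, label])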