# Ejemplo n.º 1 (extraction artifact from the source page; kept as a comment so the file parses)
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns

import LoadData
import PreProcessor


def sentence_to_wordlist(raw):
    """Split a raw sentence string into its whitespace-delimited word tokens.

    Uses str.split() with no argument, so runs of any whitespace act as a
    single separator and an empty/blank string yields an empty list.
    """
    return raw.split()



# Pipeline stage: load the raw corpus, preprocess it, sentence-tokenize it
# with NLTK's pre-trained punkt model, and build a list of per-sentence
# word-token lists for downstream use.
print('Getting Raw Data')
rawText = LoadData._getRawDataFromText()
rawPrepProcessedText = PreProcessor.preProcessData(rawText)
print('Got RawData -ProcessingData')

# Sentence-tokenize the preprocessed text.
# NOTE(review): requires the punkt model to be downloaded (nltk.download('punkt')) — confirm environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(rawPrepProcessedText)

# One word-token list per non-empty sentence (empty strings are falsy, so
# truthiness replaces the original len(...) > 0 check with identical effect).
sentences = [sentence_to_wordlist(raw_sentence) for raw_sentence in raw_sentences if raw_sentence]

# Diagnostic only: total token count across all sentences (generator avoids
# materializing a throwaway list inside sum()).
tokenCount = sum(len(sentence) for sentence in sentences)
print('Token Count----', tokenCount)