Example 1
def trigramModel(corpus):
    newcorpus = PlaintextCorpusReader(corpus, "nlp_project2_corpus.txt")

    newcorpus.raw("nlp_project2_corpus.txt")
    newcorpus.sents("nlp_project2_corpus.txt")
    enwords = newcorpus.words("nlp_project2_corpus.txt")
    entext = newcorpus.raw("nlp_project2_corpus.txt")
    entokens = nltk.word_tokenize(entext)
    # Applying trigram to sentence
    trigram = nltk.trigrams(entokens)

    trigrams_freq = nltk.FreqDist(trigram)
    ourTextArr2 = []
    counter = 0
    prob = 0
    trigramCounter = 0
    probBiGram = 0

    bigrams = nltk.bigrams(entokens)

    bigrams_freq = nltk.FreqDist(bigrams)

    ourTextArr = []
    bigramCounter = 0
    for i in bigrams_freq.most_common():
        bigramCounter += 1

    for i in trigrams_freq.most_common():
        trigramCounter += 1

    for i, j in trigrams_freq.most_common():

        if prob > 0.50:
            print("********PROBB****: ", prob)
        if (j > 0):

            for k, l in bigrams_freq.most_common():
                if (j > 2):
                    probBiGram += l / (bigramCounter / 10)

            prob += j / (trigramCounter / 10)
        prob = ((prob + probBiGram) - (prob * probBiGram)) / trigramCounter

        if prob > 0.45:
            str1 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[0])
            str2 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[1])
            str3 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[2])
            ourTextArr2.append(str1 + " " + str2 + " " + str3)
            if (len(ourTextArr2) > 200):
                break
    ourTextArr2 = list(set(ourTextArr2))
    finalText2 = ""
    counter3 = 0
    ourTextArr2.reverse()

    for i in range(len(ourTextArr2)):
        counter3 += 1
        finalText2 += " " + ourTextArr2[i]
    print(finalText2)
Example 2
def Get_Corpus(Text):
    #reading
    from nltk.corpus import PlaintextCorpusReader
    corpus_read = PlaintextCorpusReader('.', '.*\.txt')
    Part = corpus_read.raw(Text)
    Part_file = corpus_read.fileids()[0]
    Part_string = corpus_read.raw(Part_file)
    return Part_string
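
A note on the snippet above: Get_Corpus opens the whole reader but only returns the raw text of the first file it finds. A minimal sketch of the reader's basic accessors, assuming an illustrative local directory corpus/ containing a.txt and b.txt (these names are invented here, not from the source):

from nltk.corpus import PlaintextCorpusReader

reader = PlaintextCorpusReader('corpus', r'.*\.txt')
print(reader.fileids())            # e.g. ['a.txt', 'b.txt']
print(reader.raw('a.txt')[:80])    # raw text of one file
print(reader.words('a.txt')[:10])  # word tokens
print(reader.sents('a.txt')[:2])   # sentences as lists of words
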
Example 3
def train_computer_science(save):
    comp_sci_corpus = PlaintextCorpusReader('{}/corpus/computerscience/'
                                            .format(os.path.dirname(os.path.abspath(__file__))), '.*')

    comp_sci_chunker = Chunker('computerscience', comp_sci_corpus.raw('train.txt'))
    chunk_score = comp_sci_chunker.evaluate(comp_sci_corpus.raw('test.txt'))

    print_chunk_score(chunk_score)

    if save:
        comp_sci_chunker.save_chunker()
Example 4
def train_computer_science(save):
    comp_sci_corpus = PlaintextCorpusReader(
        '{}/corpus/computerscience/'.format(
            os.path.dirname(os.path.abspath(__file__))), '.*')

    comp_sci_chunker = Chunker('computerscience',
                               comp_sci_corpus.raw('train.txt'))
    chunk_score = comp_sci_chunker.evaluate(comp_sci_corpus.raw('test.txt'))

    print_chunk_score(chunk_score)

    if save:
        comp_sci_chunker.save_chunker()
Example 5
    def extractWordsOnly(self, article):
        templist = []
        listtextstring = []
        articlename = article + '.txt'
        #corpus_root = '/home/jesal/onedump/'
        wl = PlaintextCorpusReader(corpus_root, '.*')
        allwords = wl.words(fileids = articlename)
        exturllist = self.extractexternalURL(article)
        textstring = wl.raw(articlename)
        for item in exturllist:
            textstring = textstring.replace(item,' ')
    

        
        #templist = re.sub(r'[.!,;?]', ' ', textstring).split()
        templist = nltk.word_tokenize(textstring)
        listtemp = []
        for i in templist:
            j = re.sub('[^A-Za-z]+', '', i)
            listtemp.append(str(j))

        templistfinal = []
        templistfinal = self.removeEmpty(listtemp)
        return templistfinal
Example 6
 def tokenize_report_sents(self, report_of_the_time):
     re = ReportEnviroments()
     new_corpus_reports_fileids_list = PlaintextCorpusReader(re.original_reports_corpus_path, '.*')
     raw_text = new_corpus_reports_fileids_list.raw(report_of_the_time)
     sentencas_raw = sent_tokenize(raw_text)
     original_report_path = str(new_corpus_reports_fileids_list.abspath(report_of_the_time))
     return sentencas_raw, original_report_path, report_of_the_time
Example 7
def compare(request):
    errors = []
    stats = []
    for x in range(1, 3):
           cantoname = "canto" + str(x) + ".txt"
           w = PlaintextCorpusReader("./", cantoname)
           t = nltk.text.Text(w.words())
           l_lines = len(line_tokenize(w.raw()))
           l_uwords = len(set(w.words()))
           l_words = len(w.words())
           l_sents = len(w.sents())
           l_paras = len(w.paras())
           l_linperpara = l_lines / l_paras
           # build a fresh statistics list per canto so the two cantos do not share one object
           statistics = []
           statistics.append(x)
           statistics.append("Number of Words - " + str(l_words))
           statistics.append("Number of Unique Words - " + str(l_uwords))
           statistics.append("Number of Sentences - " + str(l_sents))
           statistics.append("Number of Lines - " + str(l_lines))
           statistics.append("Number of Paras - " + str(l_paras))
           statistics.append("Number of Lines/Paras - " + str(l_linperpara))
           lexical_density = l_words / l_uwords
           l_wordpersent = l_words / l_sents
           statistics.append("Lexical Density (Total/Uniq) words- " + str(lexical_density))
           statistics.append("Words per sentence - " + str(l_wordpersent))
           stats.append(statistics)

    return render_to_response('compare.html', {'stats': stats})
Example 8
 def extract_related_terms(self):
     re = ReportEnviroments()
     new_corpus_clusters_fileids_list = PlaintextCorpusReader(re.cluster_corpus_path, '.*')
     raw_text_list = []
     for i in range(len(new_corpus_clusters_fileids_list.fileids())):
         raw_text_list.extend([[new_corpus_clusters_fileids_list.raw(fileids=new_corpus_clusters_fileids_list.fileids()[i])]])
     return raw_text_list
Example 9
def diccionario_bigramLetras():
    # Read and transform the corpus
    corpus = PlaintextCorpusReader("Corpus", '.*')
    corpus = re.sub("([^a-zA-Záéíóúñ\n ])", "", corpus.raw().lower())
    corpus = re.sub("([\n])", " ", corpus)

    # Count how many times each pair of letters (including spaces) appears in the corpus
    bigrams = Counter(x + y for x, y in zip(*[corpus[i:] for i in range(2)]))

    dict_bigrams = {}
    for b in bigrams:
        # For each pair of letters, as long as neither is empty, translate the second letter to its numeric code
        if b[0] != " " and b[1] != " ":
            # b_tr will be the dictionary key
            b_tr = b[0] + traducciones.traduce_numerico(b[1])
            try:
                # If the key already exists, keep the entry with the higher frequency:
                # update it only when the stored frequency is lower than the new value's frequency
                if dict_bigrams[b_tr][1] < bigrams[b]:
                    dict_bigrams[b_tr] = [b, bigrams[b]]
            except KeyError:
                # If the key is not present yet, add it
                dict_bigrams[b_tr] = [b, bigrams[b]]

    return dict_bigrams
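
The zip(*[corpus[i:] for i in range(2)]) expression above slides a two-character window over the cleaned string. A tiny self-contained check (the toy string is invented purely for illustration):

from collections import Counter

corpus = "la sala"
bigrams = Counter(x + y for x, y in zip(*[corpus[i:] for i in range(2)]))
print(bigrams)  # Counter({'la': 2, 'a ': 1, ' s': 1, 'sa': 1, 'al': 1})
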
Example 10
def stats(request):
    errors = []
    statistics=[]
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
           cantoname = "canto"+q+".txt"
           w = PlaintextCorpusReader("./", cantoname)
           t = nltk.text.Text(w.words())
           l_lines=len(line_tokenize(w.raw()))
           l_uwords=len(set(w.words()))
           l_words=len(w.words())
           l_sents=len(w.sents())
           l_paras=len(w.paras())
           l_linperpara=l_lines/l_paras
           statistics.append("Number of Words - "+ str(l_words))
           statistics.append("Number of Unique Words - "+ str(l_uwords))
           statistics.append("Number of Setences - "+ str(l_sents))
           statistics.append("Number of Lines - "+ str(l_lines))
           statistics.append("Number of Paras - "+ str(l_paras))
           statistics.append("Number of Lines/Paras - "+ str(l_linperpara))
           lexical_density=l_words/l_uwords
           l_wordpersent = l_words/l_sents
           statistics.append("Lexical Density (Total/Uniq) words- "+ str(lexical_density))
           statistics.append("Words per sentence - "+ str(l_wordpersent))
           return render_to_response('stats.html', {'statistics':statistics})
    return render_to_response('stats.html', {'errors': errors})
Example 11
def diccionario_unigramLetras():
    # Read and transform the corpus
    corpus = PlaintextCorpusReader("Corpus", '.*')
    tokenizer = RegexpTokenizer(r'[a-zA-Záéíóúñ]+')
    tokens = tokenizer.tokenize(corpus.raw())

    frecuenciaLetras = {}
    diccionario = {}

    # Build a dictionary whose keys are letters and whose values are each letter's frequency in the corpus
    for lineas in tokens:
        for letra in lineas:
            if letra in list(frecuenciaLetras.keys()):
                count = frecuenciaLetras.get(letra)
                frecuenciaLetras[letra] = count + 1
            else:
                frecuenciaLetras[letra] = 1

    # Sort the dictionary entries by value, highest first; this returns a list of tuples
    frecuenciaLetras = sorted(frecuenciaLetras.items(), key=operator.itemgetter(1), reverse=1)

    # Walk over the tuples, translating the first element (the letter) to its numeric code
    for letra_frec in frecuenciaLetras:
        numerico = traducciones.traduce_numerico(letra_frec[0])
        numerico = int(numerico)

        # If the numeric code is already in the dictionary, append to its value; otherwise create the entry
        if numerico in list(diccionario.keys()):
            diccionario.get(numerico).append([letra_frec[0], letra_frec[1]])
        else:
            diccionario[numerico] = [[letra_frec[0], letra_frec[1]]]

    return diccionario
Example 12
def extractTopPOS(location, descriptionLocation):
    CEcorpus_root = location

    CEWordLists = PlaintextCorpusReader(CEcorpus_root, ".*\.txt")
    CE = CEWordLists.raw()  # read all texts, CE = string of all texts

    extractTxt = extractTxtArticle(CE)  #list of lines separated by \n
    #print "this is the CE",  CE
    if len(extractTxt) == 0:
        extractTxt = CE
    CE = "".join(extractTxt)  # CE now is a string with all lines of all texts

    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    CEsents = sent_tokenizer.tokenize(
        CE)  # CEsents= list of sentences(strings)

    listOfAllTags = getOurTagCorpus(CEsents)  # list of tuples (word, tag)

    allNouns = getNouns(listOfAllTags)  # allNouns= list of all nouns
    posSum = getPOS(allNouns)  #posSum= list of top stop-word free nouns

    nounsExceptNN_NC = getNounsExceptNN_NC(
        listOfAllTags)  # nounsExceptNN_NC = all nouns except NN/NC
    topNounsExceptNN_NC = getPOS(
        nounsExceptNN_NC
    )  # topNounsExceptNN_NC= list of top stop-word free nouns

    allVerbs = getVerbs(listOfAllTags)  # allVerbs = list of all verbs
    topVerbs = getPOS(allVerbs)  # topVerbs = list of top stop-word-free verbs
    topAdjs = getAdjs(listOfAllTags)
    print listOfAllTags
    #print "These are the list of top nouns in the ", descriptionLocation, '\n', len(posSum)

    #print "These are the list of top nouns in the class event corpus\n", len(posSum)

    print "These are the list of top verbs in the ", descriptionLocation, "Corpus \n", len(
        topVerbs)
    for x in topVerbs:
        print x

    print "These are the list of top adjectives in the class event corpus\n", len(
        topAdjs)
    for x in topAdjs:
        print x

    print 'Printing top (stop-word free) nouns '
    for x in posSum:
        print x

    print "These are the list of top nouns EXCEPT NN-NC in the", descriptionLocation, "\n", len(
        topNounsExceptNN_NC)

    for x in topNounsExceptNN_NC:
        print x

    print "These are the list of top adjectives in the class event corpus\n", len(
        topAdjs)
    for x in topAdjs:
        print x
Example 13
def loading_corpus():
    from nltk.corpus import PlaintextCorpusReader
    corpus_root = "wiki_corpus"
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    print(wordlists.fileids())
    print(wordlists.words())
    tokens = nltk.word_tokenize(wordlists.raw())
    print(tokens)
Example 14
def upload_corpus(root, name):
    corpora = PlaintextCorpusReader(root, [name])
    corpus_raw = corpora.raw(name)
    # if the raw text is unicode this byte-level round trip may fail
    # (the 'string_escape' codec exists only on Python 2)
    corpus_string = corpus_raw.encode('unicode-escape').decode('string_escape')
    corpus_text_norm = replace_newLine(corpus_string)
    return corpus_text_norm
Example 15
def GetTweets():
    corpusdir = 'DB/'

    newCorpus = PlaintextCorpusReader(corpusdir, '.*\.txt$') #Regex allows you to ignore .DS_Store

    pattern = '\r\n' #Regex accepts \r\n as the next line encoding in each 'tweet' in the database
    tweets = nltk.regexp_tokenize(newCorpus.raw(), pattern, gaps=True) #iterate through list, creating 'tweets'
    tweets = [x.lower() for x in tweets] #make all strings lowercase to make matching easier
    return tweets
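
With gaps=True, regexp_tokenize splits on the pattern instead of matching it, so every '\r\n'-terminated record becomes one "tweet". A small sketch on an inline string (the DB/ corpus itself is not reproduced here):

import nltk

raw = "First tweet\r\nSecond tweet\r\nThird tweet"
tweets = nltk.regexp_tokenize(raw, r'\r\n', gaps=True)
print(tweets)  # ['First tweet', 'Second tweet', 'Third tweet']
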
Example 16
def raw():
	"""
		Returns raw text of corpus
		
		>>> raw()[:54]
		'#                                                 DELM'
	"""		
	wordlists = PlaintextCorpusReader(bijankhan_root, bijankhan_fileid)
	return wordlists.raw(bijankhan_fileid)
Example 17
def process_nan():
    corpus_root = '../nan_samples/'
    library = PlaintextCorpusReader(corpus_root, '.*', encoding='utf-8')
    tokens = nltk.word_tokenize(library.raw())
    tokens = map(lambda x: process_element(x), tokens)
    nan_tokens=[]
    for i in tokens:
        nan_tokens+=i.split(' ')
    return nan_tokens
Example 18
	def raw(self, fileid):
		"""
			Returns raw text of fileid
			
			>>> hr.raw('1996/HAM2-960622.xml')[:38]
			'<?xml version="1.0" encoding="UTF-8"?>'
		"""		
		wordlists = PlaintextCorpusReader(self.hamshahri_root, fileid)
		return wordlists.raw(fileid)
Example 19
def read_text(path):
    if os.path.isfile(path) == True:
        txt = open(path, 'r').read()

    elif os.path.isdir(path) == True:
        filelists = PlaintextCorpusReader(path, '.*.mrg')
        txt = filelists.raw()

    return txt
Example 20
    def __init__(self, path):

        if os.path.isfile(path):
            fh = open(path, 'r')
            self.rawText = Text(word_tokenize(fh.read()))

        elif os.path.isdir(path):
            corpus = PlaintextCorpusReader(path, '.*.mrg')
            self.rawText = Text(nltk.word_tokenize(corpus.raw()))
Example 21
def get_all_text(file_path):
    all_file = PlaintextCorpusReader(file_path, '.*')
    all_file_list = all_file.fileids()
    all_text_list = []
    for i in range(len(all_file_list)):
        text = all_file.raw(all_file_list[i])
        # text normalization - replace '\r' and '\n'
        text_string_norm = replace_newLine(text)
        all_text_list = all_text_list + [text_string_norm]
    return all_text_list
Example 22
def unigramModel(corpus):
    print(corpus)

    # Creating a new corpus
    newcorpus = PlaintextCorpusReader(os.path.abspath(corpus),
                                      "nlp_project2_corpus.txt")
    newcorpus.raw("nlp_project2_corpus.txt")
    newcorpus.sents("nlp_project2_corpus.txt")
    # Getting Enwords from created corpus
    enwords = newcorpus.words("nlp_project2_corpus.txt")
    entext = newcorpus.raw("nlp_project2_corpus.txt")
    entokens = nltk.word_tokenize(entext)
    unigram = entokens
    enmodel = nltk.Text(word.lower() for word in enwords)
    unigram_freq = nltk.FreqDist(unigram)
    unigramCounter = 0
    ourTextArray = []
    unigramProb = 0

    #Creating unigram counter to find the total number of common words
    for l in unigram_freq.most_common():
        unigramCounter += 1
    # Iterating the most common sentences and appyling probabilities
    for i, j in unigram_freq.most_common():

        # Finding the probabilities
        unigramProb += (j / (unigramCounter)) / 10

        # The probability is higher than 0.80 then add that word into array
        if (unigramProb > 0.80):

            str1 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i)
            ourTextArray.append(str1)
            if (len(ourTextArray) > 200):
                break
    # Creating and printing a new 300 word text using the unigram language model
    ourTextArray = list(set(ourTextArray))
    finalTextim = ""
    for i in range(len(ourTextArray)):
        finalTextim += " " + ourTextArray[i]
    print(finalTextim)

    entagged = nltk.pos_tag(entokens)
Example 23
def READ_INT(parameters):
    "use the root and read files and make a list of that"
    corpus_root = parameters  # Mac users should leave out C:
    corpus = PlaintextCorpusReader(corpus_root, '.*txt')  #
    doc = pd.DataFrame(columns=['string_values'])
    for filename in corpus.fileids():
        value1 = corpus.raw(filename)
        doc = doc.append({'string_values': value1}, ignore_index=True)
    docs = doc.values.tolist()
    return [docs]
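
DataFrame.append, used in the loop above, was deprecated in pandas 1.4 and removed in 2.0. A sketch of the same function that collects the rows in a plain list first (same behaviour, assuming the directory layout the example expects):

import pandas as pd
from nltk.corpus import PlaintextCorpusReader

def READ_INT(parameters):
    """Read every text file under the given root and return the texts as a list."""
    corpus = PlaintextCorpusReader(parameters, '.*txt')
    rows = [{'string_values': corpus.raw(filename)} for filename in corpus.fileids()]
    doc = pd.DataFrame(rows, columns=['string_values'])
    return [doc.values.tolist()]
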
Example 24
 def __init__(self, path, name=None):
     """Takes a file path, which is assumed to point to a file or a directory, 
     extracts and stores the raw text and also stores an instance of nltk.text.Text."""
     self.name = name
     if os.path.isfile(path):
         self.raw = open(path).read()
     elif os.path.isdir(path):
         corpus = PlaintextCorpusReader(path, '.*.mrg')
         self.raw = corpus.raw()
     self.text = nltk.text.Text(nltk.word_tokenize(self.raw))
Example 25
def big_event_sentences():
	corpus_root = '../Brazil_NightClub_Fire/'

	wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')

	BigEvent = wordlists.raw()

	sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')

	BigEventSentences = sent_tokenizer.tokenize(BigEvent)
	return BigEventSentences
Example 26
def upload_corpus(root):
    norm_text = []
    text_len = []
    corpora = PlaintextCorpusReader(root, '.*')
    all_text = corpora.fileids()
    for i in range(len(all_text)):
        text_raw = corpora.raw(all_text[i])
        token = token_norm(text_raw)
        norm_text = norm_text + [token]
        text_len = text_len + [len(token)]
    return all_text, norm_text, text_len
Example 27
def small_event_sentences():
	corpus_root = '../Texas_Wild_Fire/'

	wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')

	SmallEvent = wordlists.raw()

	sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')

	SmallEventSentences = sent_tokenizer.tokenize(SmallEvent)
	return SmallEventSentences
Example 28
def read_corpus(corpus_root):
    corpus = PlaintextCorpusReader(corpus_root, '.*')

    corpus_titles = []
    corpus_docs = []

    for titles in corpus.fileids():
        corpus_titles.append(titles)
        corpus_docs.append(corpus.raw(titles))

    return corpus_docs
Example 29
def read_text(path):
    """Takes a file path, which is assumed to point to a file or a directory,
    and returns a Text instance."""
    if os.path.isfile(path):
        with open(path) as fh:
            return Text(nltk.word_tokenize(fh.read()))
    elif os.path.isdir(path):
        # restrict to files with the mrg extension, avoiding hidden files like .DS_Store
        # that can cause trouble
        corpus = PlaintextCorpusReader(path, '.*.mrg')
        return Text(nltk.word_tokenize(corpus.raw()))
Example 30
def class_event_sentences():
	corpus_root = '../Islip13Rain/'

	wordlists = PlaintextCorpusReader(corpus_root, ".*\.txt")

	ClassEvent = wordlists.raw()

	sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')

	ClassEventSentences = sent_tokenizer.tokenize(ClassEvent)
	return ClassEventSentences
Example 31
 def extractexternalURL(self, article):
     #corpus_root = '/home/jesal/onedump/'
     wl = PlaintextCorpusReader(corpus_root, '.*')
     #tempww = wl.words(fileids = article)
     articlename = article + '.txt'
     rawopen = wl.raw(articlename)
     lines = rawopen.splitlines()
     txt = rawopen
     listfinal = []
     #rg = re.compile('http..(?:\\/[\\w\\.\\-]+)+',re.IGNORECASE|re.DOTALL)
     listfinal = re.findall(rg, rawopen)
     return listfinal
Example 32
def carga():
    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs=db.SIMILITUD

    completo=[]
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    result={}
    for fileid in newcorpus.fileids():
        for file2 in newcorpus.fileids():
            result= {"f1": fileid, "f2":file2, "value": compare_texts(newcorpus.raw(fileid), newcorpus.raw(file2))}
            docs.insert_one(result).inserted_id
Example 33
 def extractexternalURL(self, article):
     #corpus_root = '/home/jesal/onedump/'
     wl = PlaintextCorpusReader(corpus_root, '.*')
     #tempww = wl.words(fileids = article)
     articlename = article + '.txt'
     rawopen = wl.raw(articlename)
     lines = rawopen.splitlines()
     txt = rawopen
     listfinal = []
     #rg = re.compile('http..(?:\\/[\\w\\.\\-]+)+',re.IGNORECASE|re.DOTALL)
     listfinal = re.findall(rg,rawopen)
     return listfinal
Example 34
def bigramModel(corpus):
    newcorpus = PlaintextCorpusReader(corpus, "nlp_project2_corpus.txt")

    newcorpus.raw("nlp_project2_corpus.txt")
    newcorpus.sents("nlp_project2_corpus.txt")
    enwords = newcorpus.words("nlp_project2_corpus.txt")
    entext = newcorpus.raw("nlp_project2_corpus.txt")
    entokens = nltk.word_tokenize(entext)
    # Applying bigram to sentence
    bigrams = nltk.bigrams(entokens)

    # With FreqDist we look at word frequency
    bigrams_freq = nltk.FreqDist(bigrams)

    ourTextArr = []
    bigramCounter = 0
    for i in bigrams_freq.most_common():
        bigramCounter += 1

    prob = 0
    for i, j in bigrams_freq.most_common():

        if (j > 2):

            prob += j / (bigramCounter / 10)
        if prob > 0.9:
            str1 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[0])
            str2 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[1])
            ourTextArr.append(str1 + " " + str2)
            if len(ourTextArr) > 200:
                break
    ourTextArr = list(set(ourTextArr))

    finalText = ""

    ourTextArr.reverse()
    # Creating and printing a new 300 word text using the bigram language model
    for i in range(len(ourTextArr)):
        finalText += " " + ourTextArr[i]
    print(finalText)
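
The core of the model above is nltk.bigrams followed by FreqDist. A toy check on one sentence shows the ((word, word), count) pairs the loop iterates over (the sentence is invented for illustration):

import nltk

tokens = nltk.word_tokenize("the cat sat on the cat mat")
bigrams_freq = nltk.FreqDist(nltk.bigrams(tokens))
print(bigrams_freq.most_common(3))
# [(('the', 'cat'), 2), (('cat', 'sat'), 1), (('sat', 'on'), 1)]
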
Example 35
def extractTopPOS(location, descriptionLocation):
    CEcorpus_root = location

    CEWordLists = PlaintextCorpusReader(CEcorpus_root, ".*\.txt")
    CE = CEWordLists.raw()  # read all texts, CE = string of all texts

    extractTxt = extractTxtArticle(CE)  #list of lines separated by \n
    #print "this is the CE",  CE
    if len(extractTxt) == 0:
        extractTxt = CE
    CE = "".join(extractTxt)  # CE now is a string with all lines of all texts

    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    CEsents = sent_tokenizer.tokenize(CE)  # CEsents= list of sentences(strings)

    listOfAllTags = getOurTagCorpus(CEsents)  # list of tuples (word, tag)

    allNouns = getNouns(listOfAllTags)  # allNouns= list of all nouns
    posSum = getPOS(allNouns)  #posSum= list of top stop-word free nouns

    nounsExceptNN_NC = getNounsExceptNN_NC(listOfAllTags)  # nounsExceptNN_NC = all nouns except NN/NC
    topNounsExceptNN_NC = getPOS(nounsExceptNN_NC)  # topNounsExceptNN_NC= list of top stop-word free nouns

    allVerbs = getVerbs(listOfAllTags)  # allVerbs = list of all verbs
    topVerbs = getPOS(allVerbs)  # topVerbs = list of top stop-word-free verbs
    topAdjs = getAdjs(listOfAllTags)
    print listOfAllTags
    #print "These are the list of top nouns in the ", descriptionLocation, '\n', len(posSum)

    #print "These are the list of top nouns in the class event corpus\n", len(posSum)

    print "These are the list of top verbs in the ", descriptionLocation,  "Corpus \n", len(topVerbs)
    for x in topVerbs:
        print x

    print "These are the list of top adjectives in the class event corpus\n", len(topAdjs)
    for x in topAdjs:
        print x


    print 'Printing top (stop-word free) nouns '
    for x in posSum:
        print x

    print "These are the list of top nouns EXCEPT NN-NC in the", descriptionLocation,  "\n", len(topNounsExceptNN_NC)

    for x in topNounsExceptNN_NC:
        print x

    print "These are the list of top adjectives in the class event corpus\n", len(topAdjs)
    for x in topAdjs:
        print x
Example 36
    def __init__(self, path):
        if os.path.isfile(path):
            with open(path) as fh:
                self.raw_string = fh.read()

                self.tokens = nltk.word_tokenize(self.raw_string)

        elif os.path.isdir(path):
            # restrict to files with the mrg extension, avoiding hidden files like .DS_Store
            # that can cause trouble
            corpus = PlaintextCorpusReader(path, '.*.mrg')
            self.raw_string = corpus.raw()
            self.tokens = nltk.word_tokenize(self.raw_string)
Example 37
def GetTweets():
    corpusdir = 'DB/'

    newCorpus = PlaintextCorpusReader(
        corpusdir, '.*\.txt$')  #Regex allows you to ignore .DS_Store

    pattern = '\r\n'  #Regex accepts \r\n as the next line encoding in each 'tweet' in the database
    tweets = nltk.regexp_tokenize(
        newCorpus.raw(), pattern,
        gaps=True)  #iterate through list, creating 'tweets'
    tweets = [x.lower() for x in tweets
              ]  #make all strings lowercase to make matching easier
    return tweets
Example 38
 def __init__(self, path, name=None):
     self.name = name
     if os.path.isfile(path):
         self.raw = open(path).read()
     elif os.path.isdir(path):
         corpus = PlaintextCorpusReader(path, '.*.mrg')
         self.raw = corpus.raw()
     self.text = nltk.text.Text(nltk.word_tokenize(self.raw))
     self.tagged_words = nltk.pos_tag(self.text)
     self.cfd = nltk.ConditionalFreqDist(
         (w.lower(), tag) for (w, tag) in self.tagged_words)
     self.pos_tags = [val for key, val in self.tagged_words]
     self.words = self.cfd.conditions()
Example 39
def tokenizeWords(corpus_root):
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    tokenizer = RegexpTokenizer(r'\w+')
    # for fileid in wordlists.fileids():
    #     sentimentText=wordlists.raw(fileid).lower()
    #     tokenizedWords=tokenizer.tokenize(sentimentText)
    #     tokenizedTextWithoutStopWords=removeAllStopWords(tokenizedWords)
    #
    #     print(tokenizedTextWithoutStopWords)
    #     if "positive" in corpus_root:
    #         print("positive documents")
    #         #posfeats.update(word_feats(tokenizedTextWithoutStopWords),'pos')
    #         #posfeats =posfeats+[word_feats(tokenizedTextWithoutStopWords), 'pos']
    #         posfeats[word_feats(tokenizedTextWithoutStopWords)]='pos'
    #
    #     if "negative" in corpus_root:
    #         negfeats.update(word_feats(tokenizedTextWithoutStopWords),'neg')
    if "negative" in corpus_root:
        negfeats = [(word_feats(removeAllStopWords(tokenizer.tokenize(wordlists.raw(f).lower()))), 'neg') for f in wordlists.fileids()]
    if "positive" in corpus_root:
        posfeats = [(word_feats(removeAllStopWords(tokenizer.tokenize(wordlists.raw(f).lower()))), 'pos') for f in wordlists.fileids()]
        print(posfeats)
Example 40
class Documents:
    def __init__(self, root):
        self.files = PlaintextCorpusReader(root, '.*\.txt')
        self.posting = {}
        self.idf = {}
        self.file_length = {}
        self.file_id_names = {}
        self.N = len(self.files.fileids())

    def process(self):
        for idx, file in enumerate(self.files.fileids()):
            print idx
            filename = file.strip('.txt')
            self.file_id_names[idx] = filename
            text = self.files.raw(file)
            words = process(text)
            if settings['phrase_query']:
                raw_words = raw_process(text)
            if words.values():
                self.file_length[idx] = normalization(words.values())
            for word, freq in words.iteritems():
                if self.idf.has_key(word):
                    self.idf[word] += 1
                else:
                    self.idf[word]  = 1

                if not self.posting.has_key(word):
                    self.posting[word] = {}
                if settings['phrase_query']:
                    self.posting[word][idx] = indices(raw_words, word)
                else:
                    self.posting[word][idx] = freq
        for word, idf in self.idf.iteritems():
            self.posting[word]['idf'] = idf

    def dump(self):
        posting_pickle = open('posting.pkl', 'wb')
        for term, value in self.posting.iteritems():
          self.posting[term] = str(value)
        pickle.dump(self.posting, posting_pickle, 2)
        posting_pickle.close()

        length_pickle = open('file_length.pkl', 'wb')
        pickle.dump(self.file_length, length_pickle, 2)
        length_pickle.close()

        file_ids_pickle = open('file_ids.pkl', 'wb')
        pickle.dump(self.file_id_names, file_ids_pickle, 2)
        file_ids_pickle.close()
Example 41
	def raw(self, fileids=None, sections=None):
		"""
		Returns the raw string data stored in the specified sections/files
		
		:param fileids: single fileid or list of fileids
		:type fileids: single str or list of str
		
		:param sections: single section or list of sections
		:type sections: single str or list of str
		
		:return: conglomeration of raw text from all fileids and sections
		:rtype: str
		"""
		return PlaintextCorpusReader.raw(
			self, self._resolve(fileids, sections))
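
The override above delegates to a _resolve helper that is not shown in the snippet. A hypothetical, minimal sketch of how such a sectioned reader could work; the class name, the SECTIONS mapping, and the file names are all invented here for illustration, not taken from the source:

from nltk.corpus import PlaintextCorpusReader

class SectionedCorpusReader(PlaintextCorpusReader):
    # hypothetical mapping from section names to the files that make them up
    SECTIONS = {'intro': ['intro.txt'], 'body': ['chapter1.txt', 'chapter2.txt']}

    def _resolve(self, fileids, sections):
        # explicit fileids win; otherwise expand section names; otherwise use everything
        if fileids is not None:
            return fileids
        if sections is not None:
            if isinstance(sections, str):
                sections = [sections]
            return [f for s in sections for f in self.SECTIONS[s]]
        return self.fileids()

    def raw(self, fileids=None, sections=None):
        return PlaintextCorpusReader.raw(self, self._resolve(fileids, sections))

# reader = SectionedCorpusReader('corpus_root', r'.*\.txt')
# print(reader.raw(sections='intro')[:100])
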
Example 42
def Get_text(corpus_root = '/release/'):    
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    print "corpus read from " + corpus_root + " ..."
    raw = wordlists.raw()
    print "corpus rawed ..."
    tokens = nltk.word_tokenize(raw)
    print "corpus tokenized ..."
    text = nltk.Text(tokens)
    print "corpus textified ..."        
    simple_md = [word.lower() for word in text if word.isalpha()]
    print "corpus lowercased and alphafied ..."
    simple_md = [word for word in simple_md if word != 'source']
    print "keyword *source* removed ..."
    print "DONE!"
    return simple_md
Example 43
def preprocTrain(corpus, tf_file, vocab_file):
    global MIN_FREQ
    stopwds = stopwords.words('english')

    TF = {} #gets the freq for each token
    filter_TF = {} #get the freq for each token having freq > minFreq
    feature_train = {} #final features for training class. Passed on to write ARFF files
    vocabulary = []
    ctDocs = {}
    totalDocs = 0
    minFreq = MIN_FREQ
    TrainingFiles = {}

    #loading our corpus
    corpus_root=corpus
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    ctDocs = len(wordlists.fileids()) #total no of files in each class
    totalDocs = ctDocs + totalDocs #total no of files
    TrainingFiles = wordlists.fileids() #contains files for each class

    sys.stderr.write("Reading corpus")
    for fileid in wordlists.fileids():
        sys.stderr.write(".")

        raw = wordlists.raw(fileid)
        tokens = nltk.word_tokenize(raw)
        text = nltk.Text(tokens)

        words = [w.lower() for w in text if w.isalnum() and w.lower() not in stopwds and len(w) > 3]
        vocab = set(words)
        words = nltk.Text(words)

        #calculate TF
        TF[fileid] = {fileid:fileid}
        filter_TF[fileid] = {fileid:fileid}
        for token in vocab:
            TF[fileid][token] = freq(token, words)

            if TF[fileid][token] > minFreq:  #min feature freq.
                vocabulary.append(token)
                filter_TF[fileid][token] = tf(TF[fileid][token],words)

    pickle.dump(filter_TF, open(tf_file, "wb"))
    sys.stderr.write("done\nCalculating TF*IDF scores")
    all_vocabulary = list(set(vocabulary))
    pickle.dump(all_vocabulary, open(vocab_file, "wb"))
    #featureIDF = idf(totalDocs,filter_TF,all_vocabulary)
    pprint(TF, stream=sys.stderr)
Example 44
def parseCorpus(corpusdir):    
    # Load the corpus into nltk and store files as separate file ID, which can be accessed by name.
    # to see which files are in the corpus use the command: corpus.fileids()
    corpus = PlaintextCorpusReader(corpusdir, '.*')
    corpusText = corpus.raw()
    corpusText = re.sub('[^a-zA-Z ]', '', corpusText)
    corpusText = re.sub('(?P<endword>[a-z]+)(?P<begword>[A-Z])', '\g<endword> \g<begword>', corpusText)
    corpusText = corpusText.lower()
    # Create Frequency Distributions for bigrams
    # Dem_bigrams = BigramCollocationFinder._ngram_freqdist(Dem_wordlist, 2)

    # Create Frequency Distributions for trigrams
    # Dem_trigrams = BigramCollocationFinder._ngram_freqdist(Dem_wordlist, 3)

#    return word_tokenize(corpusText)
    return [w for w in word_tokenize(corpusText) if w not in stopwords.words('english')]
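
The second re.sub above inserts a space wherever a lowercase letter runs directly into an uppercase one, repairing words that were glued together when formatting was stripped. A quick check on a made-up string:

import re

text = "the billThe senator voted"
text = re.sub('(?P<endword>[a-z]+)(?P<begword>[A-Z])', r'\g<endword> \g<begword>', text)
print(text)  # the bill The senator voted
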
Example 45
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')
    ''' head into reuters training data directory '''
    corpus = PlaintextCorpusReader(in_dir, '.*')
    file_names_str = corpus.fileids()  # get list of documents
    file_names = sorted(map(int, file_names_str))
    ''' load corpus '''
    postings = defaultdict(set)
    postings['__all__'] = set(file_names)
    tokens = list()
    for fn in file_names:
        content = corpus.raw(str(fn))  # read file content
        words = tokenize(content)  # tokenization: content -> words
        tokens = stemming(words)  # stemming, singularize
        ''' generate dictionary of (key -> token), (value -> set of document IDs) '''
        for token in tokens:
            postings[token].add(fn)  # add tokens to postings dict
    ''' Output dictionary and postings files '''
    # Dictionary file stores all tokens, with their frequency, offset in the postings file, and size(in bytes)
    # Postings file stores the list of document IDs.

    # write postings file
    dictionary = dict()
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            '''
            len(value) := the frequency of the token(i.e. key)
                        = how many times the token appears in all documents
            offset := current writing position of the postings file
            size := the number of characters written in postings file, in terms of this token
            '''
            offset = postings_file.tell()
            # implement evenly placed skip-pointers in the postings lists
            if key == "__all__":
                size = postings_file.write(pickle.dumps(value))
            else:
                value_updated = apply_skippointer(value)
                size = postings_file.write(pickle.dumps(value_updated))
            dictionary[key] = Entry(len(value), offset, size)

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(dictionary, dictionary_file)
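
A hedged sketch of reading those two files back. The Entry objects' field names are not shown in the snippet, but they are built as (frequency, offset, size), so treating an entry as a 3-tuple and seeking into the postings file should recover one posting list; the paths and the query token below are illustrative only:

import pickle

def load_postings_for(token, dict_path, postings_path):
    # on-disk formats assumed from build_index above
    with open(dict_path, 'rb') as f:
        dictionary = pickle.load(f)
    freq, offset, size = dictionary[token]  # Entry assumed to unpack like a tuple
    with open(postings_path, 'rb') as f:
        f.seek(offset)
        return pickle.loads(f.read(size))

# postings = load_postings_for('market', 'dictionary.txt', 'postings.txt')
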
Example 46
def go(corpus, tf_file, minf):
    stopwds    = stopwords.words('english')
    TF         = {}
    wordlists  = PlaintextCorpusReader(corpus, '.*')

    for fileid in wordlists.fileids():
        text  = nltk.Text(nltk.word_tokenize(wordlists.raw(fileid)))
        words = [w.lower() for w in text if w.isalnum() and w.lower() not in stopwds and len(w) > 3]
        l     = float(len(words))

        TF[fileid] = {}
        for token in set(words):
            count = words.count(token)
            if count > minf: TF[fileid][token] = count / l

    pickle.dump(TF, open(tf_file, "wb"));
    fout = open(tf_file + ".csv", "wb")
    for k,v in TF.iteritems(): fout.write(str(k) + "," + str(v) + "\n")
    fout.close()
Example 47
def diccionario_bigramPalabras():
    # Read and transform the corpus
    corpus = PlaintextCorpusReader("Corpus", '.*')
    tokenizer = RegexpTokenizer(r'[a-zA-Záéíóúñ]+')
    tokens = tokenizer.tokenize(corpus.raw())
    
    # Build the word-bigram dictionary with frequencies
    bigrams_orig = bigrams(tokens)
    fdist = FreqDist(bigrams_orig)
    dict_bigrams = {}
    for b in fdist:
        b_tr = (b[0], traducciones.traduce_numerico(b[1]))
        try:
            if dict_bigrams[b_tr][1] < fdist.get(b):
                dict_bigrams[b_tr] = [b, fdist.get(b)]
        except KeyError:
            dict_bigrams[b_tr] = [b, fdist.get(b)]

    return dict_bigrams
Example 48
def getCorpus (root):
    word_lists = PlaintextCorpusReader (root, '.*')
    lang_hash = {}
    for i in word_lists.fileids():
        [language, file_name] = i.split ('/')
        if language not in lang_hash:
            lang_hash [language] = [file_name]
        else:
            lang_hash[language].append (file_name)
    
    word_dic = {}
    for keys in lang_hash:
        file_list = lang_hash[keys]
        set_words = set()
        lis = []
        for lang_file in file_list:
            sentences = sent_tokenize( (word_lists.raw (keys + '/' +lang_file)))
            lis.extend(sentences)
        word_dic[keys] = lis

    return word_dic
Example 49
def getCorpus(root):
    word_lists = PlaintextCorpusReader(root, '.*')
    lang_hash = {}
    for i in word_lists.fileids():
        [language, file_name] = i.split('/')
        if language not in lang_hash:
            lang_hash[language] = [file_name]
        else:
            lang_hash[language].append(file_name)

    word_dic = {}
    for keys in lang_hash:
        file_list = lang_hash[keys]
        set_words = set()
        lis = []
        for lang_file in file_list:
            sentences = sent_tokenize((word_lists.raw(keys + '/' + lang_file)))
            lis.extend(sentences)
        word_dic[keys] = lis

    return word_dic
Example 50
def diccionario_unigramPalabras():
    # Read and transform the corpus
    corpus = PlaintextCorpusReader("Corpus\\", '.*')
    tokenizer = RegexpTokenizer(r'[a-zA-Záéíóúñ]+')
    tokens = tokenizer.tokenize(corpus.raw())

    # Build the word-unigram dictionary with frequencies
    fdist = FreqDist(tokens)  # frequency analysis
    list_tokens_num = fdist.most_common()
    dict_unigrams = {}
    for t in list_tokens_num:
        # Translate each word to its numeric code: [numeric, frequency]
        t_num = [traducciones.traduce_numerico(t[0]), t[1]]
        try:
            # If the numeric code already exists in the dictionary, append the word and its frequency
            dict_unigrams[(int(t_num[0]))] = dict_unigrams[(int(t_num[0]))] + [[t[0], t[1]]]
        except KeyError:
            # If it does not exist yet, create the entry
            dict_unigrams[int(t_num[0])] = [[t[0], t[1]]]

    return dict_unigrams
Example 51
def carga_mongodb():

    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs=db.DOCS
    spanish_stops = set(stopwords.words('spanish'))
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    newcorpus.fileids()

    for fileid in newcorpus.fileids():

        try:
            num_words = len(newcorpus.words(fileid))
            words = newcorpus.words(fileid)
            # num_sents = len(newcorpus.sents(fileid))
            # print(newcorpus.raw(fileid))
            #bcf = BigramCollocationFinder.from_words(words)
            #filter_stops = lambda w: len(w) < 3 or w in spanish_stops
            #bcf.apply_word_filter(filter_stops)
            tags_array=vocab_words(newcorpus.raw(fileid))
            tags=tags_array[0]
            tags_vocab=tags_array[1]
            cloud=tags_array[2]
            total_cloud=[]

            for c in cloud:
                reg={}
                reg['word']=c[0]
                reg['total']=c[1]
                total_cloud.append(reg)

            # insert the document
            post = {"nombre":  fileid, "fecha": datetime.datetime.utcnow(), "texto":preparar_texto(newcorpus.raw(fileid)), "tags_vocab":tags_vocab, "tags":tags, "enc":random.randint(1, 50), "pos":random.randint(1, 10), "neg":random.randint(1, 5), "num_words":num_words, "cloud":total_cloud}
            post_id = docs.insert_one(post).inserted_id

        except:
            print("Importacion Fallida:" + fileid)
Example 52
def build_graph(folder, file_pattern):
    corpus_root = os.getcwd() + "/" + folder
    print "Membuka korpus " + folder + " ..."
    word_lists = PlaintextCorpusReader(corpus_root, file_pattern)

    naskah = word_lists.sents()
    filelists = word_lists.fileids()
    teks = tokenize.sent_tokenize(word_lists.raw(fileids=filelists))

    print folder + " memiliki " + str(len(teks)) + ", " + str(len(naskah)) + " kalimat."

    G_result = nx.Graph()
    print "Membangun graf " + folder + " ..."
    for kalimat in naskah:
        kata = kalimat[0]
        prevToken = kata.lower()
        for idx in range(1, len(kalimat)):
            kata = kalimat[idx]
            token = kata.lower()
            if containsLetter(token) and containsLetter(prevToken):
                G_result.add_edge(prevToken, token)
                prevToken = token

    return G_result
Example 53
def loadCustomStopList():
	f = open('Unit4/customStop.txt', 'r')
	lst = []
	for l in f.readlines():
		lst.append(l.rstrip('\n'))  # strip the trailing newline from each stop word
	return lst

commonBigrams = loadCommon()


collection = PlaintextCorpusReader("CollectionSmall", '.*')
classevent = PlaintextCorpusReader("Islip13Rain", '.*')

finder = BigramCollocationFinder.from_words(collection.words())

classeventTokens = nltk.wordpunct_tokenize(classevent.raw())
classEventBigrams = nltk.bigrams(classeventTokens)
classEventBigram_FD = nltk.FreqDist(classEventBigrams)

collectionTokens = nltk.wordpunct_tokenize(collection.raw())
collectionBigrams = nltk.bigrams(collectionTokens)
collectionBigrams_FD = nltk.FreqDist(collectionBigrams)

collectionCommonBigrams = {}



for bigram in commonBigrams:
	if bigram in collectionBigrams_FD:
		collectionCommonBigrams[bigram] = collectionBigrams_FD[bigram]
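
finder is built above but never queried. A short sketch of what BigramCollocationFinder is typically used for, scoring collocations over the same collection corpus by pointwise mutual information:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(collection.words())
finder.apply_freq_filter(3)                   # drop very rare pairs
print(finder.nbest(bigram_measures.pmi, 10))  # ten highest-PMI bigrams
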
Example 54
    print_chunk_score(chunk_score)

    if save:
        comp_sci_chunker.save_chunker()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        if sys.argv[1] == 'comp-sci':
            train_computer_science(True)

    else:
        comp_sci_corpus = PlaintextCorpusReader('{}/corpus/computerscience/'
                                                .format(os.path.dirname(os.path.abspath(__file__))), '.*')

        comp_sci_chunker = Chunker('computerscience', comp_sci_corpus.raw('train.txt'))
        chunk_score = comp_sci_chunker.evaluate(comp_sci_corpus.raw('test.txt'))

        print_chunk_score(chunk_score)

        while True:
            try:

                sentence = raw_input("Please enter a sentence:\n")

                sentence_keywords = comp_sci_keywords.generate_keywords(sentence)

                print parse_sentence(comp_sci_chunker, sentence, sentence_keywords)

            except Exception as e:
                print e
Example 55
def getTexts():
    #Create a list of texts to search

    print("\nExcuse me just a moment while I get all of my books in order...\n")
    
    corpusRoot = '.'
    textList = PlaintextCorpusReader(corpusRoot, r'.*.txt')
    
    
    return [sent_tokenize(nltk.corpus.webtext.raw('overheard.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('austen-emma.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('austen-persuasion.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('austen-sense.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('bryant-stories.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('burgess-busterbrown.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('carroll-alice.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('chesterton-ball.txt')),
            sent_tokenize(nltk.corpus.gutenberg.raw('chesterton-brown.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('chesterton-thursday.txt')), 
            sent_tokenize(nltk.corpus.gutenberg.raw('edgeworth-parents.txt')), 
            sent_tokenize(nltk.corpus.nps_chat.raw()), 
            sent_tokenize(nltk.corpus.brown.raw(categories=['adventure', 'belles_lettres', 'fiction', 'humor',  
                                                          'lore', 'mystery', 'romance', 'science_fiction'])), 
            #sent_tokenize(nltk.corpus.switchboard.raw()),      
            sent_tokenize(textList.raw('corpuses/auto_girls.txt')), 
            sent_tokenize(textList.raw('corpuses/birthday_party.txt')), 
            sent_tokenize(textList.raw('corpuses/bobsey_city.txt')), 
            sent_tokenize(textList.raw('corpuses/bobsey_home.txt')), 
            sent_tokenize(textList.raw('corpuses/bobsey_indoors.txt')), 
            sent_tokenize(textList.raw('corpuses/bobsey_meadow.txt')), 
            sent_tokenize(textList.raw('corpuses/bonnie_charlie.txt')), 
            sent_tokenize(textList.raw('corpuses/boys_and_girls_bookshelf.txt')), 
            sent_tokenize(textList.raw('corpuses/bunny_brown.txt')), 
            sent_tokenize(textList.raw('corpuses/buster_bear.txt')), 
            sent_tokenize(textList.raw('corpuses/charlie_rescue.txt')), 
            sent_tokenize(textList.raw('corpuses/childhoods_favorites.txt')), 
            sent_tokenize(textList.raw('corpuses/childrens_hour.txt')), 
            sent_tokenize(textList.raw('corpuses/childrens_old_favorites.txt')), 
            sent_tokenize(textList.raw('corpuses/child_stories_from_masters.txt')), 
            sent_tokenize(textList.raw('corpuses/curlytops.txt')), 
            sent_tokenize(textList.raw('corpuses/daddy_garden.txt')), 
            sent_tokenize(textList.raw('corpuses/dorothy_dale.txt')),
            sent_tokenize(textList.raw('corpuses/double_dare.txt')), 
            sent_tokenize(textList.raw('corpuses/enchanted_castle.txt')), 
            sent_tokenize(textList.raw('corpuses/errand_boy.txt')), 
            sent_tokenize(textList.raw('corpuses/etheldreda.txt')), 
            sent_tokenize(textList.raw('corpuses/fairy_tales_every_child.txt')), 
            sent_tokenize(textList.raw('corpuses/favorite_stories_every_child.txt')), 
            sent_tokenize(textList.raw('corpuses/friend_smith.txt')), 
            sent_tokenize(textList.raw('corpuses/girl_commune.txt')), 
            sent_tokenize(textList.raw('corpuses/girlhood.txt')), 
            sent_tokenize(textList.raw('corpuses/girl_in_ten_thousand.txt')), 
            sent_tokenize(textList.raw('corpuses/golden_moments.txt')), 
            sent_tokenize(textList.raw('corpuses/henrietta_hen.txt')), 
            sent_tokenize(textList.raw('corpuses/honorable_miss.txt')), 
            sent_tokenize(textList.raw('corpuses/houseful_girls.txt')), 
            sent_tokenize(textList.raw('corpuses/jolly_fellowship.txt')), 
            sent_tokenize(textList.raw('corpuses/jos_boys.txt')), 
            sent_tokenize(textList.raw('corpuses/junior_classics.txt')), 
            sent_tokenize(textList.raw('corpuses/kates_ordeal.txt')), 
            sent_tokenize(textList.raw('corpuses/laugh_and_play.txt')), 
            sent_tokenize(textList.raw('corpuses/lightfoot_deer.txt')), 
            sent_tokenize(textList.raw('corpuses/little_maid.txt')), 
            sent_tokenize(textList.raw('corpuses/little_marian.txt')), 
            sent_tokenize(textList.raw('corpuses/little_mother.txt')), 
            sent_tokenize(textList.raw('corpuses/luckiest_girl.txt')), 
            sent_tokenize(textList.raw('corpuses/magic_pudding.txt')), 
            sent_tokenize(textList.raw('corpuses/maida_shop.txt')), 
            sent_tokenize(textList.raw('corpuses/marjie_busy.txt')), 
            sent_tokenize(textList.raw('corpuses/mary_louse.txt')), 
            sent_tokenize(textList.raw('corpuses/modern_tomboy.txt')), 
            sent_tokenize(textList.raw('corpuses/mrs_quack.txt')), 
            sent_tokenize(textList.raw('corpuses/mystery_putnam.txt')), 
            sent_tokenize(textList.raw('corpuses/navy_girl.txt')), 
            sent_tokenize(textList.raw('corpuses/patty_and_azalea.txt')), 
            sent_tokenize(textList.raw('corpuses/patty_social.txt')), 
            sent_tokenize(textList.raw('corpuses/patty_suitor.txt')), 
            sent_tokenize(textList.raw('corpuses/peck_bad.txt')), 
            sent_tokenize(textList.raw('corpuses/phoenix_and_carpet.txt')), 
            sent_tokenize(textList.raw('corpuses/plebe_year.txt')), 
            sent_tokenize(textList.raw('corpuses/polite_princes.txt')), 
            sent_tokenize(textList.raw('corpuses/polly.txt')), 
            sent_tokenize(textList.raw('corpuses/poor_proud.txt')), 
            sent_tokenize(textList.raw('corpuses/railway_kids.txt')), 
            sent_tokenize(textList.raw('corpuses/rollo_play.txt')), 
            sent_tokenize(textList.raw('corpuses/rusty_wren.txt')), 
            sent_tokenize(textList.raw('corpuses/sailor_girl.txt')), 
            sent_tokenize(textList.raw('corpuses/sara_crewe.txt')), 
            sent_tokenize(textList.raw('corpuses/snowball_lamb.txt')), 
            sent_tokenize(textList.raw('corpuses/sweet_maid.txt')), 
            sent_tokenize(textList.raw('corpuses/ted_and_phone.txt')), 
            sent_tokenize(textList.raw('corpuses/three_towers.txt')), 
            sent_tokenize(textList.raw('corpuses/tim_turtle.txt')), 
            sent_tokenize(textList.raw('corpuses/uncle_ike.txt')), 
            sent_tokenize(textList.raw('corpuses/west_wind.txt')), 
            sent_tokenize(textList.raw('corpuses/white_feather.txt')), 
            sent_tokenize(textList.raw('corpuses/young_folks_treasury.txt')),
            sent_tokenize(textList.raw('corpuses/works_of_fielding.txt')), 
            sent_tokenize(textList.raw('corpuses/useful_phrases.txt')), 
            sent_tokenize(textList.raw('corpuses/tudor_conversation.txt')), 
            sent_tokenize(textList.raw('corpuses/phrases_for_speakers.txt')), 
            sent_tokenize(textList.raw('corpuses/bequest.txt')),
            sent_tokenize(textList.raw('corpuses/english_spoke.txt')), 
            sent_tokenize(textList.raw('corpuses/phrase_book.txt'))
            ]
Example 56
def begin(wordlist): #process the files so I know what was read in
    for fileid in wordlist.fileids():
        num_chars = len(wordlist.raw(fileid))
        num_words = len(wordlist.words(fileid))
        num_sents = len(wordlist.sents(fileid))
        
        # normalize the vocabulary and stem the words using the Porter stemmer
        num_vocab = len(set([w.lower() for w in wordlist.words(fileid)]))
        
        #print fileid, int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab)



files = wordlist.fileids()
raw = wordlist.raw(files[5])

## normalize and tokenize
porter = nltk.PorterStemmer()
temp = re.split(r'\W+', raw) ## split the raw text into appropriate words 
##pattern = r'\W+'
##temp = nltk.regexp_tokenize(ld.raw, pattern)

temp = [porter.stem(t) for t in temp]
temp = [t.lower() for t in temp]
## You should also tokenize contractions like didn't to did and not
## Also put numbers that belong together as one unit. For example 9 1/2 should be "9" and "1/2"
## not "9", "1", "2"


## part of speech tagging