def main(args):
    corpus = CHILDESCorpusReader(args.dir, args.glob)
    for fileid in corpus.fileids():
        for sentence in corpus.words(fileid, relation=True):
            try:
                print_conllu(sentence, sys.stdout)
            except Exception:
                # Some of the sentences fail because the parses aren't complete. Oh well.
                pass
def scandirs(path, d):
    corpus_root = nltk.data.find(path)
    for currentFile in glob.glob(os.path.join(path, '*')):
        if os.path.isdir(currentFile):
            print(currentFile)
            s = sentence_cut(currentFile)  # returns name of kid (directory)
            s += '/.*.xml'
            manchester = CHILDESCorpusReader(corpus_root, s)
            li = manchester.words(speaker='MOT')  # only the mother's words
            for i in li:
                if i != 'xxx':
                    d[len(i)].append(i)
def TextLoader(file_locations, cds):
    verb_counter = {}
    for loc in file_locations:
        dir = re.sub(r"(.+/)[^/]+$", r"\1", loc)
        fileid = re.sub(r".+/([^/]+)$", r"\1", loc)
        corpus = CHILDESCorpusReader(dir, fileid)
        age = corpus.age(fileids=fileid, month=True)
        name = loc.split("/")[-2]
        if name not in verb_counter:
            verb_counter[name] = {}
        if cds:
            spkrs = [spkr for spkr in corpus.participants(fileid)[0].keys()
                     if spkr != "CHI"]
        else:
            spkrs = ["CHI"]
        # sents = corpus.sents(speaker=spkrs)
        tagged_words = corpus.tagged_words(speaker=spkrs, stem=True, replace=True)
        words = [word[0] for word in tagged_words
                 if len(word[1]) > 0 and word[1][0] == "v"]
        try:
            age = int(age[0])
            if age not in verb_counter[name]:
                verb_counter[name][age] = Counter(words)
            else:
                verb_counter[name][age] += Counter(words)
        except (TypeError, ValueError, IndexError):
            # skip files whose age metadata is missing or malformed
            pass
        '''
        for word in words:
            if (word[1][0] == "v"):
                yield(word[0], age[0], name)
        '''
        # cleaned_sent = []
        # for stem in s:
        #     stem = re.sub(r'-[^~]+', "", stem)
        #     if "~" in stem:
        #         cleaned_sent += stem.split("~")
        #     else:
        #         cleaned_sent.append(stem)
        # yield (" ".join(s), age[0], name)
    return verb_counter
def scandirs(path, part_ofspeech, dependencies):
    corpus_root = nltk.data.find(path)
    for currentFile in glob.glob(os.path.join(path, '*')):
        if os.path.isdir(currentFile):
            print(currentFile)
            s = sentence_cut(currentFile)  # returns name of kid (directory)
            s += '/.*.xml'
            print(s)
            manchester = CHILDESCorpusReader(corpus_root, s)
            li = manchester.words(relation=True, speaker='MOT')  # only the mother's words
            for i in li:
                for k in i:
                    if len(k) >= 3:
                        depen(k, dependencies)
                        partOfSpeech(k, part_ofspeech)
def TextLoader(file_locations, cds):
    for loc in file_locations:
        dir = re.sub(r"(.+/)[^/]+$", r"\1", loc)
        fileid = re.sub(r".+/([^/]+)$", r"\1", loc)
        corpus = CHILDESCorpusReader(dir, fileid)
        age = corpus.age(fileids=fileid, month=True)
        name = loc.split("/")[-2]
        if cds == 1:
            spkrs = [spkr for spkr in corpus.participants(fileid)[0].keys()
                     if spkr != "CHI"]
        else:
            spkrs = ["CHI"]
        sents = corpus.sents(speaker=spkrs, stem=True, replace=True)
        for s in sents:
            yield (" ".join(s), age[0], name)
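# A minimal usage sketch for the generator above; the file path below is a
# placeholder for illustration, not one from the original project.
#
#   for sent, age_months, child in TextLoader(["Manchester/anne/010100a.xml"], cds=1):
#       print(child, age_months, sent)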
def _process_metadata(self):
    '''
    Copy the child and speaker metadata dicts into the CHILDESSentence
    object's internal dictionary, prefixing the keys with 'child_' and
    'speaker_' accordingly. This exposes the child and speaker metadata
    as object attributes.
    '''
    for k, v in self.child_metadata.items():
        if k != 'age':
            self.__dict__['child_' + k] = v
        else:
            self.__dict__['child_' + k] = CHILDESCorpusReader('', '').convert_age(v)
    for k, v in self.speaker_metadata.items():
        self.__dict__['speaker_' + k] = v
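# Aside: the CHILDESCorpusReader('', '').convert_age(...) call above borrows the
# reader only for its age-string parser, which turns a CHILDES 'P#Y#M#D' age
# string into months. The value below is an illustration, not data from this
# project:
#
#   CHILDESCorpusReader('', '').convert_age('P2Y6M14D')  # -> 30 (months)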
#### add overlap presence, run on MPI-EVA-manchester
import csv
import nltk
from nltk.parse import TestGrammar
from nltk.corpus.reader import CHILDESCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
from nltk.util import flatten, LazyMap, LazyConcatenation
from nltk.compat import string_types

NS = 'http://www.talkbank.org/ns/talkbank'

corpus_root = nltk.data.find('corpora/childes/Eng-UK')
manchester_corpus_root = nltk.data.find(
    'corpora/childes/Eng-UK/MPI-EVA-Manchester')
thomas = CHILDESCorpusReader(corpus_root, 'Thomas/.*.xml')
eleanor = CHILDESCorpusReader(manchester_corpus_root, 'eleanor/.*.xml')
fraser = CHILDESCorpusReader(manchester_corpus_root, 'fraser/.*.xml')

corpus_rt_total = 0
corpus_rt_num = 0
corpus_rt_avg = 0
corpus_noerr_rt_total = 0
corpus_noerr_rt_num = 0
corpus_noerr_rt_avg = 0
corpus_err_rt_total = 0
corpus_err_rt_num = 0
corpus_err_rt_avg = 0
corpus_total_errs = 0
def get_corpus_reader(language):
    return CHILDESCorpusReader(corpus_root, r'%s.*/.*\.xml' % language[:3].title())
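# Rough usage sketch, assuming corpus_root points at a CHILDES data-xml tree
# whose collection folders start with a three-letter language prefix (e.g.
# 'Eng-NA', 'Eng-UK'); 'english' is only an illustrative argument:
#
#   reader = get_corpus_reader('english')  # fileid pattern becomes r'Eng.*/.*\.xml'
#   print(len(reader.fileids()))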
    return file_folder


# Iterates through the directory
xml_files = xtract_XML_files(directory_path)

# Creates a CSV file
with open(directory + '_' + file_name + '.csv', 'w') as csvfile:
    fieldnames = ['Corpus', 'File', 'Name', 'Verb', 'Age', 'Sent',
                  'Syntatic Object', 'Event or Object']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for xml_folder in xml_files:
        corpus_folder = CHILDESCorpusReader(xml_folder, '.*.xml')
        # Stores the data of the corpora
        corpus_data = corpus_folder.corpus(corpus_folder.fileids())
        # Prints out corpus & child information
        for file in corpus_folder.fileids():
            # Stores all the sentences spoken by the speaker
            corpus_sents = corpus_folder.sents(file, speaker=speaker)
            # Stores all the sentences, with words in stem form
            corpus_sents_stems = corpus_folder.sents(file, speaker=speaker, stem=True)
            corpus_participant = corpus_folder.participants(file)
""" The main driver function for data processing, and collecting features. """ if __name__ == '__main__': t = time.time() # Initialization output = [] d = cmudict.dict() parser = English() # get corpus directories corpus_root_xml = nltk.data.find( 'C:\\Users\\James\\PycharmProjects\\FIT3036\\xml') corpus_root_plain = 'C:\\Users\\James\\PycharmProjects\\FIT3036\\plain_text' # get all xml and plain text files from specified directories corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml') corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha') # get all the words spoken by a child all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])] # init wordnet and language model corpus_ic = wn.ic(corpus_xml, True, 1.0) lm = LanguageModel(all_words) # collect all the features for each corpus for j in range(len(corpus_xml.fileids())): current_features = [] # init empty array to store features # Text initialization text_xml = corpus_xml.fileids()[j] text_plain = corpus_plain.fileids()[j]
def read_files_by_age(file):
    phrase = []
    sentences = []
    canonical_sentences = []
    tuple = []
    unknown = ['xxx', 'www', 'mm']
    ignoredMotPOS = ['chi', 'fam', 'neo']

    with open(path + file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for row in reader:
            tuple.append((row['path']))

    for i1 in tuple:
        source = CHILDESCorpusReader(corpus_root, i1)
        sents_chi_pos = source.tagged_sents(speaker='CHI', replace=True)
        for i2 in sents_chi_pos:
            for j1 in i2:
                if (j1[0] not in unknown) and (j1[0] != '') and (j1[1] not in ignoredMotPOS):
                    phrase.append(j1[0])
            if phrase != []:
                t_sentences = []
                treated_sentence = []
                for i3 in phrase:
                    res = re.search(r"(?!'.*')\b[\w']+\b", i3)
                    t_sentences.append(res.group(0))
                treated_sentence.append(t_sentences)
                sentences.append(t_sentences)
                for i4 in treated_sentence:
                    splited_phrase = []
                    for j2 in i4:
                        rep = replacer.replace(j2)
                        if ' ' in rep:
                            a, b = rep.split(' ')
                            splited_phrase.append(canonicalTag(a))
                            splited_phrase.append(canonicalTag(b))
                        else:
                            splited_phrase.append(canonicalTag(rep))
                    canonical_sentences.append(splited_phrase)
                # sentences = []
                phrase = []

    # target = open('Corpus/ByAge/Original/sentence_' + file.split('.')[0] + '.txt', encoding='utf-8')
    # target_canonical = open('Corpus/ByAge/Canonical/canonical_sentence_' + file.split('.')[0] + '.txt', mode='w', encoding='utf-8')
    # shutil.copyfileobj(source, target)
    target = open(
        'Corpus/Sentences/ByAge/Original/sentence_' + file.split('.')[0] + '.txt', 'w')
    target_canonical = open(
        'Corpus/Sentences/ByAge/Canonical/canonical_sentence_' + file.split('.')[0] + '.txt', 'w')

    for i5 in sentences:
        for j4 in i5:
            target.write(j4 + ' ')
        target.write('\n')
    target.close()
    print('finished (original)')

    for i6 in canonical_sentences:
        for j5 in i6:
            target_canonical.write(j5 + ' ')
        target_canonical.write('\n')
    target_canonical.close()
    print('finished (canonical)')
def extraction_sentences():
    '''
    Processes all datasets individually, child by child.
    '''
    # list all directories (datasets).
    for index, dataset in enumerate(os.listdir(dir_all_dataset)):
        print '======================================================'
        path_dataset = dir_all_dataset + dataset
        print '> Dataset {} : {} \n'.format(index, dataset)
        # list all subdirectories (children).
        for dir_child in os.listdir(path_dataset):
            # check if it's a directory.
            if os.path.isdir(os.path.join(path_dataset, dir_child)):
                print '>> Child:', dir_child
                '''
                Read all .xml files, extracting only the child's speech and
                saving it to a new .txt file.
                '''
                path_files_xml = path_dataset + '/' + dir_child + '/xml'
                corpus_root = nltk.data.find(path_files_xml)
                ccr = CHILDESCorpusReader(corpus_root, '/.*.xml')
                file_speaks_child = open(
                    path_dataset + '/' + dir_child + '/' +
                    dir_child.lower() + '_speaks_chi.txt', 'w')
                for file_xml in os.listdir(path_files_xml):
                    if file_xml.endswith('.xml'):
                        age = ccr.age(file_xml)  # get the child's age.
                        age = str(age)  # convert to string.
                        if age != '[None]':  # check whether age is None.
                            # age format: P 2Y 4M 9D | P 1Y 11M 29D
                            new_age = []  # out = P-1Y-1M-29D-
                            for ch in str(age):
                                new_age.append(ch)
                                if ch.isalpha():
                                    new_age.append('-')
                            new_age = ''.join(new_age)  # split.
                            # clean up.
                            new_age = new_age.replace('[', '').replace(']', '').replace("'", '')
                            # separate the P and Y parts.
                            P, Y = new_age.split('-', 1)
                            if Y[0] == age_child_limit:  # check age child is >= age_child_limit
                                sentences = ccr.sents(file_xml, speaker=['CHI'])
                                for sentence in sentences:
                                    try:
                                        file_speaks_child.write(
                                            str(" ".join(sentence) + '\n'))
                                    except UnicodeEncodeError:
                                        file_speaks_child.write(
                                            str(" ".join(sentence).encode('utf-8') + '\n'))
                file_speaks_child.close()
                print '>> Extracted child speech: ' + dir_child.lower() + '_speaks_chi.txt'
                '''
                Create a new file with the sentences that have five or more tokens.
                '''
                if dir_child != 'xml':
                    list_sentences_temp = []
                    file_input = dir_child.lower() + '_speaks_chi.txt'
                    file_output = dir_child.lower() + '_speaks_chi_selected.txt'
                    with open(path_dataset + '/' + dir_child + '/' + file_input) as document:
                        for line in document:
                            sentence = to_treat_bigram(line.split())
                            sentence = remove_stops_words(sentence.split())
                            sentence = check_sentence(sentence)
                            # spellchecker(sentence.split())
                            if sentence != '' and sentence is not None and len(sentence.split()) >= tokens_limit:
                                # Add a final period to each sentence and store it in a temporary list.
                                list_sentences_temp.append(sentence.capitalize() + '.')
                    # Remove duplicate sentences.
                    list_sentences_temp = set(list_sentences_temp)
                    # Save the selected sentences.
                    file_speaks_child_selected = open(
                        path_dataset + '/' + dir_child + '/' + file_output, 'w')  # a+
                    for sent in list_sentences_temp:
                        file_speaks_child_selected.write(str(sent) + '\n')
                    file_speaks_child_selected.close()
                    print '>> Selected child speech: ' + dir_child.lower() + '_speaks_chi_selected.txt'
                print ''
        print ''
# CHILDES Corpus
# - NLTK can deal with the XML format of the CHILDES corpus
# - CHILDES XML is available at https://childes.talkbank.org/data-xml/

import nltk
from nltk.corpus.reader import CHILDESCorpusReader

r = CHILDESCorpusReader('../../../Corpus/CHILDES/Chinese/Chang1_xml/', '.*.xml')
r.fileids()

# print a basic profile for each xml file
for f in r.fileids()[:5]:
    cur_corpus = r.corpus(f)[0]
    print(cur_corpus['Corpus'], cur_corpus['PID'],
          cur_corpus['ActivityType'], cur_corpus['Date'])
    print("Num of Words: {}".format(len(r.words(f))))
    print("Num of Sents: {}".format(len(r.sents(f))))

# participants
r.participants(fileids=r.fileids()[10])[0]  # participants of one file
all_speakers = r.participants()
for speakers_cur_file in all_speakers[:5]:
    print("====")
    for spid in speakers_cur_file.keys():
        cur_spid_data = speakers_cur_file[spid]
def main():
    nltk_download_dir = '/home/rodriguesfas/nltk_data'
    brown_corpus_root = os.path.join(nltk_download_dir, 'corpora/CHILDES/Eng-NA-MOR/Valian/')
    brown_corpus = CHILDESCorpusReader(root=brown_corpus_root, fileids='.+xml')

    print brown_corpus.fileids()[:5]

    fileids = ['02b.xml', '03a.xml']
    print brown_corpus.age(fileids=fileids)
    print brown_corpus.MLU(fileids=fileids)
    print {fid: brown_corpus.age(fileids=fid)[0] for fid in fileids}
    print {fid: brown_corpus.MLU(fileids=fid)[0] for fid in fileids}
    print {fid: brown_corpus.age(fileids=fid, month=True)[0] for fid in fileids}

    metadata = brown_corpus.participants(fileids='03a.xml')[0]
    ## comment this out if you don't have the pretty package
    print(metadata)
    ## uncomment this if you don't have the pretty package
    #print metadata

    print 'words:', brown_corpus.words(fileids='03a.xml')[:7]
    print 'sents:', brown_corpus.sents(fileids='03a.xml')[:3]
    print 'tagged words:', brown_corpus.tagged_words(fileids='03a.xml')[:7]
    print 'tagged sents:', brown_corpus.tagged_sents(fileids='03a.xml')[:3]

    print "Adam:", '\t', brown_corpus.sents(fileids='03a.xml', speaker='CHI')[:5]
    print "Mother:", brown_corpus.sents(fileids='03a.xml', speaker='MOT')[:2]

    mother_unstemmed = brown_corpus.sents(fileids='03a.xml', speaker='MOT')
    mother_stemmed = brown_corpus.sents(fileids='03a.xml', speaker='MOT', stem=True)
    mother_root = [[stemmed_word.split('-')[0] for stemmed_word in stemmed_sent]
                   for stemmed_sent in mother_stemmed]
    print 'Raw:\t\t', mother_unstemmed[:2]
    print 'Stemmed:\t', mother_stemmed[:2]
    print 'Roots only:\t', mother_root[:2]
import os
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find(
    '/Users/tessacharlesworth/Desktop/Embeddings/Raw Data/CHILDES raw text/'
)  # change to the local directory where the raw text files are stored
childes = CHILDESCorpusReader(corpus_root, r'\S*.xml')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

file_count = 0
writefile_children = open(
    "/Users/tessacharlesworth/Desktop/Embeddings/Clean Data/CHILDES clean text/corpus_children.txt",
    'w+'
)  # change to the local directory where the combined text files should be stored; keep "corpus_children.txt", 'w+' at the end

for (root, dirs, files) in os.walk(
        "/Users/tessacharlesworth/Desktop/Embeddings/Raw Data/CHILDES raw text/",
        topdown=True):  # change to the local directory where the raw text files are stored
    for file in files:
        if file[-4:] == '.xml':
            print(file)
            output = childes.words(os.path.join(root, file), speaker=['CHI'], replace=True)
            output = list(filter(lambda a: a != 'xxxxxxxx', output))
            output = list(filter(lambda a: a != 'xxxxxxx', output))
            output = list(filter(lambda a: a != 'xxxxxx', output))
            output = list(filter(lambda a: a != 'xxxxx', output))
    'across', 'everything', 'maybe', 'big', 'little', 'nice', 'wow', 'new',
    'cool', 'else', 'ago', 'almost', 'another', 'ahead', 'always', 'already',
    'whoops', 'em', 'wan', 'much', 'nope', 'hum', 'anyways', 'yet', 'though',
    'somethin', 'cha', 'anything', 'somebody', 'may', 'still', 'uhoh', 'also',
    'instead', 'whose', 'without', 'behind', 'anybody', 'any', 'away', 'why',
    'please', 'yay', 'oops', 'any', 'please', 'another', 'something', 'very'
])
#sw = [stemmer.stem(item) for item in sw]

with open('animal.csv', 'rb') as f:
    reader = csv.reader(f)
    animal = []
    for row in reader:
        animal.extend(row)

childes = CHILDESCorpusReader(corpus_root, '.*.xml', lazy=False)
files = childes.fileids()
resultlist = []
for filename in files:
    sents = childes.sents(filename)[0]
    filew = []
    for sent in sents:
        result_lower = [item.lower() for item in sent]
        #result_stem = [stemmer.stem(item) for item in result_lower]
        result_clean = [
            item for item in result_lower
            if '\'' not in item and '_' not in item and len(item) > 1
        ]
        result = [item for item in result_clean if item not in sw]
        filew.extend(result)
#### add overlap presence, run on MPI-EVA-manchester
import csv
import nltk
from nltk.parse import TestGrammar
from nltk.corpus.reader import CHILDESCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
from nltk.util import flatten, LazyMap, LazyConcatenation
from nltk.compat import string_types

NS = 'http://www.talkbank.org/ns/talkbank'

corpus_root = nltk.data.find('corpora/childes/Eng-UK')
manchester_corpus_root = nltk.data.find(
    'corpora/childes/Eng-UK/MPI-EVA-Manchester')
eleanor = CHILDESCorpusReader(manchester_corpus_root, 'eleanor/.*.xml')
fraser = CHILDESCorpusReader(manchester_corpus_root, 'fraser/.*.xml')


def getUtterance(xmlsent):
    utterance = ""
    for word in xmlsent.findall('.//{%s}w' % NS):
        if word.text is not None:
            utterance = utterance + " " + word.text
    return utterance


def getRT(s1, s2):
    s1_media = s1.find('.//{%s}media' % NS)
    s2_media = s2.find('.//{%s}media' % NS)
    if s1_media is not None and s2_media is not None:
# code from http://www.nltk.org/howto/childes.html
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = ('/Users/callab/Documents/Projects/CHILDES/Corpora/')


def remove_non_ascii_1(text):
    return ''.join(i for i in text if ord(i) < 128)


#providence = CHILDESCorpusReader(corpus_root, 'childes_corpora/Providence/.*.xml')
childes = CHILDESCorpusReader(corpus_root, '.*.xml')

# Some useful commands.

# display the file names
childes.fileids()

# count the number of files
len(childes.fileids())

# print properties of the corpus files
corpus_data = childes.corpus(childes.fileids())
print(corpus_data[0]['Lang'])
for key in sorted(corpus_data[1].keys()):
    print(key, ":", corpus_data[1][key])

# Print participant information: CHI (target child), MOT (mother), INV (investigator)
# something is wrong in my print
corpus_participants = childes.participants(childes.fileids())
for this_corpus_participants in corpus_participants[3:5]:
    for key in sorted(this_corpus_participants.keys()):
        dct = this_corpus_participants[key]
        print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

# printing words
def main():
    nltk_download_dir = '/home/rodriguesfas/nltk_data'
    brown_corpus_root = os.path.join(nltk_download_dir, 'corpora/CHILDES/Eng-NA-MOR/Valian/')
    brown_corpus = CHILDESCorpusReader(root=brown_corpus_root, fileids='.+xml')

    # display the files
    print brown_corpus.fileids()

    # count the number of files
    print len(brown_corpus.fileids())

    # display the file properties
    corpus_data = brown_corpus.corpus(brown_corpus.fileids())
    print(corpus_data[0]['Lang'])
    for key in sorted(corpus_data[0].keys()):
        print(key, ": ", corpus_data[0][key])

    # Print corpus participant information.
    # The most common participant codes are 'CHI' (target child), 'MOT' (mother) and 'INV' (investigator).
    corpus_participants = brown_corpus.participants(brown_corpus.fileids())
    for this_corpus_participants in corpus_participants[:2]:
        for key in sorted(this_corpus_participants.keys()):
            dct = this_corpus_participants[key]
            print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

    # printing words.
    print brown_corpus.words('01a.xml')

    # printing sentences.
    print brown_corpus.sents('01a.xml')

    # You can specify the participants with the speaker argument.
    print brown_corpus.words('01a.xml', speaker=['INV'])
    print brown_corpus.words('01a.xml', speaker=['MOT'])
    print brown_corpus.words('01a.xml', speaker=['CHI'])

    # tagged_words() and tagged_sents() return the usual (word, pos) tuple lists.
    # POS tags in CHILDES are automatically assigned by the MOR and POST programs (MacWhinney, 2000).
    print brown_corpus.tagged_words('01a.xml')[:30]
    print brown_corpus.tagged_sents('01a.xml')[:10]

    # When the argument stem is true, word stems (e.g., 'is' -> 'be-3PS') are used instead of the original words.
    print brown_corpus.words('01a.xml')[:30]
    print brown_corpus.words('01a.xml', stem=True)[:30]

    # When the argument replace is true, the replaced words are used instead of the original words.
    print brown_corpus.words('01a.xml', speaker='CHI')[247]
    print brown_corpus.words('01a.xml', speaker='CHI', replace=True)[247]

    # When the argument relation is true, the grammatical relations in the sentence are returned.
    # See Sagae et al. (2010) for details of the relational structure adopted in CHILDES.
    print brown_corpus.words('01a.xml', relation=True)[:10]

    # Printing age. When the argument month is true, the age information in CHILDES format is converted into the number of months.
    print brown_corpus.age()
    print brown_corpus.age('01a.xml')
    print brown_corpus.age('01a.xml', month=True)

    # Printing MLU. The criteria for the MLU computation are broadly based on Brown (1973).
    print brown_corpus.MLU()
    print brown_corpus.MLU('01a.xml')

    # Basic stuff
    # Count the number of words and sentences in each file.
    for this_file in brown_corpus.fileids()[:6]:
        print(brown_corpus.corpus(this_file)[0]['Corpus'],
              brown_corpus.corpus(this_file)[0]['Id'])
        print("num of words: %i" % len(brown_corpus.words(this_file)))
        print("num of sents: %i" % len(brown_corpus.sents(this_file)))
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-UK-MOR/')
belfast = CHILDESCorpusReader(corpus_root, 'Belfast/.*.xml')
cruttenden = CHILDESCorpusReader(corpus_root, 'Cruttenden/.*.xml')
manchester = CHILDESCorpusReader(corpus_root, 'Manchester/.*.xml')
tommerdahl = CHILDESCorpusReader(corpus_root, 'Tommerdahl/.*.xml')

print(len(belfast.fileids()))
print(len(cruttenden.fileids()))
print(len(manchester.fileids()))
print(len(tommerdahl.fileids()))
Date: 29/06/2018
Author: RodriguesFAS
Contact: <*****@*****.**>
Tutorials:
    http://www.nltk.org/howto/childes.html
    http://ling-blogs.bu.edu/lx390f17/standoff-annotation-xml-and-more-childes
"""

#%%
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find(
    '/home/rodriguesfas/Mestrado/workspace/specana.prototype/dataset/corpora/childes/data/eng-uk/Belfast/'
)
ccr = CHILDESCorpusReader(corpus_root, 'Barbara/.*.xml')

print ccr.fileids()

# Count the number of files.
print len(ccr.fileids())

# Print the properties of the corpus files.
corpus_data = ccr.corpus(ccr.fileids())
print(corpus_data[0]['Lang'])
for key in sorted(corpus_data[0].keys()):
    print(key, ": ", corpus_data[0][key])

# Print corpus participant information. The most common codes for participants
# are 'CHI' (target child), 'MOT' (mother) and 'INV' (investigator).
import nltk
import csv
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find('corpora/CHILDES/data-xml/Eng-UK-MOR/')
reader = CHILDESCorpusReader(corpus_root, '.*.xml')

# TODO: duplicate files
file_age = []


def save_folder_by_age(path, age):
    size = 6
    base = int(age / size)
    with open('Corpus/FolderByAge/' + 'age_' + str(base * size) + '_' +
              str(((base + 1) * size) - 1) + '.csv', 'a') as csvfile:
        fieldnames = ['pathInput', 'age']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                delimiter=';', lineterminator='\n')
        writer.writerow({'pathInput': path, 'age': age})


def get_age_in_months(arquivo):
    age = reader.age(arquivo)[0]
    year = 0
    month = 0
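# The original get_age_in_months above is truncated; the sketch below shows one
# plausible way to turn a CHILDES 'P#Y#M#D' age string into months. It is an
# illustrative assumption, not the author's implementation (the reader can also
# return months directly via reader.age(fileid, month=True)).
import re


def age_string_to_months(age_string):
    # e.g. 'P2Y6M14D' -> 2 * 12 + 6 = 30; days are ignored here
    match = re.match(r'P(\d+)Y(?:(\d+)M)?', age_string or '')
    if not match:
        return None
    years = int(match.group(1))
    months = int(match.group(2) or 0)
    return years * 12 + months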
import re, csv, os, nltk
import syllable3 as sy
from nltk.corpus import cmudict
from nltk.corpus.reader import CHILDESCorpusReader

## 1) load corpus
# Emma
#corpus_root = nltk.data.find('C:\\Users\\Emma\\AppData\\Roaming\\nltk_data\\corpora\\Brown')
#brown = CHILDESCorpusReader(corpus_root, '.*\\.*.xml')
# Andrew
corpus_root = nltk.data.find('/Users/apc38/Dropbox/workspace/Corpora/CHILDES/xml/BrownXML')
brown = CHILDESCorpusReader(corpus_root, '.*/.*.xml')
fileidlist = brown.fileids()

## 2) make a list of all participants other than children
partislist = brown.participants(fileidlist)
plist = []
patt = re.compile('CHI')
for pdict in partislist:
    for p in pdict.keys():
        if patt.match(p):
            print('ignoring child')
        else:
            print('not a child, this is', p)
            if p not in plist:
                plist.append(p)
                print('added to list, list is now', len(plist), 'items long')

## 3) for each file, get sentences and phoneticize using the CMU pronunciation dictionary
transcr = cmudict.dict()