Ejemplo n.º 1
0
# Root directory that holds the CHILDES XML corpora.
corpus_root = '/Users/callab/Documents/Projects/CHILDES/Corpora/'

def remove_non_ascii_1(text):
    """Return *text* without the characters whose code point is 128 or above."""
    kept = [ch for ch in text if ord(ch) < 128]
    return ''.join(kept)

# Disabled alternative: a reader restricted to the Providence sub-directory.
#providence = CHILDESCorpusReader(corpus_root, 'childes_corpora/Providence/.*.xml')
childes = CHILDESCorpusReader(corpus_root, '.*.xml')


# Quick tour of the reader API.
# Bare expressions below only display a value in an interactive session.

# File names matched by the reader.
childes.fileids()
# Number of matched files.
len(childes.fileids())

# Properties of the corpus files: language of the first file, then every
# property of the second file in key order.
corpus_data = childes.corpus(childes.fileids())
print(corpus_data[0]['Lang'])
second_file_data = corpus_data[1]
for key in sorted(second_file_data):
    print(key ,":", second_file_data[key])

# Participant information. CHI (target child), MOT (mother), INV (investigator).
# NOTE (original author): something looks wrong in the output of this print.
corpus_participants = childes.participants(childes.fileids())
for this_corpus_participants in corpus_participants[3:5]:
    for key in sorted(this_corpus_participants):
        dct = this_corpus_participants[key]
        pairs = [(k, dct[k]) for k in sorted(dct)]
        print(key, ": ", pairs)

# First 100 words.
childes.words()[:100]
# First 100 sentences.
childes.sents()[:100]
#use word stems (e.g., 'is' -> 'be-3PS') instead of the original words.
Ejemplo n.º 2
0
# Iterates through the directory
xml_files = xtract_XML_files(directory_path)

# Creates a CSV file
with open(directory + '_' + file_name + '.csv', 'w') as csvfile:
    fieldnames = ['Corpus', 'File', 'Name', 'Verb', 'Age', 'Sent',
                  'Syntatic Object', 'Event or Object']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for xml_folder in xml_files:

        corpus_folder = CHILDESCorpusReader(xml_folder, '.*.xml')

        # Stores the data of the corpora
        corpus_data = corpus_folder.corpus(corpus_folder.fileids())

        # Prints out corpus & child information
        for file in corpus_folder.fileids():

            # Stores all the sentences spoken by the speaker
            corpus_sents = corpus_folder.sents(file, speaker=speaker)

            # Stores all the sentences, words in stem form
            corpus_sents_stems = corpus_folder.sents(file, speaker=speaker,
                                                stem=True)

            corpus_participant = corpus_folder.participants(file)

            # Searches through each sentence for a match
            for stem_sents, sents in zip(corpus_sents_stems, corpus_sents):
def main():
    """Demonstrate the main CHILDESCorpusReader queries on a local corpus.

    Builds a reader over the XML files under the hard-coded
    ``corpora/CHILDES/Eng-NA-MOR/Valian/`` directory and prints, in order:
    file ids, file count, corpus metadata, participant data, words,
    sentences, tagged output, stems, replacements, relations, ages, MLU
    and per-file word/sentence counts. Returns nothing.

    All output goes through the ``print()`` function; the previous mix of
    ``print x`` statements and ``print(...)`` calls could not be parsed by
    Python 3 and printed tuples under Python 2.
    """
    nltk_download_dir = '/home/rodriguesfas/nltk_data'

    # NOTE: despite the ``brown_`` prefix, the path points at the Valian corpus.
    brown_corpus_root = os.path.join(nltk_download_dir,
                                     'corpora/CHILDES/Eng-NA-MOR/Valian/')

    brown_corpus = CHILDESCorpusReader(root=brown_corpus_root, fileids='.+xml')

    # Show the files.
    print(brown_corpus.fileids())

    # Count the number of files.
    print(len(brown_corpus.fileids()))

    # Show the properties of the files.
    corpus_data = brown_corpus.corpus(brown_corpus.fileids())
    print(corpus_data[0]['Lang'])

    for key in sorted(corpus_data[0].keys()):
        print(key, ": ", corpus_data[0][key])

    # Printing information about the corpus participants.
    # The most common participant codes are 'CHI' (target child),
    # 'MOT' (mother) and 'INV' (investigator).
    corpus_participants = brown_corpus.participants(brown_corpus.fileids())
    for this_corpus_participants in corpus_participants[:2]:
        for key in sorted(this_corpus_participants.keys()):
            dct = this_corpus_participants[key]
            print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

    # Printing words.
    print(brown_corpus.words('01a.xml'))

    # Printing sentences.
    print(brown_corpus.sents('01a.xml'))

    # You can specify the participants with the argument speaker.
    print(brown_corpus.words('01a.xml', speaker=['INV']))
    print(brown_corpus.words('01a.xml', speaker=['MOT']))
    print(brown_corpus.words('01a.xml', speaker=['CHI']))

    # tagged_words() and tagged_sents() return the usual (word, pos) tuple lists.
    # POS tags in the CHILDES are automatically assigned by MOR and POST programs (MacWhinney, 2000).
    print(brown_corpus.tagged_words('01a.xml')[:30])

    print(brown_corpus.tagged_sents('01a.xml')[:10])

    # When the argument stem is true, the word stems (e.g., 'is' -> 'be-3PS') are used instead of the original words.
    print(brown_corpus.words('01a.xml')[:30])
    print(brown_corpus.words('01a.xml', stem=True)[:30])

    # When the argument replace is true, the replaced words are used instead of the original words.
    print(brown_corpus.words('01a.xml', speaker='CHI')[247])
    print(brown_corpus.words('01a.xml', speaker='CHI', replace=True)[247])

    # When the argument relation is true, the relational relationships in the sentence are returned.
    # See Sagae et al. (2010) for details of the relational structure adopted in the CHILDES.
    print(brown_corpus.words('01a.xml', relation=True)[:10])

    # Printing age. When the argument month is true, the age information in the CHILDES format is converted into the number of months.
    print(brown_corpus.age())
    print(brown_corpus.age('01a.xml'))
    print(brown_corpus.age('01a.xml', month=True))

    # Printing MLU. The criteria for the MLU computation is broadly based on Brown (1973).
    print(brown_corpus.MLU())
    print(brown_corpus.MLU('01a.xml'))

    # Basic stuff
    # Count the number of words and sentences of each file.

    for this_file in brown_corpus.fileids()[:6]:
        # Look the file's metadata up once instead of once per printed field.
        file_data = brown_corpus.corpus(this_file)[0]
        print(file_data['Corpus'], file_data['Id'])
        print("num of words: %i" % len(brown_corpus.words(this_file)))
        print("num of sents: %i" % len(brown_corpus.sents(this_file)))
Ejemplo n.º 4
0
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

# Locate the corpus directory through NLTK's data lookup.
corpus_root = nltk.data.find(
    '/home/rodriguesfas/Mestrado/workspace/specana.prototype/dataset/corpora/childes/data/eng-uk/Belfast/'
)

# Reader over the XML files inside the 'Barbara' sub-directory.
ccr = CHILDESCorpusReader(corpus_root, 'Barbara/.*.xml')

# print() calls replace the Python 2 print statements, which are a
# SyntaxError on Python 3 and were mixed with print() calls further down.
print(ccr.fileids())

# Count the number of files.
print(len(ccr.fileids()))

# Printing properties of the corpus files.
corpus_data = ccr.corpus(ccr.fileids())
print(corpus_data[0]['Lang'])

for key in sorted(corpus_data[0].keys()):
    print(key, ": ", corpus_data[0][key])

# Printing information about the corpus participants. The most common codes for the
# participants are 'CHI' (target child), 'MOT' (mother) and 'INV' (investigator).
corpus_participants = ccr.participants(ccr.fileids())

for this_corpus_participants in corpus_participants[:2]:
    for key in sorted(this_corpus_participants.keys()):
        dct = this_corpus_participants[key]
        print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

# printing words.
Ejemplo n.º 5
0
# CHILDES Corpus

- NLTK can read the XML format of the CHILDES corpus.
- The CHILDES XML data is available at [https://childes.talkbank.org/data-xml/](https://childes.talkbank.org/data-xml/)


import nltk
from nltk.corpus.reader import CHILDESCorpusReader
r = CHILDESCorpusReader('../../../Corpus/CHILDES/Chinese/Chang1_xml/', '.*.xml')

# Bare expression: only displays the file ids in an interactive session.
r.fileids()

# Basic profile of each of the first five XML files.
for f in r.fileids()[:5]:
    cur_corpus = r.corpus(f)[0]
    profile = [cur_corpus[field]
               for field in ('Corpus', 'PID', 'ActivityType', 'Date')]
    print(*profile)
    word_count = len(r.words(f))
    print("Num of Words: {}".format(word_count))
    sent_count = len(r.sents(f))
    print("Num of Sents: {}".format(sent_count))

# Participants of the file at index 10 (first entry of the returned list).
r.participants(fileids=r.fileids()[10])[0]

all_speakers = r.participants()

for speakers_cur_file in all_speakers[:5]:
    print("====")
    for spid in speakers_cur_file.keys():
        cur_spid_data = speakers_cur_file[spid]