# Root directory containing the local CHILDES corpora.
corpus_root = '/Users/callab/Documents/Projects/CHILDES/Corpora/'


def remove_non_ascii_1(text):
    """Return *text* with every non-ASCII character (ord >= 128) removed."""
    return ''.join(i for i in text if ord(i) < 128)


# providence = CHILDESCorpusReader(corpus_root, 'childes_corpora/Providence/.*.xml')
childes = CHILDESCorpusReader(corpus_root, '.*.xml')

# Some useful codes.
# Display the file names for the corpus.  NOTE: the original bare
# expressions only echo in a REPL/notebook; print() makes this a real script.
print(childes.fileids())

# Count the number of files.
print(len(childes.fileids()))

# Print properties of the corpus files.
corpus_data = childes.corpus(childes.fileids())
print(corpus_data[0]['Lang'])
# NOTE(review): metadata above comes from corpus_data[0] but the keys below
# from corpus_data[1] — the original author flagged this print as wrong;
# confirm which file's metadata is intended.
for key in sorted(corpus_data[1].keys()):
    print(key, ":", corpus_data[1][key])

# Print participant information: CHI (target child), MOT (mother),
# INV (investigator).
corpus_participants = childes.participants(childes.fileids())
for this_corpus_participants in corpus_participants[3:5]:
    for key in sorted(this_corpus_participants.keys()):
        dct = this_corpus_participants[key]
        print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

# Print words.
print(childes.words()[0:100])

# Print sentences.
print(childes.sents()[0:100])

# Use word stems (e.g., 'is' -> 'be-3PS') instead of the original words.
# Iterates through the directory xml_files = xtract_XML_files(directory_path) # Creates a CSV file with open(directory + '_' + file_name + '.csv', 'w') as csvfile: fieldnames = ['Corpus', 'File', 'Name', 'Verb', 'Age', 'Sent', 'Syntatic Object', 'Event or Object'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for xml_folder in xml_files: corpus_folder = CHILDESCorpusReader(xml_folder, '.*.xml') # Stores the data of the corpra corpus_data = corpus_folder.corpus(corpus_folder.fileids()) # Prints out corpus & child information for file in corpus_folder.fileids(): # Stores all the sentences spoken by the speaker corpus_sents = corpus_folder.sents(file, speaker=speaker) # Stores all the sentences, words in stem form corpus_sents_stems = corpus_folder.sents(file, speaker=speaker, stem=True) corpus_participant = corpus_folder.participants(file) # Searches through each sentence for a match for stem_sents, sents in zip(corpus_sents_stems, corpus_sents):
def main():
    """Explore a local CHILDES XML corpus (Valian) and print basic stats.

    Lists the corpus files, prints file metadata and participant
    information, shows words/sentences in several variants (by speaker,
    tagged, stemmed, replaced, with dependency relations), then prints
    ages, MLU values, and per-file word/sentence counts.
    """
    nltk_download_dir = '/home/rodriguesfas/nltk_data'
    brown_corpus_root = os.path.join(
        nltk_download_dir, 'corpora/CHILDES/Eng-NA-MOR/Valian/')
    brown_corpus = CHILDESCorpusReader(root=brown_corpus_root, fileids='.+xml')

    # List the corpus files.
    print(brown_corpus.fileids())

    # Count the number of files.
    print(len(brown_corpus.fileids()))

    # Print properties of the corpus files.
    corpus_data = brown_corpus.corpus(brown_corpus.fileids())
    print(corpus_data[0]['Lang'])
    for key in sorted(corpus_data[0].keys()):
        print(key, ": ", corpus_data[0][key])

    # Print participant information.  The most common participant codes are
    # 'CHI' (target child), 'MOT' (mother) and 'INV' (investigator).
    corpus_participants = brown_corpus.participants(brown_corpus.fileids())
    for this_corpus_participants in corpus_participants[:2]:
        for key in sorted(this_corpus_participants.keys()):
            dct = this_corpus_participants[key]
            print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

    # Printing words.
    print(brown_corpus.words('01a.xml'))

    # Printing sentences.
    print(brown_corpus.sents('01a.xml'))

    # You can specify the participants with the argument speaker.
    print(brown_corpus.words('01a.xml', speaker=['INV']))
    print(brown_corpus.words('01a.xml', speaker=['MOT']))
    print(brown_corpus.words('01a.xml', speaker=['CHI']))

    # tagged_words() and tagged_sents() return the usual (word, pos) tuple
    # lists.  POS tags in CHILDES are automatically assigned by the MOR and
    # POST programs (MacWhinney, 2000).
    print(brown_corpus.tagged_words('01a.xml')[:30])
    print(brown_corpus.tagged_sents('01a.xml')[:10])

    # When the argument stem is true, the word stems (e.g. 'is' -> 'be-3PS')
    # are used instead of the original words.
    print(brown_corpus.words('01a.xml')[:30])
    print(brown_corpus.words('01a.xml', stem=True)[:30])

    # When the argument replace is true, the replaced words are used instead
    # of the original words.
    print(brown_corpus.words('01a.xml', speaker='CHI')[247])
    print(brown_corpus.words('01a.xml', speaker='CHI', replace=True)[247])

    # When the argument relation is true, the dependency relations in the
    # sentence are returned.  See Sagae et al. (2010) for details of the
    # relational structure adopted in CHILDES.
    print(brown_corpus.words('01a.xml', relation=True)[:10])

    # Printing age.  When the argument month is true, the age information in
    # the CHILDES format is converted into the number of months.
    print(brown_corpus.age())
    print(brown_corpus.age('01a.xml'))
    print(brown_corpus.age('01a.xml', month=True))

    # Printing MLU.  The criteria for the MLU computation are broadly based
    # on Brown (1973).
    print(brown_corpus.MLU())
    print(brown_corpus.MLU('01a.xml'))

    # Basic stuff: count the number of words and sentences of each file.
    for this_file in brown_corpus.fileids()[:6]:
        print(brown_corpus.corpus(this_file)[0]['Corpus'],
              brown_corpus.corpus(this_file)[0]['Id'])
        print("num of words: %i" % len(brown_corpus.words(this_file)))
        print("num of sents: %i" % len(brown_corpus.sents(this_file)))
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

# Root of the CHILDES eng-uk/Belfast corpus inside the local dataset tree.
corpus_root = nltk.data.find(
    '/home/rodriguesfas/Mestrado/workspace/specana.prototype/dataset/corpora/childes/data/eng-uk/Belfast/'
)
# Reader restricted to the "Barbara" subcorpus XML files.
ccr = CHILDESCorpusReader(corpus_root, 'Barbara/.*.xml')

# List the corpus files.
print(ccr.fileids())

# Count the number of files.
print(len(ccr.fileids()))

# Print properties of the corpus files.
corpus_data = ccr.corpus(ccr.fileids())
print(corpus_data[0]['Lang'])
for key in sorted(corpus_data[0].keys()):
    print(key, ": ", corpus_data[0][key])

# Print participant information.  The most common participant codes are
# 'CHI' (target child), 'MOT' (mother) and 'INV' (investigator).
corpus_participants = ccr.participants(ccr.fileids())
for this_corpus_participants in corpus_participants[:2]:
    for key in sorted(this_corpus_participants.keys()):
        dct = this_corpus_participants[key]
        print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

# printing words.
# CHILDES Corpus - NLTK can deal with the xml format of the CHILDES corpus - CHILDES xml is available at [https://childes.talkbank.org/data-xml/](https://childes.talkbank.org/data-xml/) import nltk from nltk.corpus.reader import CHILDESCorpusReader r = CHILDESCorpusReader('../../../Corpus/CHILDES/Chinese/Chang1_xml/', '.*.xml') r.fileids() # print basic profile for each xml for f in r.fileids()[:5]: cur_corpus = r.corpus(f)[0] print(cur_corpus['Corpus'], cur_corpus['PID'], cur_corpus['ActivityType'], cur_corpus['Date']) print("Num of Words: {}".format(len(r.words(f)))) print("Num of Sents: {}".format(len(r.sents(f)))) # participants r.participants(fileids=r.fileids()[10])[0]# first file participants all_speakers = r.participants() for speakers_cur_file in all_speakers[:5]: print("====") for spid in speakers_cur_file.keys(): cur_spid_data = speakers_cur_file[spid]