def get_childes_files(root_location, file_name):
    """Fetch a CHILDES XML file and parse it into utterances.

    Each utterance carries its speaker in position ``[0]``.

    Args:
        root_location: Resource name of the corpus root, resolved with
            ``nltk.data.find``.
        file_name: File identifier(s) passed to ``CHILDESCorpusReaderX``.

    Returns:
        The value of ``sents()`` on the reader. The same object is also
        stored in the module-level ``ordered_utterance_list``.
    """
    global ordered_utterance_list
    reader = CHILDESCorpusReaderX(nltk.data.find(root_location), file_name)
    ordered_utterance_list = reader.sents()
    return ordered_utterance_list
Ejemplo n.º 2
0
def get_childes_stemmed(root_location, file_name):
    """Fetch a CHILDES XML file and return its utterances with stemmed tokens.

    The corpus root is resolved with ``nltk.data.find`` restricted to the
    current directory (``paths=['.']``). Every token after the first element
    of each utterance is replaced, in place, by its English Snowball stem.

    Args:
        root_location: Resource name of the corpus root.
        file_name: File identifier(s) passed to ``CHILDESCorpusReaderX``.

    Returns:
        The value of ``sents()`` on the reader, with tokens stemmed. The same
        object is also stored in the module-level ``ordered_utterance_list``.
    """
    global ordered_utterance_list
    stemmer = SnowballStemmer("english")
    corpus_root = nltk.data.find(root_location, paths=['.'])
    file_setup = CHILDESCorpusReaderX(corpus_root, file_name)
    ordered_utterance_list = file_setup.sents()
    for utterance in ordered_utterance_list:
        # Index 0 is left untouched (presumably the speaker label, as in the
        # sibling get_childes_files). Everything after it is stemmed,
        # including the final token: the previous range(1, len - 1) stopped
        # one short and left the last word of every utterance unstemmed.
        # NOTE(review): relies on sents() yielding mutable lists whose
        # in-place changes persist - confirm against CHILDESCorpusReaderX.
        utterance[1:] = [stemmer.stem(token) for token in utterance[1:]]
    return ordered_utterance_list
Ejemplo n.º 3
0
def get_childes_files(root_location, file_name):
    """Fetch a CHILDES XML file and parse it into utterances.

    Each utterance carries its speaker in position ``[0]``. The corpus root
    is resolved with ``nltk.data.find`` restricted to the current directory.

    When the module-level flag ``extractRoles`` is truthy, participant
    records are also written to the module-level ``wrf`` handle as
    comma-separated rows: one row per participant, prefixed with
    ``file_name``. A header row (``docId`` plus the sorted field names of the
    first record seen) is written once, guarded by the module-level
    ``writeRoleHeader`` flag, which is then cleared.

    Args:
        root_location: Resource name of the corpus root.
        file_name: File identifier(s) passed to ``CHILDESCorpusReaderX``.

    Returns:
        The value of ``sents()`` on the reader. The same object is also
        stored in the module-level ``ordered_utterance_list``.
    """
    global ordered_utterance_list, writeRoleHeader
    reader = CHILDESCorpusReaderX(
        nltk.data.find(root_location, paths=['.']), file_name
    )
    if extractRoles:
        # Only the first two entries returned by participants() are exported.
        for participant_map in reader.participants()[:2]:
            for speaker_key in sorted(participant_map):
                record = participant_map[speaker_key]
                columns = sorted(record)
                if writeRoleHeader:
                    wrf.write('docId,')
                    wrf.write(','.join(columns) + '\n')
                    writeRoleHeader = False
                wrf.write(file_name + ',')
                # Values are joined as-is: no CSV quoting or escaping.
                wrf.write(','.join([record[col] for col in columns]) + '\n')
    ordered_utterance_list = reader.sents()
    return ordered_utterance_list
Ejemplo n.º 4
0
def get_childes_files(root_location, file_name):
    """Open a CHILDES XML file and return the module-level ``info_list``.

    A ``CHILDESCorpusReaderX`` is built for the file and handed to
    ``get_child_info``; the global ``info_list`` is then returned.
    NOTE(review): ``info_list`` is presumably populated by
    ``get_child_info`` - confirm against its definition.

    Args:
        root_location: Resource name of the corpus root, resolved with
            ``nltk.data.find``.
        file_name: File identifier(s) passed to ``CHILDESCorpusReaderX``.

    Returns:
        The module-level ``info_list`` as it stands after the
        ``get_child_info`` call.
    """
    global info_list
    reader = CHILDESCorpusReaderX(nltk.data.find(root_location), file_name)
    get_child_info(reader)
    return info_list