Beispiel #1
0
def main(args):
  """Write every sentence of a CHILDES corpus to stdout via ``print_conllu``.

  Args:
    args: Options object; ``args.dir`` is handed to ``CHILDESCorpusReader``
      as the corpus root and ``args.glob`` as the file-id pattern.
  """
  corpus = CHILDESCorpusReader(args.dir, args.glob)
  for fileid in corpus.fileids():
    for sentence in corpus.words(fileid, relation=True):
      try:
        print_conllu(sentence, sys.stdout)
      except Exception:
        # Best effort: some sentences have incomplete parses and cannot be
        # rendered, so they are skipped.  Catching Exception rather than
        # using a bare ``except`` lets KeyboardInterrupt and SystemExit
        # still stop the run.
        continue
Beispiel #2
0
def main(nltk_download_dir='/home/rodriguesfas/nltk_data'):
    """Demonstrate basic CHILDESCorpusReader queries on the Valian corpus.

    Prints, in order: the first file ids, ages and MLU values for two
    transcripts, the participant metadata of one transcript, and several
    views (plain, tagged, per-speaker, stemmed) of its words and sentences.

    Args:
        nltk_download_dir: Directory containing the downloaded NLTK data;
            the corpus is read from ``corpora/CHILDES/Eng-NA-MOR/Valian/``
            below it.  Defaults to the path the snippet originally
            hard-coded.
    """
    corpus_root = os.path.join(nltk_download_dir,
                               'corpora/CHILDES/Eng-NA-MOR/Valian/')
    corpus = CHILDESCorpusReader(root=corpus_root, fileids='.+xml')

    print(corpus.fileids()[:5])

    fileids = ['02b.xml', '03a.xml']

    # Age and MLU for both files at once ...
    print(corpus.age(fileids=fileids))
    print(corpus.MLU(fileids=fileids))

    # ... and keyed by file id, one query per file.
    print({fid: corpus.age(fileids=fid)[0] for fid in fileids})
    print({fid: corpus.MLU(fileids=fid)[0] for fid in fileids})

    # Same ages, requested with month=True.
    print({
        fid: corpus.age(fileids=fid, month=True)[0]
        for fid in fileids
    })

    metadata = corpus.participants(fileids='03a.xml')[0]
    print(metadata)

    print('words:', corpus.words(fileids='03a.xml')[:7])
    print('sents:', corpus.sents(fileids='03a.xml')[:3])

    print('tagged words:', corpus.tagged_words(fileids='03a.xml')[:7])
    print('tagged sents:', corpus.tagged_sents(fileids='03a.xml')[:3])

    # Restrict the sentences to a single speaker code.
    print("Adam:", '\t', corpus.sents(fileids='03a.xml',
                                      speaker='CHI')[:5])
    print("Mother:", corpus.sents(fileids='03a.xml', speaker='MOT')[:2])

    mother_unstemmed = corpus.sents(fileids='03a.xml', speaker='MOT')

    mother_stemmed = corpus.sents(fileids='03a.xml',
                                  speaker='MOT',
                                  stem=True)
    # Keep only the part of each stemmed word before the first '-'.
    mother_root = [[
        stemmed_word.split('-')[0] for stemmed_word in stemmed_sent
    ] for stemmed_sent in mother_stemmed]

    print('Raw:\t\t', mother_unstemmed[:2])
    print('Stemmed:\t', mother_stemmed[:2])
    print('Roots only:\t', mother_root[:2])
Beispiel #3
0
def scandirs(path, d):
    """Group the words spoken by 'MOT' in each sub-directory by word length.

    Every directory directly below ``path`` is printed, converted to a name
    with ``sentence_cut`` and read through ``CHILDESCorpusReader`` using the
    file pattern ``<name>/.*.xml``.  Each word other than ``'xxx'`` is
    appended to ``d[len(word)]``.

    Args:
        path: Corpus location; resolved with ``nltk.data.find`` for the
            reader root and globbed directly to list its entries.
        d: Mapping from word length to a list of words, mutated in place
            (presumably a ``defaultdict(list)`` -- confirm with the caller).
    """
    root = nltk.data.find(path)
    for entry in glob.glob(os.path.join(path, '*')):
        if not os.path.isdir(entry):
            continue
        print(entry)
        # sentence_cut yields the child's name, i.e. the directory name.
        pattern = sentence_cut(entry)
        pattern += '/.*.xml'
        reader = CHILDESCorpusReader(root, pattern)
        # Only the parent's words are of interest here.
        for word in reader.words(speaker='MOT'):
            if word == 'xxx':
                continue
            d[len(word)].append(word)
Beispiel #4
0
def scandirs(path, part_ofspeech, dependencies):
    """Feed the 'MOT' tokens of each sub-directory to the two collectors.

    Every directory directly below ``path`` is printed, converted to a name
    with ``sentence_cut`` and read through ``CHILDESCorpusReader`` using the
    file pattern ``<name>/.*.xml`` (also printed).  Each token of each
    sentence goes to ``partOfSpeech``; tokens with at least three elements
    are first passed to ``depen``.

    Args:
        path: Corpus location; resolved with ``nltk.data.find`` for the
            reader root and globbed directly to list its entries.
        part_ofspeech: Accumulator forwarded to ``partOfSpeech``.
        dependencies: Accumulator forwarded to ``depen``.
    """
    root = nltk.data.find(path)
    for entry in glob.glob(os.path.join(path, '*')):
        if not os.path.isdir(entry):
            continue
        print(entry)
        # sentence_cut yields the child's name, i.e. the directory name.
        pattern = sentence_cut(entry)
        pattern += '/.*.xml'
        print(pattern)
        reader = CHILDESCorpusReader(root, pattern)
        # Only the parent's words, requested with relation=True.
        sentences = reader.words(relation=True, speaker='MOT')
        for sentence in sentences:
            for token in sentence:
                # Shorter tokens are not sent to depen
                # (presumably they lack the relation element -- confirm).
                if len(token) >= 3:
                    depen(token, dependencies)
                partOfSpeech(token, part_ofspeech)
Beispiel #5
0
    t = time.time()  # Initialization
    output = []
    d = cmudict.dict()
    parser = English()

    # get corpus directories
    corpus_root_xml = nltk.data.find(
        'C:\\Users\\James\\PycharmProjects\\FIT3036\\xml')
    corpus_root_plain = 'C:\\Users\\James\\PycharmProjects\\FIT3036\\plain_text'

    # get all xml and plain text files from specified directories
    corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
    corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')

    # get all the words spoken by a child
    all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]

    # init wordnet and language model
    corpus_ic = wn.ic(corpus_xml, True, 1.0)
    lm = LanguageModel(all_words)

    # collect all the features for each corpus
    for j in range(len(corpus_xml.fileids())):
        current_features = []  # init empty array to store features
        # Text initialization
        text_xml = corpus_xml.fileids()[j]
        text_plain = corpus_plain.fileids()[j]

        # list of words spoken by the child in lowercase
        child_words_xml = [
            w.lower() for w in corpus_xml.words(text_xml, speaker=['CHI'])
Beispiel #6
0
# Count the number of files (the value is only displayed in a notebook/REPL).
len(childes.fileids())
# Print properties of the corpus files.
corpus_data = childes.corpus(childes.fileids())
print(corpus_data[0]['Lang'])
for key in sorted(corpus_data[1].keys()):
    print(key, ":", corpus_data[1][key])
# Print participant information: CHI (target child), MOT (mother),
# INV (investigator).
# NOTE: the original author flagged the output of this loop as suspect.
corpus_participants = childes.participants(childes.fileids())
for this_corpus_participants in corpus_participants[3:5]:
    for key in sorted(this_corpus_participants.keys()):
        dct = this_corpus_participants[key]
        print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])
# First 100 words.
childes.words()[0:100]
# First 100 sentences.
childes.sents()[0:100]
# Use word stems (e.g., 'is' -> 'be-3PS') instead of the original words.
childes.words()[:30]
childes.words(stem=True)[:30]
# Age. When the argument month is true, the age information in the CHILDES
# format is converted into the number of months.
# The file id is given relative to the corpus root in both calls: a leading
# slash would make it an absolute path that ignores the root.
childes.age('Providence/Alex/ale01.xml')
childes.age('Providence/Alex/ale01.xml', month=True)
# Other stuff: count the words and sentences of the first ten files.
for this_file in providence.fileids()[:10]:
    print(providence.corpus(this_file)[0]['Corpus'], providence.corpus(this_file)[0]['Media'])
    print("num of words: %i" % len(providence.words(this_file)))
    print("num of sents: %i" % len(providence.sents(this_file)))

print(corpus_data[0]['Lang'])

for key in sorted(corpus_data[0].keys()):
    print(key, ": ", corpus_data[0][key])

# Print the participant information of the corpus. The most common codes for
# the participants are 'CHI' (target child), 'MOT' (mother) and 'INV'
# (investigator).
corpus_participants = ccr.participants(ccr.fileids())

for this_corpus_participants in corpus_participants[:2]:
    for key in sorted(this_corpus_participants.keys()):
        dct = this_corpus_participants[key]
        print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

# Print words.
print(ccr.words('Barbara/020409.xml'))

# Print sentences.
print(ccr.sents('Barbara/020409.xml'))

# You can specify the participants with the argument speaker.
print(ccr.words('Barbara/020409.xml', speaker=['INV']))
print(ccr.words('Barbara/020409.xml', speaker=['MOT']))
print(ccr.words('Barbara/020409.xml', speaker=['CHI']))

# tagged_words() and tagged_sents() return the usual lists of (word, pos)
# tuples. POS tags in the CHILDES are automatically assigned by the MOR and
# POST programs (MacWhinney, 2000).
print(ccr.tagged_words('Barbara/020409.xml')[:30])
print(ccr.tagged_sents('Barbara/020409.xml')[:10])

# When the argument stem is true, the word stems (e.g., 'is' -> 'be-3PS')
# are used instead of the original words (this snippet ends before
# demonstrating it).
def main(nltk_download_dir='/home/rodriguesfas/nltk_data'):
    """Run through the CHILDESCorpusReader query methods on the Valian corpus.

    Prints the file ids, corpus properties, participant information, words
    and sentences (plain, per speaker, tagged, stemmed, replaced, with
    relations), ages, MLU values and per-file word/sentence counts.

    Args:
        nltk_download_dir: Directory containing the downloaded NLTK data;
            the corpus is read from ``corpora/CHILDES/Eng-NA-MOR/Valian/``
            below it.  Defaults to the path the snippet originally
            hard-coded.
    """
    corpus_root = os.path.join(nltk_download_dir,
                               'corpora/CHILDES/Eng-NA-MOR/Valian/')

    corpus = CHILDESCorpusReader(root=corpus_root, fileids='.+xml')

    # Show the files.
    print(corpus.fileids())

    # Count the number of files.
    print(len(corpus.fileids()))

    # Show the properties of the files.
    corpus_data = corpus.corpus(corpus.fileids())
    print(corpus_data[0]['Lang'])

    for key in sorted(corpus_data[0].keys()):
        print(key, ": ", corpus_data[0][key])

    # Print the participant information of the corpus.
    # The most common codes for the participants are 'CHI' (target child),
    # 'MOT' (mother) and 'INV' (investigator).
    corpus_participants = corpus.participants(corpus.fileids())
    for this_corpus_participants in corpus_participants[:2]:
        for key in sorted(this_corpus_participants.keys()):
            dct = this_corpus_participants[key]
            print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

    # Print words.
    print(corpus.words('01a.xml'))

    # Print sentences.
    print(corpus.sents('01a.xml'))

    # You can specify the participants with the argument speaker.
    print(corpus.words('01a.xml', speaker=['INV']))
    print(corpus.words('01a.xml', speaker=['MOT']))
    print(corpus.words('01a.xml', speaker=['CHI']))

    # tagged_words() and tagged_sents() return the usual (word, pos) tuple
    # lists. POS tags in the CHILDES are automatically assigned by the MOR
    # and POST programs (MacWhinney, 2000).
    print(corpus.tagged_words('01a.xml')[:30])

    print(corpus.tagged_sents('01a.xml')[:10])

    # When the argument stem is true, the word stems (e.g., 'is' -> 'be-3PS')
    # are used instead of the original words.
    print(corpus.words('01a.xml')[:30])
    print(corpus.words('01a.xml', stem=True)[:30])

    # When the argument replace is true, the replaced words are used instead
    # of the original words.
    print(corpus.words('01a.xml', speaker='CHI')[247])
    print(corpus.words('01a.xml', speaker='CHI', replace=True)[247])

    # When the argument relation is true, the relational relationships in
    # the sentence are returned. See Sagae et al. (2010) for details of the
    # relational structure adopted in the CHILDES.
    print(corpus.words('01a.xml', relation=True)[:10])

    # Print age. When the argument month is true, the age information in the
    # CHILDES format is converted into the number of months.
    print(corpus.age())
    print(corpus.age('01a.xml'))
    print(corpus.age('01a.xml', month=True))

    # Print MLU. The criteria for the MLU computation is broadly based on
    # Brown (1973).
    print(corpus.MLU())
    print(corpus.MLU('01a.xml'))

    # Basic stuff: count the number of words and sentences of each file.
    for this_file in corpus.fileids()[:6]:
        print(
            corpus.corpus(this_file)[0]['Corpus'],
            corpus.corpus(this_file)[0]['Id'])
        print("num of words: %i" % len(corpus.words(this_file)))
        print("num of sents: %i" % len(corpus.sents(this_file)))
Beispiel #9
0

import nltk
from nltk.corpus.reader import CHILDESCorpusReader
r = CHILDESCorpusReader('../../../Corpus/CHILDES/Chinese/Chang1_xml/', '.*.xml')

r.fileids()

# Basic profile of the first five transcripts: header fields, then sizes.
profile_fields = ('Corpus', 'PID', 'ActivityType', 'Date')
for fileid in r.fileids()[:5]:
    header = r.corpus(fileid)[0]
    print(*[header[field] for field in profile_fields])
    word_total = len(r.words(fileid))
    print("Num of Words: {}".format(word_total))
    sent_total = len(r.sents(fileid))
    print("Num of Sents: {}".format(sent_total))

# Participants of a single transcript (the file at index 10).
r.participants(fileids=r.fileids()[10])[0]

all_speakers = r.participants()

# Dump every participant attribute for the first five transcripts.
for participants_in_file in all_speakers[:5]:
    print("====")
    for speaker_id, attributes in participants_in_file.items():
        print(speaker_id, ": ", list(attributes.items()))

r.words('01.xml')
# Sentences of transcript 01.xml restricted to the speaker code 'EXP'.
exp_sentences = r.sents('01.xml', speaker='EXP')
print(exp_sentences)
lemmatizer = WordNetLemmatizer()

# Placeholder tokens (runs of 'x' or 'y') that are dropped from the output.
_PLACEHOLDER_TOKENS = frozenset([
    'xxxxxxxx', 'xxxxxxx', 'xxxxxx', 'xxxxx', 'xxxx', 'xxx', 'xx',
    'yyy', 'yy',
])

file_count = 0
# Change to the local directory where the combined text files should be
# stored; keep "corpus_children.txt" and the 'w+' mode at the end.
# The context manager closes the file even if reading a transcript fails.
with open(
        "/Users/tessacharlesworth/Desktop/Embeddings/Clean Data/CHILDES clean text/corpus_children.txt",
        'w+'
) as writefile_children:
    # Change to the local directory where the raw text files are stored.
    for (root, _dirs, files) in os.walk(
            "/Users/tessacharlesworth/Desktop/Embeddings/Raw Data/CHILDES raw text/",
            topdown=True
    ):
        for filename in files:
            if not filename.endswith('.xml'):
                continue
            print(filename)
            child_words = childes.words(os.path.join(root, filename),
                                        speaker=['CHI'],
                                        replace=True)
            # One membership test per word instead of nine filter passes.
            kept_words = [
                word for word in child_words
                if word not in _PLACEHOLDER_TOKENS
            ]
            # NOTE(review): nothing is written between files, so the last
            # word of one transcript runs into the first word of the next.
            # Kept as in the original; confirm whether a separator is wanted.
            writefile_children.write(' '.join(kept_words).lower())
            file_count += 1