Beispiel #1
0
def test_instantiate_reader(classmethod, arg):
    """`read_chat` and the from_x classmethods work the same way."""
    reader_from_cls = classmethod(arg, encoding="utf-8")
    reader_from_func = read_chat(REMOTE_EVE_FILE_PATH_1, encoding="utf-8")

    # Together, "header" and "index_to_tiers" cover the entire data file.
    header_from_cls = list(reader_from_cls.headers().values())[0]
    header_from_func = list(reader_from_func.headers().values())[0]

    tiers_from_cls = list(reader_from_cls.index_to_tiers().values())[0]
    tiers_from_func = list(reader_from_func.index_to_tiers().values())[0]

    assert header_from_cls == header_from_func
    assert len(tiers_from_cls) == len(tiers_from_func)

    # Compare tier-by-tier in index order; on mismatch, report which
    # indices disagreed before re-raising so pytest still fails.
    paired = zip(sorted(tiers_from_cls.items()), sorted(tiers_from_func.items()))
    for (i_c, tier_c), (i_r, tier_r) in paired:
        try:
            assert tier_c == tier_r
        except AssertionError:
            print("i_c:", i_c, "i_r:", i_r)
            raise
Beispiel #2
0
import pylangacq as pla
import os

# Work from the directory holding the CallHome English transcripts
# (hard-coded local path — adjust for other machines).
os.chdir('/home/lucas/PycharmProjects/Papers_with_code/data/CALL_HOME/eng')
print(os.getcwd())

# Parse one CHAT transcript and print its part-of-speech tag data.
CHAT = pla.read_chat('0638.cha')
print(CHAT.part_of_speech_tags())

Beispiel #3
0
def eve_all_files():
    """Build a reader over every remote Eve data file."""
    reader = read_chat(REMOTE_EVE_FILE_PATH_ALL_FILES, encoding="utf-8")
    return reader
Beispiel #4
0
def test_update(eve_one_file):
    """Updating a one-file reader with another reader yields two files."""
    eve_one_file.update(read_chat(REMOTE_EVE_FILE_PATH_2))
    assert len(eve_one_file) == 2
Beispiel #5
0
def eve_one_file():
    """Build a reader over the single local Eve data file."""
    reader = read_chat(LOCAL_EVE_PATH, encoding="utf-8")
    return reader
Beispiel #6
0
import pylangacq
import glob

# Collect all CHAT files, excluding two transcripts known to fail parsing.
files = glob.glob('cha/*')
broken_files = ('cha/roberts2.cha', 'cha/fusser11.cha')
good_files = tuple(f for f in files if f not in broken_files)

# HACK: loading takes a long time, so use fewer files for testing
# good_files = good_files[:2]

# FIX: the original used Python 2 print statements, inconsistent with the
# rest of this codebase; converted to Python 3 print() calls.
print("Loading corpus")
corpus = pylangacq.read_chat(*good_files)

# Rank word trigrams by frequency and show the 30 most common.
trigrams = corpus.word_ngrams(3)
print("Most popular trigrams:")
trigram_items_by_freq = sorted(trigrams.items(),
                               key=lambda item: item[1],
                               reverse=True)
for trigram, freq in trigram_items_by_freq[:30]:
    print(trigram, freq)

# Show a small sample of sentences, space-joined.
print("Some sentences:")
for sentence in corpus.sents()[:10]:
    print(u' '.join(sentence))
Beispiel #7
0
def test_read_chat_wrong_filename_type():
    """A non-string filename argument raises ValueError."""
    bad_filename = 42
    with pytest.raises(ValueError):
        read_chat(bad_filename)
Beispiel #8
0
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]


PATH = os.getcwd()

# Characters to strip from every word (loaded from a JSON list).
with open(PATH + '\\data\\not_char.json', 'r') as fp:
    not_char = json.load(fp)

sentences = []

# Clean every sentence of every CHAT file in the data directory.
for file in os.listdir(PATH + '\\data\\CHAT_Files'):
    print("processing file : " + file)
    if not file.endswith('.cha'):
        continue
    chat_data = pyl.read_chat(PATH + '\\data\\CHAT_Files\\' + file)

    for sen in chat_data.sents():
        kept_words = []
        for word in sen:
            w = word.lower()
            for nc in not_char:
                w = w.replace(nc, '')
            # Keep the word unless it became empty or is made up of
            # nothing but 'x' characters (anonymized/unintelligible tokens).
            if w != '' and set(w) != {'x'}:
                kept_words.append(w)
        tsen = ' '.join(kept_words)

        if tsen != '':
            sentences.append(tsen)
Beispiel #9
0
# Destination for the list of transcripts used — presumably written
# later in this file; not written in this chunk.
OUTPUT_PATHS = 'intermediate/childes_transcripts_used.csv'

# Age window (in months) for inclusion. From Clark (1973).
MIN_AGE = 13
MAX_AGE = 30

# Each line of the manifest is a transcript path relative to CHILDES.
with open('data/all_corpora_paths.txt', 'r') as f:
    filenames = [line.strip() for line in f]
    filepaths = [os.path.join(CHILDES, fname) for fname in filenames]

# Filter by age and no parsing errors.
good_filepaths = []
for i, fpath in enumerate(filepaths):
    try:
        print("Pre-reading %d/%d" % (i + 1, len(filepaths)))
        reader = pla.read_chat(fpath)

        # age(months=True) appears to return a dict keyed by filename;
        # with a single file read, take the sole value. TODO confirm.
        age = reader.age(months=True)
        age = list(age.values())[0]

        if MIN_AGE <= age <= MAX_AGE:
            good_filepaths.append(fpath)

    # Best-effort: a transcript that fails to read/parse is reported
    # and skipped rather than aborting the whole pre-read pass.
    except Exception as e:
        print("Warning: failed to read %s" % fpath)
        print(e)
        print()

# Count nouns.
noun_counts = {}
Beispiel #10
0
%%R -i age_mlu_data
age_mlu_data %>%
ggplot(aes(Age, MLU)) +
geom_point(size=2) +
geom_smooth(method="lm") +
labs(x="Child Age(Months)",y="Mean Length of Utterances (MLU)")

## CHA file

- Fantastic package for CHA files: [PyLangAcq](http://pylangacq.org/)

import pylangacq as pla
# BUG FIX: the module is imported under the alias `pla`, so the bare name
# `pylangacq` used here originally raised NameError.
pla.__version__  # show version number

# Read every CHAT transcript of the NCCU Taiwan Mandarin corpus.
nccu = pla.read_chat('../../../Corpus/NCCUTaiwanMandarin/transcript/*.cha')

nccu.number_of_files()

print('Corpus Size:', len(nccu.words()))

# Peek at one (filename, header) entry of the corpus headers.
all_headers = nccu.headers()
#all_headers[list(all_headers.keys())[0]]
list(all_headers.items())[0]

# Most frequent words and word trigrams.
nccu.word_frequency().most_common(5)
nccu.word_ngrams(n=3).most_common(10)

# Print the first ten sentences, space-joined.
for line in [' '.join(sent) for sent in nccu.sents()[:10]]:
    print(line)
Beispiel #11
0
library(dplyr)

%%R -i age_mlu_data
age_mlu_data %>%
ggplot(aes(Age, MLU)) +
geom_point(size=2) +
geom_smooth(method="lm") +
labs(x="Child Age(Months)",y="Mean Length of Utterances (MLU)")

## CHA file

- Fantastic package for CHA files: [PyLangAcq](http://pylangacq.org/)

import pylangacq as pla
pla.__version__  # show version number

# Read every CHAT transcript under the demo NCCU corpus directory.
nccu = pla.read_chat(DEMO_DATA_ROOT+'/CHILDES_NCCU/transcript/*.cha')

nccu.number_of_files()

print('Corpus Size:', len(nccu.words()))

# headers() returns a dict (apparently keyed by filename); inspect
# its first (key, value) pair.
all_headers= nccu.headers()
#all_headers[list(all_headers.keys())[0]]
list(all_headers.items())[0]

# Most frequent words and word trigrams.
nccu.word_frequency().most_common(5)
nccu.word_ngrams(n=3).most_common(10)

# Print the first ten sentences, space-joined.
for line in [' '.join(sent) for sent in nccu.sents()[:10]]:
    print(line)
def getFile(filepath):
    """Parse the CHAT file at *filepath* and return the reader object."""
    return pla.read_chat(filepath)
# FIX: removed the duplicated `from pprint import pprint` and grouped
# imports stdlib / third-party / local per PEP 8.
import os
from pprint import pprint

import numpy as np
import pylangacq as pla
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder

from importChatFile import getFile
from posTagging import posTagsFromObject, posTagsFromFile, posTrigrams

# Read every training transcript at once (glob pattern).
train_files = pla.read_chat(
    "/Users/sandeep/Google Drive/Sandeep/College/4th Semester/Computational Linguistics/Computation-Linguistics-Final-Project/Data/Train/*.cha"
)

# Class labels are encoded in the first two characters of each filename.
types = {'SE': 3, 'LP': 1, 'SD': 2}

# Build the training set: POS-tag features per file, label from the
# filename prefix.
X_train = []
y_train = []
for file in train_files.filenames():
    item = posTagsFromFile(file)
    y_train.append(types[os.path.basename(file)[0:2]])
    X_train.append(item)

# Collect the ordered set of distinct POS tags seen across all files.
labels = []
for x in X_train:
    for x1 in x:
        if x1 not in labels:
            labels.append(x1)
Beispiel #14
0
# libraries for linguistic research
import pylangacq as pla
import pycantonese as pc


# Part 1: Computing MLUm
# ===============

# Reading CHAT transcripts of Eve in Brown
# -----------------------------------------------
# 
# We assume that Eve's data from the Brown portion of CHILDES ([source](http://childes.psy.cmu.edu/data/Eng-NA-MOR/Brown.zip); accessed in January 2016) are available at the current directory.

# In[2]:

eve = pla.read_chat('Brown/Eve/*.cha')  # reading in all 20 files from eve01.cha to eve20.cha


# Getting information of interest
# ------------------------------------
# 
# 1. filenames
# 2. age (in months) of Eve's in each CHAT file
# 3. mean length of utterance in morphemes (MLUm) of each file 

# In[3]:

eve_filenames = eve.filenames(sorted_by_age=True)  # absolute-path filenames sorted by age
eve_ages = eve.age(months=True)  # dict(filename: age in months)
eve_MLUs = eve.MLUm()  # dict(filename: MLUm)
# Pair each file's age with its MLUm, preserving age order, for analysis.
eve_age_MLU_pairs = [(eve_ages[fn], eve_MLUs[fn]) for fn in eve_filenames]  # list of (age, MLUm)