def test_instantiate_reader(classmethod, arg):
    """``read_chat`` and the ``from_*`` classmethods build equal readers."""
    reader_a = classmethod(arg, encoding="utf-8")
    reader_b = read_chat(REMOTE_EVE_FILE_PATH_1, encoding="utf-8")

    # Between them, "headers" and "index_to_tiers" cover the whole data file,
    # so comparing both is a full equality check on the parsed content.
    header_a = list(reader_a.headers().values())[0]
    header_b = list(reader_b.headers().values())[0]
    tiers_a = list(reader_a.index_to_tiers().values())[0]
    tiers_b = list(reader_b.index_to_tiers().values())[0]

    assert header_a == header_b
    assert len(tiers_a) == len(tiers_b)

    paired = zip(sorted(tiers_a.items()), sorted(tiers_b.items()))
    for (idx_a, tier_a), (idx_b, tier_b) in paired:
        try:
            assert tier_a == tier_b
        except AssertionError:
            # Surface which utterance indices diverged before re-raising.
            print("i_c:", idx_a, "i_r:", idx_b)
            raise
# Exploratory script: print the part-of-speech tags found in one
# CALLHOME English transcript.
import pylangacq as pla
import os

# NOTE(review): hardcoded absolute path -- this only runs on the author's
# machine; confirm whether a relative/configurable path is wanted.
os.chdir('/home/lucas/PycharmProjects/Papers_with_code/data/CALL_HOME/eng')
print(os.getcwd())  # sanity check that the chdir took effect
CHAT = pla.read_chat('0638.cha')
print(CHAT.part_of_speech_tags())
def eve_all_files():
    """Build a reader over all of the remote Eve files (UTF-8)."""
    reader = read_chat(REMOTE_EVE_FILE_PATH_ALL_FILES, encoding="utf-8")
    return reader
def test_update(eve_one_file):
    """``Reader.update`` merges another reader's files into this one."""
    # Pass the encoding explicitly for consistency with every other
    # read_chat call in this file (utf-8 is also the library default).
    new_eve = read_chat(REMOTE_EVE_FILE_PATH_2, encoding="utf-8")
    eve_one_file.update(new_eve)
    # The fixture held one file; after the update it should hold two.
    assert len(eve_one_file) == 2
def eve_one_file():
    """Build a reader over the single local Eve file (UTF-8)."""
    reader = read_chat(LOCAL_EVE_PATH, encoding="utf-8")
    return reader
# NOTE(review): this snippet uses Python 2 print statements and a u'' literal,
# unlike the Python 3 snippets elsewhere -- confirm the intended interpreter
# before modernizing; converting `print a, b` naively would change output.
import pylangacq
import glob

files = glob.glob('cha/*')
# These two transcripts fail to parse, so they are excluded up front.
broken_files = ('cha/roberts2.cha', 'cha/fusser11.cha')
good_files = tuple(f for f in files if f not in broken_files)
# HACK: loading takes a long time, so use fewer files for testing
# good_files = good_files[:2]
print "Loading corpus"
corpus = pylangacq.read_chat(*good_files)
trigrams = corpus.word_ngrams(3)
print "Most popular trigrams:"
# Sort the (trigram, count) pairs by count, descending.
trigram_items_by_freq = sorted(trigrams.items(), key=lambda item: item[1], reverse=True)
for trigram, freq in trigram_items_by_freq[:30]:
    print trigram, freq
print "Some sentences:"
for sentence in corpus.sents()[:10]:
    print u' '.join(sentence)
def test_read_chat_wrong_filename_type():
    """A non-string filename argument must raise ``ValueError``."""
    bad_filename = 42  # an int is not a valid path argument
    with pytest.raises(ValueError):
        read_chat(bad_filename)
# NOTE(review): the three lines below start with a bare `return`, so the
# enclosing `def` header (an order-preserving de-duplication helper -- the
# classic "seen set" recipe taking a `seq` argument) appears to have been
# lost in extraction. Recover the original header before running this.
seen = set()
seen_add = seen.add  # bound-method hoist: avoids an attribute lookup per item
return [x for x in seq if not (x in seen or seen_add(x))]

PATH = os.getcwd()
# Characters to strip from every word, loaded from a JSON list.
# NOTE(review): backslash-built Windows paths -- non-portable; confirm target OS.
with open(PATH + '\\data\\not_char.json', 'r') as fp:
    not_char = json.load(fp)

sentences = []
for file in os.listdir(PATH + '\\data\\CHAT_Files'):
    print("processing file : " + file)
    if file.endswith('.cha'):
        chat_data = pyl.read_chat(PATH + '\\data\\CHAT_Files\\' + file)
        sen_data = chat_data.sents()
        for sen in sen_data:
            temp_sent = ""
            for word in sen:
                w = word.lower()
                # Remove every unwanted character from the word.
                for nc in not_char:
                    w = w.replace(nc, '')
                if w != '':
                    # Skip pure filler tokens like "x", "xx", "xxx".
                    if not (len(set(w)) == 1 and 'x' in set(w)):
                        temp_sent += w + ' '
            tsen = temp_sent[:-1]  # drop the trailing space
            if tsen != '':
                sentences.append(tsen)
OUTPUT_PATHS = 'intermediate/childes_transcripts_used.csv' # From Clark (1973). MIN_AGE = 13 MAX_AGE = 30 with open('data/all_corpora_paths.txt', 'r') as f: filenames = [line.strip() for line in f] filepaths = [os.path.join(CHILDES, fname) for fname in filenames] # Filter by age and no parsing errors. good_filepaths = [] for i, fpath in enumerate(filepaths): try: print("Pre-reading %d/%d" % (i + 1, len(filepaths))) reader = pla.read_chat(fpath) age = reader.age(months=True) age = list(age.values())[0] if MIN_AGE <= age <= MAX_AGE: good_filepaths.append(fpath) except Exception as e: print("Warning: failed to read %s" % fpath) print(e) print() # Count nouns. noun_counts = {}
%%R -i age_mlu_data age_mlu_data %>% ggplot(aes(Age, MLU)) + geom_point(size=2) + geom_smooth(method="lm") + labs(x="Child Age(Months)",y="Mean Length of Utterances (MLU)") ## CHA file - Fantastic package for CHA files: [PyLangAcq](http://pylangacq.org/) import pylangacq as pla pylangacq.__version__ # show version number nccu = pla.read_chat('../../../Corpus/NCCUTaiwanMandarin/transcript/*.cha') nccu.number_of_files() print('Corpus Size:', len(nccu.words())) all_headers= nccu.headers() #all_headers[list(all_headers.keys())[0]] list(all_headers.items())[0] nccu.word_frequency().most_common(5) nccu.word_ngrams(n=3).most_common(10) for line in [' '.join(sent) for sent in nccu.sents()[:10]]: print(line)
# NOTE(review): notebook-export residue -- `library(dplyr)` and the `%%R`
# cell are R run through IPython's rmagic, not Python; left verbatim.
library(dplyr)
%%R -i age_mlu_data
age_mlu_data %>% ggplot(aes(Age, MLU)) + geom_point(size=2) + geom_smooth(method="lm") + labs(x="Child Age(Months)",y="Mean Length of Utterances (MLU)")
## CHA file
- Fantastic package for CHA files: [PyLangAcq](http://pylangacq.org/)
import pylangacq as pla
pla.__version__  # show version number
# Read every NCCU Taiwan Mandarin transcript under the demo-data root.
nccu = pla.read_chat(DEMO_DATA_ROOT+'/CHILDES_NCCU/transcript/*.cha')
nccu.number_of_files()
print('Corpus Size:', len(nccu.words()))
all_headers= nccu.headers()
#all_headers[list(all_headers.keys())[0]]
list(all_headers.items())[0]
nccu.word_frequency().most_common(5)
nccu.word_ngrams(n=3).most_common(10)
# Print the first ten sentences, one per line.
for line in [' '.join(sent) for sent in nccu.sents()[:10]]:
    print(line)
def getFile(filepath):
    """Read the CHAT transcript at ``filepath`` and return the reader."""
    # Return directly; the original's intermediate variable shadowed the
    # (Python 2) builtin name `file` and added nothing.
    return pla.read_chat(filepath)
# Build the POS-tag training data for the classifier: one feature list per
# CHAT file, with the class label taken from the filename prefix.
import os

import numpy as np
import pylangacq as pla
from pprint import pprint  # FIX: was imported twice in the original
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder

from importChatFile import getFile
from posTagging import posTagsFromObject, posTagsFromFile, posTrigrams

train_files = pla.read_chat(
    "/Users/sandeep/Google Drive/Sandeep/College/4th Semester/Computational Linguistics/Computation-Linguistics-Final-Project/Data/Train/*.cha"
)

# Class codes keyed by the two-letter filename prefix.
types = {'SE': 3, 'LP': 1, 'SD': 2}

X_train = []
y_train = []
for path in train_files.filenames():  # renamed from `file` (shadowed builtin)
    item = posTagsFromFile(path)
    # The first two characters of the basename encode the class label.
    y_train.append(types[os.path.basename(path)[0:2]])
    X_train.append(item)

# Unique POS tags in first-seen order (dict preserves insertion order);
# replaces the original O(n^2) `not in list` membership scan.
labels = list(dict.fromkeys(tag for tags in X_train for tag in tags))
# libraries for linguistic research import pylangacq as pla import pycantonese as pc # Part 1: Computing MLUm # =============== # Reading CHAT transcripts of Eve in Brown # ----------------------------------------------- # # We assume that Eve's data from the Brown portion of CHILDES ([source](http://childes.psy.cmu.edu/data/Eng-NA-MOR/Brown.zip); accessed in January 2016) are available at the current directory. # In[2]: eve = pla.read_chat('Brown/Eve/*.cha') # reading in all 20 files from eve01.cha to eve20.cha # Getting information of interest # ------------------------------------ # # 1. filenames # 2. age (in months) of Eve's in each CHAT file # 3. mean length of utterance in morphemes (MLUm) of each file # In[3]: eve_filenames = eve.filenames(sorted_by_age=True) # absolute-path filenames sorted by age eve_ages = eve.age(months=True) # dict(filename: age in months) eve_MLUs = eve.MLUm() # dict(filename: MLUm) eve_age_MLU_pairs = [(eve_ages[fn], eve_MLUs[fn]) for fn in eve_filenames] # list of (age, MLUm)