def update(self, toupdate=True):
    """
    Updates the ISO 639-3 tables, if internet connection is available.

    Four tab-separated tables are fetched through ``sync_and_read`` and
    merged, in this order:

    1. main code table  -> one dict per code in ``self.ISO6393``, filled
       with the column labels listed in ``columns`` below;
    2. name index       -> sets "name", "invert" and "ismacro";
    3. macrolanguages   -> sets "macro" and "status", and rebuilds
       ``self.MACROLANGS`` (macrolanguage code -> list of member codes);
    4. retirements      -> sets "retired" and "changeto", and resets
       ``self.RETIRED``.

    The first line of every table is treated as a header and dropped.
    Blank lines (e.g. the empty string left behind by a trailing newline)
    are skipped; previously they broke the tuple unpacking or created an
    entry for the empty code ''.

    toupdate -- passed through unchanged to ``sync_and_read``.
    """
    def data_rows(tsv):
        # Yields the stripped, non-blank lines that follow the header line.
        for row in tsv.partition('\n')[2].split('\n'):
            row = row.strip()
            if row:
                yield row

    # Saving contents from iso-639-3.tab into ISO6393 object.
    # The labels are assigned by position to the fields after the code.
    # NOTE(review): verify this order against the header of the downloaded
    # table.
    columns = "iso6392t iso6392b iso6391 scope type name comment".split()
    iso6393_tsv = sync_and_read(ISO6393_URL, ISO6393_TXT, toupdate=toupdate)
    for row in data_rows(iso6393_tsv):
        code, _, rest = row.partition('\t')
        entry = self.ISO6393.setdefault(code, {})
        # Rows are stripped, so trailing empty fields are gone and zip()
        # simply leaves the corresponding labels unset.
        for value, column in zip(rest.split('\t'), columns):
            # NOTE(review): the scopetype lookup is applied to every
            # column, not only "scope"/"type" -- confirm that is intended.
            if value in scopetype:
                value = scopetype[value]
            entry[column] = value

    # Saving contents from iso-639-3_Name_Index.tab into ISO6393 object.
    iso6393name_tsv = sync_and_read(ISO6393_NAME_URL, ISO6393_NAME_TXT,
                                    toupdate=toupdate)
    for row in data_rows(iso6393name_tsv):
        code, name, invert = row.split('\t')
        # Plain indexing on purpose: a code missing from the main table
        # raises KeyError here, exactly as before.
        entry = self.ISO6393[code]
        entry["name"] = name
        entry["invert"] = invert
        entry["ismacro"] = "(macrolanguage)" in name

    # Saving contents from iso-639-3-macrolanguages.tab into *MACROLANGS*.
    macrolang_tsv = sync_and_read(MACROLANGS_URL, MACROLANGS_TXT,
                                  toupdate=toupdate)
    self.MACROLANGS = defaultdict(list)
    for row in data_rows(macrolang_tsv):
        macro, code, status = row.split('\t')
        entry = self.ISO6393.setdefault(code, {})
        entry["macro"] = macro
        # Any status other than "A" is recorded as "Retired".
        entry["status"] = "Active" if status == "A" else "Retired"
        self.MACROLANGS[macro].append(code)

    # Saving contents from iso-639-3_Retirements.tab into *RETIRED*.
    retired_tsv = sync_and_read(RETIRED_URL, RETIRED_TXT, toupdate=toupdate)
    # RETIRED is only reset here; this method never adds anything to it.
    self.RETIRED = defaultdict(list)
    for row in data_rows(retired_tsv):
        # Columns: Id Ref_Name Ret_Reason Change_To Ret_Remedy Effective
        code, _, reason, changeto, remedy, _ = row.split('\t')
        if reason == "S" and "Split into" in remedy:
            # For a split, "changeto" becomes every [bracketed] token found
            # in the remedy text, joined with underscores.
            changeto = "_".join(re.findall(r"\[(.*?)\]", remedy))
        entry = self.ISO6393.setdefault(code, {})
        entry["retired"] = True
        entry["changeto"] = changeto
def __init__(self, toupdate=True):
    """
    Loads the WALS languoid table into this mapping and builds the
    genus / family / related-language indexes.

    The table is fetched through ``sync_and_read``. Its first line is the
    header; every following non-blank line becomes ``self[lang]``, a dict
    mapping the header names (all but the first column) to that row's
    values, where ``lang`` is the first whitespace-separated token of the
    line.

    Afterwards three indexes are built:

    - ``self.GENUS``:          genus  -> list of language keys
    - ``self.LANGUAGEFAMILY``: family -> list of language keys
    - ``self.RELATED_LANGS``:  language key -> its genus list followed by
      its family list (not de-duplicated; the language itself is in both).

    Blank lines (e.g. the empty string left behind by a trailing newline)
    are skipped; previously ``line.split()[0]`` raised IndexError on them.

    toupdate -- passed through unchanged to ``sync_and_read``.
    """
    WALS_URL = ("http://wals.info/languoid.tab?sEcho=1&iSortingCols=1"
                "&iSortCol_0=0&sSortDir_0=asc")
    WALS_TXT = currentdirectory() + "/data/wals/wals.txt"
    wals_tsv = sync_and_read(WALS_URL, WALS_TXT, toupdate=toupdate)

    headerline, _, data = wals_tsv.partition('\n')
    # The first column holds the language key itself, so it is not stored
    # as an attribute. Split the header once instead of once per row.
    keys = headerline.split('\t')[1:]
    for line in data.split('\n'):
        if not line.strip():
            continue
        lang = line.split()[0]
        pairs = list(zip(keys, line.split('\t')[1:]))
        # A row without any attribute columns creates no entry, as before.
        if pairs:
            self.setdefault(lang, {}).update(pairs)

    self.GENUS = defaultdict(list)
    self.LANGUAGEFAMILY = defaultdict(list)
    for lang in self:
        info = self[lang]
        self.GENUS[info['genus']].append(lang)
        self.LANGUAGEFAMILY[info['family']].append(lang)

    # Built in a second pass: it needs the complete genus/family lists.
    self.RELATED_LANGS = defaultdict(list)
    for lang in self:
        info = self[lang]
        self.RELATED_LANGS[lang] = (self.GENUS[info['genus']] +
                                    self.LANGUAGEFAMILY[info['family']])
# -*- coding: utf-8 -*- import cPickle as pickle import codecs from utils import sync_and_read ETHNOFAMILY_URL = "http://www.ethnologue.com/browse/families" ETHNOFAMILY_HTML = "data/ethnologue/ethnologue-family.html" sync_and_read(ETHNOFAMILY_URL, ETHNOFAMILY_HTML) x = pickle.load(codecs.open('data/ethnologue/languages_with_info.pk','rb')) for i in x: for j in x[i]: print i, j """ fin = codecs.open(ETHNO_DIR+'ethnologue-family.html','r','utf8') lang_fams = defaultdict(list) """ """ for line in fin.readlines(): line = line.decode('utf-8') # Detects the language family and its link. if LANG_FAMILY_TAG in line: