Esempio n. 1
0
  def update(self, toupdate=True):
    """
    Updates the ISO 639-3 tables, if internet connection is available.

    Four tab-separated tables are fetched through sync_and_read() and merged
    into self.ISO6393, which maps a language code to a dict of attributes:
     - the code table fills the columns named in the local *columns* list;
     - the name index sets "name", "invert" and "ismacro";
     - the macrolanguage table sets "macro" and "status" and rebuilds
       self.MACROLANGS (macrolanguage code -> list of member codes);
     - the retirements table sets "retired" and "changeto".

    The first line of every table is treated as a header and dropped.
    Blank lines (e.g. the empty string left by a trailing newline) are
    skipped, so they neither break the column unpacking nor create an
    entry under the empty code.

    :param toupdate: handed unchanged to every sync_and_read() call.
    """
    def data_rows(tsv):
      """ Yields the stripped, non-blank lines of *tsv* after its header. """
      for row in tsv.partition('\n')[2].split('\n'):
        row = row.strip()
        if row:
          yield row

    # Saving contents from iso-639-3.tab into ISO6393 object.
    iso6393_tsv = sync_and_read(ISO6393_URL,
                                ISO6393_TXT, toupdate=toupdate)
    # Names given to the columns that follow the leading code column.
    columns = "iso6392t iso6392b iso6391 scope type name comment".split()

    for row in data_rows(iso6393_tsv):
      code, _, rest = row.partition('\t')
      for value, column in zip(rest.split('\t'), columns):
        # Values known to *scopetype* are replaced by what it maps them to.
        if value in scopetype:
          value = scopetype[value]
        self.ISO6393.setdefault(code, {})[column] = value

    # Saving contents from iso-639-3_Name_Index.tab into ISO6393 object.
    iso6393name_tsv = sync_and_read(ISO6393_NAME_URL,
                                    ISO6393_NAME_TXT, toupdate=toupdate)

    for row in data_rows(iso6393name_tsv):
      code, name, invert = row.split('\t')
      # setdefault, as in the other tables, so that a code missing from the
      # code table gets an entry instead of raising KeyError.
      entry = self.ISO6393.setdefault(code, {})
      entry["name"] = name
      entry["invert"] = invert
      entry["ismacro"] = "(macrolanguage)" in name

    # Saving contents from iso-639-3-macrolanguages.tab into *MACROLANGS*.
    macrolang_tsv = sync_and_read(MACROLANGS_URL, MACROLANGS_TXT,
                                  toupdate=toupdate)

    self.MACROLANGS = defaultdict(list)
    for row in data_rows(macrolang_tsv):
      macro, code, status = row.split('\t')
      entry = self.ISO6393.setdefault(code, {})
      entry["macro"] = macro
      # Any status other than "A" is recorded as "Retired".
      entry["status"] = "Active" if status == "A" else "Retired"
      self.MACROLANGS[macro].append(code)

    # Saving contents from iso-639-3_Retirements.tab into ISO6393 object.
    retired_tsv = sync_and_read(RETIRED_URL, RETIRED_TXT, toupdate=toupdate)

    # NOTE(review): RETIRED is reset here but nothing in this method adds
    # to it -- confirm whether it is meant to be filled.
    self.RETIRED = defaultdict(list)
    for row in data_rows(retired_tsv):
      # Columns: Id  Ref_Name  Ret_Reason  Change_To  Ret_Remedy  Effective
      code, _, reason, changeto, remedy, _ = row.split('\t')

      # For a split, the replacement is rebuilt from the bracketed parts
      # of the remedy text, joined with "_".
      if reason == "S" and "Split into" in remedy:
        changeto = "_".join(re.findall(r"\[(.*?)\]", remedy))

      entry = self.ISO6393.setdefault(code, {})
      entry["retired"] = True
      entry["changeto"] = changeto
Esempio n. 2
0
  def __init__(self, toupdate=True):
    """
    Loads the WALS languoid table and builds the genus/family indexes.

    The table is read through sync_and_read(); its first line is the header.
    Every other line is stored as self[lang] = {column: value}, where *lang*
    is the first whitespace-delimited token of the line and the columns are
    the tab-separated header fields after the first one.  Blank lines (e.g.
    the empty string left by a trailing newline) are skipped, as they have
    no first token to use as a key.

    Three indexes are then built:
     - self.GENUS: genus -> list of languages;
     - self.LANGUAGEFAMILY: family -> list of languages;
     - self.RELATED_LANGS: language -> the list for its genus followed by
       the list for its family (the language itself is in both lists).

    :param toupdate: handed unchanged to sync_and_read().
    """
    WALS_URL = ("http://wals.info/languoid.tab?sEcho=1&iSortingCols=1"
                "&iSortCol_0=0&sSortDir_0=asc")
    WALS_TXT = currentdirectory()+"/data/wals/wals.txt"

    wals_tsv = sync_and_read(WALS_URL, WALS_TXT, toupdate=toupdate)
    headerline, _, data = wals_tsv.partition('\n')

    # The header is the same for every row, so split it only once.
    columns = headerline.split('\t')[1:]
    for line in data.split('\n'):
      if not line.strip():
        continue
      lang = line.split()[0]
      for key, value in zip(columns, line.split('\t')[1:]):
        self.setdefault(lang, {})[key] = value

    self.GENUS = defaultdict(list)
    self.LANGUAGEFAMILY = defaultdict(list)
    for lang in self:
      self.GENUS[self[lang]['genus']].append(lang)
      self.LANGUAGEFAMILY[self[lang]['family']].append(lang)

    # Needs its own pass: the genus and family lists must be complete
    # before they are concatenated.
    self.RELATED_LANGS = defaultdict(list)
    for lang in self:
      same_genus = self.GENUS[self[lang]['genus']]
      same_family = self.LANGUAGEFAMILY[self[lang]['family']]
      self.RELATED_LANGS[lang] = same_genus + same_family
Esempio n. 3
0
# -*- coding: utf-8 -*-

import cPickle as pickle
import codecs
from utils import sync_and_read


ETHNOFAMILY_URL = "http://www.ethnologue.com/browse/families"
ETHNOFAMILY_HTML = "data/ethnologue/ethnologue-family.html"

sync_and_read(ETHNOFAMILY_URL, ETHNOFAMILY_HTML)

x = pickle.load(codecs.open('data/ethnologue/languages_with_info.pk','rb'))

for i in x:
  for j in x[i]:
    print i, j




"""
fin = codecs.open(ETHNO_DIR+'ethnologue-family.html','r','utf8')
lang_fams = defaultdict(list)
"""

"""
for line in fin.readlines():
  line = line.decode('utf-8')
  # Detects the language family and its link.
  if LANG_FAMILY_TAG in line: