def __init__(self, toupdate=True):
    WALS_URL = "http://wals.info/languoid.tab?sEcho=1&iSortingCols=1" + \
               "&iSortCol_0=0&sSortDir_0=asc"
    WALS_TXT = currentdirectory() + "/data/wals/wals.txt"
    # Download (or reuse) the WALS languoid table and parse it as TSV.
    wals_tsv = sync_and_read(WALS_URL, WALS_TXT, toupdate=toupdate)
    headerline, _, data = wals_tsv.partition('\n')
    for line in data.split('\n'):
        if not line.strip():  # Skip blank lines.
            continue
        lang = line.split('\t')[0]
        for key, value in zip(headerline.split('\t')[1:], line.split('\t')[1:]):
            self.setdefault(lang, {})[key] = value
    # Index languages by genus.
    self.GENUS = defaultdict(list)
    for lang in self:
        self.GENUS[self[lang]['genus']].append(lang)
    # Index languages by language family.
    self.LANGUAGEFAMILY = defaultdict(list)
    for lang in self:
        self.LANGUAGEFAMILY[self[lang]['family']].append(lang)
    # For each language, collect the languages that share its genus or family.
    self.RELATED_LANGS = defaultdict(list)
    for lang in self:
        self.RELATED_LANGS[lang] = self.GENUS[self[lang]['genus']] + \
                                   self.LANGUAGEFAMILY[self[lang]['family']]

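# A minimal usage sketch (assumptions: the __init__ above belongs to a dict-like
# WALS wrapper class whose definition is not shown here; 'WALS' below is a
# hypothetical class name and 'eng' stands for any code in the languoid table):
#
#   >>> wals = WALS(toupdate=False)       # reuse the cached local copy
#   >>> wals['eng']['genus']              # per-language fields from languoid.tab
#   >>> wals.RELATED_LANGS['eng']         # languages sharing its genus or family
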
def languages(): """Returns the number of languages available from original data source.""" languages = [] conversions = {"JPN":"jpn", "MAC":"mkd", "qgk":"grc"} for i in tarfile.open(currentdirectory()+'/data/odin/odin-full.tar'): lang = str(i.name).partition('.')[0] if len(lang) != 3: continue lang = conversions[lang] if lang in conversions else lang languages.append(lang) return languages
def source_sents(intarfile=currentdirectory() + '/data/odin/odin-all.tar'):
    """ Yields (language, sentence) pairs from the ODIN tarball. """
    for infile in sorted(read_tarfile(intarfile)):
        # Extract the language code from the filename.
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        # A few files use non-standard codes; map them to ISO 639-3.
        conversions = {"JPN": "jpn", "MAC": "mkd", "qgk": "grc"}
        language = conversions[language] if language in conversions else language
        with codecs.open(infile, 'r', 'utf8') as fin:
            for line in fin.readlines():
                sentence = line.strip().split('\t')[0]
                yield language, sentence

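# Usage sketch for the ODIN reader above (assumes the local odin-all.tar is in
# place; the output simply depends on its contents):
#
#   >>> for lang, sent in source_sents():
#   ...     print lang, sent
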
def phrases(intarfile=currentdirectory() + '/data/omniglot/omniglotphrases.tar',
            onlysource=False):
    """ Yields source and translation sentences from the clean Omniglot tarball. """
    for infile in read_tarfile(intarfile):
        # Extract the language code from the filename.
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            for line in fin.readlines():
                sentence, translation = line.strip().split('\t')
                if onlysource:
                    # Yield only non-empty source sentences.
                    if sentence:
                        yield language, sentence.strip()
                else:
                    yield language, sentence, translation

def documents(intarfile=currentdirectory() + '/data/udhr/udhr-unicode.tar',
              bysentence=False):
    """ Yields the UDHR by documents, or sentence by sentence if bysentence=True. """
    for infile in read_tarfile(intarfile):
        # Extract the language code from the filename.
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            if bysentence:
                for sentence in fin.readlines():
                    if sentence.strip():
                        yield language, sentence.strip()
            else:
                yield language, fin.read()

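# Usage sketch: the same generator serves whole documents or single sentences,
# depending on the bysentence flag (assumes the UDHR tarball is present locally).
#
#   >>> for lang, doc in documents():                  # one full UDHR text per language
#   ...     print lang, len(doc)
#   >>> for lang, sent in documents(bysentence=True):  # one sentence at a time
#   ...     print lang, sent
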
def load_odin_pickle(ODIN_PICKLE=currentdirectory() + '/data/odin/odin-docs.pk'):
    """
    Loads odin-docs.pk and yields one language's IGTs at a time.

    >>> for lang, igts in load_odin_pickle():
    ...     for igt in igts:
    ...         print lang, igt
    """
    # If odin-docs.pk is not available, create it.
    if not os.path.exists(ODIN_PICKLE):
        odindocs = get_odin_igts()
        # Dump the ODIN IGT examples into '../data/odin/odin-docs.pk'.
        # Pickles are binary, so use plain open() rather than codecs.open().
        with open(ODIN_PICKLE, 'wb') as fout:
            pickle.dump(odindocs, fout)
    # Load the pickled file.
    with open(ODIN_PICKLE, 'rb') as fin:
        docs = pickle.load(fin)
    for lang in docs:
        # The data might be too large for RAM, so yield instead of returning.
        yield (lang, docs[lang])

def get_odin_igts(ODINFILE=currentdirectory() + '/data/odin/odin-full.tar'):
    """
    Extracts the examples from the ODIN IGTs and returns a defaultdict(list),
    where the keys are the language ISO codes and the values are the examples.

    >>> igts = get_odin_igts()
    >>> for lang in igts:
    ...     for igt in igts[lang]:
    ...         print lang, igt
    """
    tar = tarfile.open(ODINFILE)
    docs = defaultdict(list)
    for infile in tar:
        # There is a rogue file in the tarball that is not xml.
        if '.xml' not in infile.name:
            continue
        lang = infile.name[:-4].lower()
        odinfile = tar.extractfile(infile).read()
        # Find the <igt>...</igt> and <citation>...</citation> in the xml.
        igts = bs(odinfile).findAll('igt')
        citations = bs(odinfile).findAll('citation')
        for igt, cite in zip(igts, citations):
            # Find the <example>...</example> in the igt.
            examples = bs(unicode(igt)).findAll('example')
            cite = remove_tags(unicode(cite)).strip(' </p>')
            for eg in examples:
                try:
                    # Only use triplets of lines and assume that
                    # line1: src, line2: eng, line3: gloss.
                    src, eng, gloss = bs(unicode(eg)).findAll('line')
                    src, eng, gloss, cite = map(unicode, [src, eng, gloss, cite])
                    docs[lang].append((src, eng, gloss, cite))
                except ValueError:
                    # Skip examples that do not unpack into exactly three lines.
                    continue
    return docs

def languages():
    """ Returns a list of the languages available from the original data source. """
    return [str(i.name).partition('-')[2].partition('.')[0]
            for i in tarfile.open(currentdirectory() +
                                  '/data/omniglot/omniglotphrases.tar')
            if i.name != ""]

def source_sents(intarfile=currentdirectory() +
                 '/data/omniglot/omniglotphrases.tar', onlysource=True):
    """ Yields clean source sentences from the clean Omniglot tarball. """
    return phrases(intarfile, onlysource)

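# Usage sketch: source_sents() yields (language, sentence) pairs, while
# phrases(onlysource=False) yields (language, sentence, translation) triples
# (assumes the Omniglot tarball is present locally).
#
#   >>> for lang, sent in source_sents():
#   ...     print lang, sent
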
# -*- coding: utf-8 -*-
import codecs, re
from collections import defaultdict

from utils import sync_and_read, currentdirectory

# Link to the ISO 639-3 code table.
ISO6393_URL = "http://www-01.sil.org/iso639-3/iso-639-3.tab"
ISO6393_TXT = currentdirectory() + "/data/sil/iso6393.txt"  # A local copy.

# Link to the ISO 639-3 names file.
ISO6393_NAME_URL = "http://www-01.sil.org/iso639-3/iso-639-3_Name_Index.tab"
ISO6393_NAME_TXT = currentdirectory() + "/data/sil/iso6393-name.txt"  # A local copy.

# Scope of language, see http://www-01.sil.org/iso639-3/scope.asp
# Type of language, see http://www-01.sil.org/iso639-3/types.asp
# Column codes follow http://www-01.sil.org/iso639-3/iso-639-3.tab
scopetype = {"I": "Individual", "M": "Macrolanguage",
             "L": "Living", "E": "Extinct", "A": "Ancient",
             "H": "Historic", "C": "Constructed"}

# Link to the ISO 639-3 macrolanguages file.
MACROLANGS_URL = "http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab"
MACROLANGS_TXT = currentdirectory() + "/data/sil/marcolangs.txt"  # A local copy.

# Link to the ISO 639-3 retirements file.
RETIRED_URL = "http://www-01.sil.org/iso639-3/iso-639-3_Retirements.tab"
RETIRED_TXT = currentdirectory() + "/data/sil/retired.txt"  # A local copy.


class MiniSIL:
    def __init__(self, toupdate=True):
        self.ISO6393, self.MARCOLANGS = {}, defaultdict(list)
        self.update(toupdate)

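# A minimal usage sketch for MiniSIL (assumption: update(), defined further down
# and not shown here, fills self.ISO6393 with one record per ISO 639-3 code and
# self.MARCOLANGS with macrolanguage members):
#
#   >>> sil = MiniSIL(toupdate=False)   # read from the local copies under data/sil/
#   >>> print sil.ISO6393['eng']
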
def languages():
    """ Returns a list of the available languages from the original data source. """
    langs = [i.partition('-')[2].partition('.')[0] for i in
             enumerate_udhr(intarfile=currentdirectory() +
                            '/data/udhr/udhr-unicode.tar')]
    return langs

def source_sents(intarfile=currentdirectory() + '/data/udhr/udhr-unicode.tar',
                 bysentence=True):
    """ Yields sentences from the UDHR tarball. """
    return sents(intarfile, bysentence)