def source_sents(intarfile=parentddir+'/data/odin/odin-all.tar'):
    """
    Yield clean sentences from the clean ODIN tarball.

    :param intarfile: path to the ODIN tarball to read.
    :yields: (language, sentence) tuples, where *language* is parsed from
        the member filename and *sentence* is the first tab-separated
        field of each line.
    """
    for infile in sorted(read_tarfile(intarfile)):
        # Filename pattern assumed: .../<prefix>-<language>[_...].<ext>
        # -- TODO confirm against the tarball contents.
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            # Iterate the handle lazily instead of readlines(), which
            # loads the whole file into memory at once.
            for line in fin:
                sentence = line.strip().split('\t')[0]
                yield language, sentence
def source_sents(intarfile=parentddir + "/data/odin/odin-cleanest.tar"):
    """
    Yield clean sentences from the clean ODIN tarball.

    Produces (language, sentence) tuples: the language code comes from
    each member's filename and the sentence is the first tab-separated
    column of each line.
    """
    for member in sorted(read_tarfile(intarfile)):
        basename = member.split("/")[-1]
        lang = basename.split("-")[1].split(".")[0].split("_")[0]
        with codecs.open(member, "r", "utf8") as handle:
            for row in handle.readlines():
                first_column = row.strip().split("\t")[0]
                yield lang, first_column
def source_sents(intarfile=parentddir + '/data/odin/odin-all.tar'):
    """
    Yield clean sentences from the clean ODIN tarball.

    Each yielded item is a (language, sentence) pair; the language code
    is extracted from the member filename and the sentence is the first
    tab-separated field of the line.
    """
    for member in sorted(read_tarfile(intarfile)):
        tail = member.split('/')[-1]
        lang = tail.split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(member, 'r', 'utf8') as handle:
            for row in handle.readlines():
                yield lang, row.strip().split('\t')[0]
def source_sents(intarfile=currentdirectory()+'/data/odin/odin-all.tar'):
    """
    Yield sentences from ODIN tarball.

    :param intarfile: path to the ODIN tarball to read.
    :yields: (language, sentence) tuples; non-standard language codes
        found in the data are normalized (e.g. "JPN" -> "jpn").
    """
    # Normalization table for irregular codes in the data; built once,
    # outside the loop, instead of re-creating the dict per file.
    conversions = {"JPN": "jpn", "MAC": "mkd", "qgk": "grc"}
    for infile in sorted(read_tarfile(intarfile)):
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        # dict.get(k, k) replaces the manual "x[k] if k in x else k".
        language = conversions.get(language, language)
        with codecs.open(infile, 'r', 'utf8') as fin:
            for line in fin:
                sentence = line.strip().split('\t')[0]
                yield language, sentence
def phrases(intarfile=parentddir + "/data/omniglot/omniglotphrases.tar",
            onlysource=False):
    """
    Yield source and translation sentences from the clean Omniglot tarball.

    :param intarfile: path to the Omniglot phrases tarball.
    :param onlysource: when True, yield (language, sentence) pairs;
        otherwise yield (language, sentence, translation) triples.
    """
    for member in read_tarfile(intarfile):
        lang = member.split("/")[-1].split("-")[1].split(".")[0].split("_")[0]
        with codecs.open(member, "r", "utf8") as handle:
            for row in handle.readlines():
                source, target = row.strip().split("\t")
                yield (lang, source) if onlysource else (lang, source, target)
def documents(intarfile=parentddir+'/data/udhr/udhr-unicode.tar',
              bysentence=False):
    """
    Yields UDHR by documents.

    :param intarfile: path to the UDHR tarball.
    :param bysentence: when True, yield (language, line) per line as
        read (including trailing newlines); otherwise yield one
        (language, full document text) pair per file.
    """
    for member in read_tarfile(intarfile):
        lang = member.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(member, 'r', 'utf8') as handle:
            if not bysentence:
                yield lang, handle.read()
            else:
                for line in handle.readlines():
                    yield lang, line
def phrases(intarfile=currentdirectory()+'/data/omniglot/omniglotphrases.tar',
            onlysource=False):
    """
    Yield source and translation sentences from the clean Omniglot tarball.

    :param intarfile: path to the Omniglot phrases tarball.
    :param onlysource: when True, yield (language, sentence) pairs and
        skip empty source sentences; otherwise yield
        (language, sentence, translation) triples.
    """
    for infile in read_tarfile(intarfile):
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            for line in fin.readlines():
                # Each line is "<sentence>\t<translation>"; a line with a
                # different number of tabs raises ValueError here, as before.
                sentence, translation = line.strip().split('\t')
                if onlysource:
                    # BUG FIX: the original "if onlysource and sentence"
                    # fell through to the 3-tuple branch when the source
                    # sentence was empty, so callers asking for pairs
                    # could receive triples. Skip empties instead.
                    if sentence:
                        yield language, sentence.strip()
                else:
                    yield language, sentence, translation
def phrases(intarfile=parentddir+'/data/omniglot/omniglotphrases.tar',
            onlysource=False):
    """
    Yield source and translation sentences from the clean Omniglot tarball.

    :param intarfile: path to the Omniglot phrases tarball.
    :param onlysource: when True, yield (language, sentence) pairs and
        skip empty source sentences; otherwise yield
        (language, sentence, translation) triples.
    """
    for infile in read_tarfile(intarfile):
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            for line in fin.readlines():
                # Each line is "<sentence>\t<translation>"; a line with a
                # different number of tabs raises ValueError here, as before.
                sentence, translation = line.strip().split('\t')
                if onlysource:
                    # BUG FIX: the original "if onlysource and sentence"
                    # fell through to the 3-tuple branch when the source
                    # sentence was empty, so callers asking for pairs
                    # could receive triples. Skip empties instead.
                    if sentence:
                        yield language, sentence.strip()
                else:
                    yield language, sentence, translation
def documents(intarfile=parentddir+'/data/udhr/udhr-unicode.tar',
              bysentence=False):
    """
    Yields UDHR by documents.

    :param intarfile: path to the UDHR tarball.
    :param bysentence: when True, yield (language, stripped line) per
        non-empty line; otherwise yield one (language, full document
        text) pair per file.
    """
    for member in read_tarfile(intarfile):
        lang = member.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
        with codecs.open(member, 'r', 'utf8') as handle:
            if not bysentence:
                yield lang, handle.read()
                continue
            for raw in handle.readlines():
                if raw:
                    yield lang, raw.strip()
def source_sents(cleanedwikidir=parentddir+"/data/wikipedia/clean/"):
    """
    Yield (language, stripped line) pairs from cleaned Wikipedia tarballs.

    USAGE:
    >>> cleanwiki = '/media/alvas/E418A6B618A686E0/xling/cleanedwiki/'
    >>> for i in source_sents(cleanwiki):
    >>>     print i

    NOTE: *cleanedwikidir* must be a main directory that contains one
    directory per language, and every language directory should contain
    at least one tarball. Regardless of how many files each tarball
    contains, every line of every extracted file is yielded.
    """
    # Local project import kept from the original to avoid changing
    # module-level import behavior.
    from utils import read_tarfile
    for lang in os.listdir(cleanedwikidir):
        # os.path.join replaces manual "/"-concatenation, so the default
        # trailing slash in cleanedwikidir is no longer load-bearing.
        langdir = os.path.join(cleanedwikidir, lang)
        for intarfile in os.listdir(langdir):
            for infile in read_tarfile(os.path.join(langdir, intarfile)):
                with codecs.open(infile, 'r', 'utf8') as fin:
                    for line in fin:
                        yield lang, line.strip()