Example 1
def source_sents(intarfile=parentddir+'/data/odin/odin-all.tar'):
  """ Yield clean sentences from the clean ODIN tarball. """
  for infile in sorted(read_tarfile(intarfile)):
    language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
    with codecs.open(infile,'r','utf8') as fin:
      for line in fin:
        sentence = line.strip().split('\t')[0]
        yield language, sentence
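All of these readers lean on a read_tarfile() helper that is not shown here (Example 10 imports it from utils). A minimal sketch of such a helper, assuming it extracts the tarball into a temporary directory and yields the member file paths; the real utils.read_tarfile may well differ:

import os
import tarfile
import tempfile

def read_tarfile(intarfile):
  """ Hypothetical stand-in for utils.read_tarfile: extract the tarball
  into a temp directory and yield the path to each regular member file. """
  tmpdir = tempfile.mkdtemp()
  with tarfile.open(intarfile) as tar:
    tar.extractall(tmpdir)
    for member in tar.getmembers():
      if member.isfile():
        yield os.path.join(tmpdir, member.name)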
Example 2
def source_sents(intarfile=parentddir + "/data/odin/odin-cleanest.tar"):
    """ Yield clean sentences from the clean ODIN tarball. """
    for infile in sorted(read_tarfile(intarfile)):
        language = infile.split("/")[-1].split("-")[1].split(".")[0].split("_")[0]
        with codecs.open(infile, "r", "utf8") as fin:
            for line in fin:
                sentence = line.strip().split("\t")[0]
                yield language, sentence
Example 3
def source_sents(intarfile=parentddir + '/data/odin/odin-all.tar'):
    """ Yield clean sentences from the clean ODIN tarball. """
    for infile in sorted(read_tarfile(intarfile)):
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split(
            '_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            for line in fin:
                sentence = line.strip().split('\t')[0]
                yield language, sentence
Example 4
def source_sents(intarfile=currentdirectory()+'/data/odin/odin-all.tar'):
  """ Yield sentences from the ODIN tarball. """
  conversions = {"JPN": "jpn", "MAC": "mkd", "qgk": "grc"}  # map irregular codes to ISO 639-3
  for infile in sorted(read_tarfile(intarfile)):
    language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
    language = conversions.get(language, language)
    with codecs.open(infile,'r','utf8') as fin:
      for line in fin:
        sentence = line.strip().split('\t')[0]
        yield language, sentence
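The chained splits that recover the language code from a member filename are dense, so here is a worked example on a hypothetical member name (the actual ODIN naming scheme is an assumption):

fname = 'odin/odin-deu_1.txt'  # hypothetical member name
code = fname.split('/')[-1]    # 'odin-deu_1.txt' : keep the basename
code = code.split('-')[1]      # 'deu_1.txt'      : second '-'-separated field
code = code.split('.')[0]      # 'deu_1'          : drop the file extension
code = code.split('_')[0]      # 'deu'            : drop any '_' suffix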
Example 5
def phrases(intarfile=parentddir + "/data/omniglot/omniglotphrases.tar", onlysource=False):
    """ Yield source and tranlsation sentences from the clean Omniglot tarball. """
    for infile in read_tarfile(intarfile):
        language = infile.split("/")[-1].split("-")[1].split(".")[0].split("_")[0]
        with codecs.open(infile, "r", "utf8") as fin:
            for line in fin:
                sentence, translation = line.strip().split("\t")
                if onlysource:
                    yield language, sentence
                else:
                    yield language, sentence, translation
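A short usage sketch for phrases(), assuming the default Omniglot tarball is in place; the default call yields (language, sentence, translation) triples, while onlysource=True yields (language, sentence) pairs:

for language, sentence, translation in phrases():
    print language, sentence, translation

for language, sentence in phrases(onlysource=True):
    print language, sentence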
Example 6
def documents(intarfile=parentddir+'/data/udhr/udhr-unicode.tar',
              bysentence=False):
  """ Yield UDHR texts, document by document (or sentence by sentence). """
  for infile in read_tarfile(intarfile):
    language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
    with codecs.open(infile,'r','utf8') as fin:
      if bysentence:
        for sentence in fin:
          yield language, sentence
      else:
        yield language, fin.read()
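Likewise for documents(), assuming the default UDHR tarball exists; by default each item carries a whole document, with bysentence=True it carries one line:

for language, text in documents():
  print language, len(text)  # one full UDHR document at a time

for language, sentence in documents(bysentence=True):
  print language, sentence.strip()  # one line at a time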
Example 7
def phrases(intarfile=currentdirectory()+'/data/omniglot/omniglotphrases.tar',
            onlysource=False):
  """ Yield source and translation sentences from the clean Omniglot tarball. """
  for infile in read_tarfile(intarfile):
    language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
    with codecs.open(infile,'r','utf8') as fin:
      for line in fin:
        sentence, translation = line.strip().split('\t')
        if onlysource:
          if sentence:  # skip entries with an empty source side
            yield language, sentence.strip()
        else:
          yield language, sentence, translation
Example 8
def phrases(intarfile=parentddir+'/data/omniglot/omniglotphrases.tar',
            onlysource=False):
    """ Yield source and translation sentences from the clean Omniglot tarball. """
    for infile in read_tarfile(intarfile):
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split(
            '_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            for line in fin:
                sentence, translation = line.strip().split('\t')
                if onlysource:
                    if sentence:  # skip entries with an empty source side
                        yield language, sentence.strip()
                else:
                    yield language, sentence, translation
Example 9
def documents(intarfile=parentddir+'/data/udhr/udhr-unicode.tar',
              bysentence=False):
    """ Yield UDHR texts, document by document (or sentence by sentence). """
    for infile in read_tarfile(intarfile):
        language = infile.split('/')[-1].split('-')[1].split('.')[0].split(
            '_')[0]
        with codecs.open(infile, 'r', 'utf8') as fin:
            if bysentence:
                for sentence in fin:
                    sentence = sentence.strip()
                    if sentence:  # skip blank lines
                        yield language, sentence
            else:
                yield language, fin.read()
Example 10
def source_sents(cleanedwikidir=parentddir+"/data/wikipedia/clean/"):
  """
  USAGE:
  >>> cleanwiki = '/media/alvas/E418A6B618A686E0/xling/cleanedwiki/'
  >>> for i in source_sents(cleanwiki):
  >>>   print i

  NOTE:
  cleanedwikidir should be a main directory that contains one subdirectory per
  language, and every language directory should contain at least one tarball.
  Regardless of how many files each tarball contains, the lines are extracted.

  P.S. I know the nested directory walking is ugly, but I can't find a simpler
  way to do this =)
  """
  from utils import read_tarfile
  for lang in os.listdir(cleanedwikidir):
    for intarfile in os.listdir(os.path.join(cleanedwikidir, lang)):
      for infile in read_tarfile(os.path.join(cleanedwikidir, lang, intarfile)):
        with codecs.open(infile, 'r', 'utf8') as fin:
          for line in fin:
            yield lang, line.strip()
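The NOTE in the docstring implies a two-level layout under cleanedwikidir; a hypothetical tree (all names invented) that source_sents() would walk:

clean/
  deu/
    dewiki-01.tar    (at least one tarball per language directory)
    dewiki-02.tar
  fra/
    frwiki-01.tar    (each tarball holds one or more plain-text files)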