Example #1
    def upload_file(self,
                    pid=None,
                    filepath=None,
                    yes=False,
                    output_filename=None):
        """Upload file or directory to deposit by given pid."""
        bucket_id = self._get_bucket_id(pid)

        # Check if filepath is file or DIR
        if os.path.isdir(filepath):
            # If it's a DIR alert that it is going to be tarballed
            # and uploaded
            if yes or \
                    click.confirm('You are trying to upload a directory.\n'
                                  'Should we upload '
                                  'a tarball of the directory?'):
                if output_filename is None:
                    output_filename = "{pid}_{bucket_id}_{time}.tar.gz".format(
                        pid=pid,
                        bucket_id=bucket_id,
                        time=datetime.datetime.now().strftime(
                            '%b-%d-%I%M%p-%G'))
                make_tarfile(output_filename, filepath)
                filepath = output_filename
        else:
            if output_filename is None:
                output_filename = os.path.basename(filepath)

        # data = {'filename': output_filename}
        return self._make_request(
            url="files/{bucket_id}/{filename}".format(
                bucket_id=bucket_id, filename=output_filename),
            data=open(filepath, 'rb'),
            method='put',
        )
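All of the examples on this page call a project-local make_tarfile helper rather than using the tarfile standard library directly (Example #13 below imports it from a utils module). For reference, a minimal sketch of such a helper, assuming the two-argument form make_tarfile(output_filename, source_dir) seen above and a gzip-compressed archive, could look like this:

import os
import tarfile

def make_tarfile(output_filename, source_dir):
    # Pack source_dir (recursively) into output_filename as a gzip-compressed tar archive.
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(os.path.normpath(source_dir)))

For the examples that write plain .tar archives, the mode string would be "w" instead of "w:gz".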
Example #2
    def tar_patches_tool(self):
        log.info('Start to tar patches_tool')
        patches_tool_path = utils.get_patches_tool_path()
        full_path_of_patches_tool = os.path.join(patches_tool_path, SysPath.PATCHES_TOOL_TAR_GZ)
        if os.path.isfile(full_path_of_patches_tool):
            self.remove_tar_patches_tool(full_path_of_patches_tool)
        utils.make_tarfile(full_path_of_patches_tool, patches_tool_path)
        log.info('Success to tar patches_tool: <%s>' % full_path_of_patches_tool)

        return full_path_of_patches_tool
Example #3
def pickle2plaintext(testing=False, option="cleanest"):
    """ Converted ODIN IGTs from the .pk file into tab-delimited plaintexts."""
    # Makes a temp output directory for the individual files.
    TEMPODIN_DIR = "./tmpodin/"  # for saving the temp udhr files.
    if not os.path.exists(TEMPODIN_DIR):
        os.makedirs(TEMPODIN_DIR)

    for language, documents in sorted(load_odin_pickle()):
        tab_igts = []
        for d in documents:
            if d[0].strip() == "":
                continue
            src = remove_tags(d[0])
            # Removes heading bullets, e.g. (1)... | 1) | ( 12 ) | i. ... | A2. ...
            src = re.sub(r"^\(?\s?\w{1,5}\s*[):.]\s*", "", src)
            src = re.sub(r"^\(?\w{1,5}\s*[):.]\s*", "", src)
            src = re.sub(r"^\(?\w{1,5}\s*[):.]\s*", "", src)
            morphemes = src
            # Joins the morphemes up into words.
            words = re.sub(" *- *", "", src)

            if option == "cleanest":  # Accepts only IGTs without punctuation.
                if src == "" or any(i for i in string.punctuation if i in src):
                    continue
            elif option == "cleaner":  # Removes the example number at the end.
                patterns = [r"\(.{1,}\)", r"[\(\)]"]
                for pat in patterns:
                    src = re.sub(pat, "", src)
            else:  # Accepts IGTs as they are.
                if src == "":
                    continue

            # src, eng, gloss, cite = d[0], d[1], d[2], d[3]
            tab_igts.append([words, morphemes, remove_tags(d[1]), remove_tags(d[2]), d[3]])
        if len(tab_igts) > 0:
            with codecs.open(TEMPODIN_DIR + "odin-" + language + ".txt", "w", "utf8") as fout:
                for igt in tab_igts:
                    print >> fout, "\t".join(igt)

        if testing:
            break

    if testing:
        # Compress the ODIN plaintext files into a single tarfile in the test dir.
        try:
            make_tarfile("test/odin-" + option + ".tar", TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile("../test/odin-" + option + ".tar", TEMPODIN_DIR)
    else:
        # Compresses the ODIN plaintext files into a single tarfile.
        try:
            make_tarfile("../data/odin/odin-" + option + ".tar", TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile("../data/odin/odin-" + option + ".tar", TEMPODIN_DIR)
    # Remove the temporary ODIN directory.
    shutil.rmtree(TEMPODIN_DIR)
Example #4
def clean_wikipedia(wiki_raw_dir):
    '''Clean all files in wiki_raw_dir and write clean files into
       ../data/wikipedia/clean/'''
    if not os.path.exists('../data/wikipedia/'):
        os.makedirs('../data/wikipedia/')

    WIKIPEDIA_CLEAN_DIR = '../data/wikipedia/clean/'
    TEMP_WIKIPEDIA_CLEAN_DIR = tempfile.mkdtemp()

    for root, dirnames, filenames in os.walk(wiki_raw_dir):
        for filename in filenames:
          filepath = os.path.join(root, filename)
          
          # get number for language file
          count = re.search('wiki_([\d]+).bz2', filepath).group(1)  

          # get language code from filepath
          language = re.search('\/([\w]+)wiki-', filepath).group(1)
  
          if not os.path.exists('../data/wikipedia/clean/' + language):
              os.makedirs('../data/wikipedia/clean/' + language)

          print('cleaning ' + filepath)
          with bz2.BZ2File(filepath, 'r') as openZip:
              f = openZip.read()
              
              # closing ref tags without a corresponding opening tag are a 
              # problem for BeautifulSoup3
              #uni_f = re.sub('</[^d]+.*?>', '', f)
              #uni_f = re.sub('</br', '', uni_f)
              
              uni_f = re.sub('<!\[', '', f)
              soup = BeautifulSoup('<docs>' + uni_f + '</docs>')
              doclist = soup.findAll('doc')

              with codecs.open(TEMP_WIKIPEDIA_CLEAN_DIR + '/' + language + '_'
                               + str(count), 'a', 'utf-8') as out:

                  for doc in doclist:
                      content = doc.getText()
                      cleancontent = clean(content.strip())
                      out.write(cleancontent.strip() + '\n')

                  make_tarfile(WIKIPEDIA_CLEAN_DIR + language + '/' + language
                               + '_' + str(count) + '.tar', 
                               TEMP_WIKIPEDIA_CLEAN_DIR + '/' + language + '_'
                               + str(count))
Example #5
def get_from_unicodedotorg(testing=False):
    """ Crawl and clean UDHR files from www.unicode.org . """
    TEMP_RAW_DIR = tempfile.mkdtemp()
    UDHR_DOWNLOAD = 'http://www.unicode.org/udhr/d/'
    AHREF_REGEX = '<a href="?\'?([^"\'>]*)'

    # Makes a temp output directory for the files that can be converted into utf8.
    UDHR_UTF8_DIR = './udhr-utf8/'  # for saving the temp udhr files.
    if not os.path.exists(UDHR_UTF8_DIR):
        os.makedirs(UDHR_UTF8_DIR)
    # Get the directory page from the www.unicode.org UDHR page
    unicode_page = urllib.urlopen(UDHR_DOWNLOAD).read()
    # Crawls the www.unicode.org page for all udhr txt files.
    for i in re.findall(AHREF_REGEX, unicode_page):
        if i.endswith('.txt'):
            print UDHR_DOWNLOAD + i
            urllib.urlretrieve(UDHR_DOWNLOAD + i, filename=TEMP_RAW_DIR + i)
            with io.open(TEMP_RAW_DIR + i, 'r', encoding='utf8') as udhrfile:
                # Gets the language from the end of the file line.
                lang = udhrfile.readline().partition('-')[2].strip()
                # Gets the language code from the filename.
                langcode = i.partition('.')[0].partition('_')[2]
                # Skip the header lines.
                for _ in range(5):
                    udhrfile.readline()
                # Reads the rest of the lines and that's the udhr data.
                the_rest = udhrfile.readlines()
                data = "\n".join(
                    [i.strip() for i in the_rest if i.strip() != ''])
                ##print langcode, data.split('\n')[0]
                with codecs.open(UDHR_UTF8_DIR + 'udhr-' + langcode + '.txt',
                                 'w', 'utf8') as outfile:
                    print >> outfile, data
            if testing:
                break

    if testing:
        # Compress the utf8 UDHR files into a single tarfile in the test dir.
        try:
            make_tarfile('../test/udhr-unicode.tar', UDHR_UTF8_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile('../../test/udhr-unicode.tar', UDHR_UTF8_DIR)

    else:
        # Compresses the utf8 UDHR files into a single tarfile.
        try:
            make_tarfile('../data/udhr/udhr-unicode.tar', UDHR_UTF8_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile('../../data/udhr/udhr-unicode.tar', UDHR_UTF8_DIR)
    # Remove the udhr-utf8 directory.
    shutil.rmtree(UDHR_UTF8_DIR)
Example #6
def get_from_unicodedotorg(testing=False):
  """ Crawl and clean UDHR files from www.unicode.org . """
  TEMP_RAW_DIR = tempfile.mkdtemp()
  UDHR_DOWNLOAD = 'http://www.unicode.org/udhr/d/'
  AHREF_REGEX = '<a href="?\'?([^"\'>]*)'
  
  # Makes a temp output directory for the files that can be converted into utf8.
  UDHR_UTF8_DIR = './udhr-utf8/' # for saving the temp udhr files.
  if not os.path.exists(UDHR_UTF8_DIR):
    os.makedirs(UDHR_UTF8_DIR)
  # Get the directory page from the www.unicode.org UDHR page
  unicode_page = urllib.urlopen(UDHR_DOWNLOAD).read()
  # Crawls the www.unicode.org page for all udhr txt files.
  for i in re.findall(AHREF_REGEX,unicode_page):
    if i.endswith('.txt'):
      print UDHR_DOWNLOAD+i
      urllib.urlretrieve(UDHR_DOWNLOAD+i, filename=TEMP_RAW_DIR+i)
      with io.open(TEMP_RAW_DIR+i,'r',encoding='utf8') as udhrfile:
        # Gets the language from the end of the file line.
        lang = udhrfile.readline().partition('-')[2].strip()
        # Gets the language code from the filename.
        langcode = i.partition('.')[0].partition('_')[2]
        # Skip the header lines.
        for _ in range(5):
          udhrfile.readline()
        # Reads the rest of the lines and that's the udhr data.
        the_rest = udhrfile.readlines()
        data = "\n".join([i.strip() for i in the_rest if i.strip() != ''])
        ##print langcode, data.split('\n')[0]
        with codecs.open(UDHR_UTF8_DIR+'udhr-'+langcode+'.txt','w','utf8') as outfile:
          print>>outfile, data
      if testing:
        break

  if testing:
    # Compress the utf8 UDHR files into a single tarfile in the test dir.
    try:
      make_tarfile('../test/udhr-unicode.tar',UDHR_UTF8_DIR)
    except IOError:
      # if function is called within the sugarlike/src/universalcorpus dir
      # To move up directory to access sugarlike/data/ and sugarlike/test/.
      make_tarfile('../../test/udhr-unicode.tar',UDHR_UTF8_DIR)

  else:
    # Compresses the utf8 UDHR files into a single tarfile.
    try:
      make_tarfile('../data/udhr/udhr-unicode.tar',UDHR_UTF8_DIR)
    except IOError:
      # if function is called within the sugarlike/src/universalcorpus dir
      # To move up directory to access sugarlike/data/ and sugarlike/test/.
      make_tarfile('../../data/udhr/udhr-unicode.tar',UDHR_UTF8_DIR)  
  # Remove the udhr-utf8 directory.
  shutil.rmtree(UDHR_UTF8_DIR)
Example #7
def main(args):
    if not args.skip_extractor:
        extractor = Extractor(video_folder=args.video_folder,
                              pretrained_vibe=args.pretrained_vibe,
                              pretrained_spin=args.pretrained_spin,
                              output_folder=args.extractor_results_folder,
                              render=args.render_extractor_results,
                              tracking_method=args.tracking_method,
                              staf_dir=args.staf_dir,
                              run_smplify=args.run_smplify)
        extractor.run()

    synthesiser = Synthesiser(blender=args.blender,
                              debug_blender=args.debug_blender,
                              motion_path=args.extractor_results_folder,
                              target_size=args.target_size,
                              num_frames=args.num_frames)
    synthesiser.run()

    make_tarfile(args.output, 'output')

    logging.info('Done.')
Example #8
def rename_omniglotphrase_tarfile(intarfile):
    """ Rename the files and use ISO codes instead of full language names. """
    TEMP_DIR = tempfile.mkdtemp()
    with tarfile.open(intarfile) as tf:
        for member in tf.getmembers():
            tf.extract(member, TEMP_DIR)

    TEMP_OUT_DIR = tempfile.mkdtemp()
    for infile in os.listdir(TEMP_DIR):
        _, lang = infile.split(".")
        lang = lang.split("_")[0]
        isocode = langiso(lang)
        if len(isocode) > 0:
            with codecs.open(TEMP_DIR + "/" + infile, "r", "utf8") as fin:
                fout = codecs.open(TEMP_OUT_DIR + "/omniglotphrase-" + isocode[0] + ".txt", "w", "utf8")
                for line in fin:
                    try:
                        eng, src = line.strip().split("\t")
                        print >> fout, src + "\t" + eng
                    except ValueError:
                        print lang, line
                        pass
    make_tarfile("../../data/omniglot/omniglotphrases.tar", TEMP_OUT_DIR + "/")
Example #9
def rename_omniglotphrase_tarfile(intarfile):
    """ Rename the files and use ISO codes instead of full language names. """
    TEMP_DIR = tempfile.mkdtemp()
    with tarfile.open(intarfile) as tf:
        for member in tf.getmembers():
            tf.extract(member, TEMP_DIR)

    TEMP_OUT_DIR = tempfile.mkdtemp()
    for infile in os.listdir(TEMP_DIR):
        _, lang = infile.split('.')
        lang = lang.split('_')[0]
        isocode = langiso(lang)
        if len(isocode) > 0:
            with codecs.open(TEMP_DIR + '/' + infile, 'r', 'utf8') as fin:
                fout = codecs.open(TEMP_OUT_DIR+'/omniglotphrase-'+isocode[0]+'.txt',\
                                   'w','utf8')
                for line in fin:
                    try:
                        eng, src = line.strip().split('\t')
                        print >> fout, src + "\t" + eng
                    except ValueError:
                        print lang, line
                        pass
    make_tarfile('../../data/omniglot/omniglotphrases.tar', TEMP_OUT_DIR + "/")
Example #10
def clean_wikipedia(wiki_raw_dir, option = "firstfile"):
    '''
    Clean all files in wiki_raw_dir and write clean files into
    data/wikipedia/clean/ .
    Options:
    - firstfile: cleans and stores only one folder (AA) per language. For 
      "normal" WikiExtractor setting, this corresponds to 100 files with
      5000K each. Currently this means that for the 20 most frequent
      languages (see http://meta.wikimedia.org/wiki/List_of_Wikipedias), part
      of the data is ignored.
    - all: cleans and stores all folders 
    '''
    c = 1
    skippedcount = 1

    if not os.path.exists(wiki_raw_dir):
        print('no such path: ' + wiki_raw_dir)

    if not os.path.exists('data/wikipedia/'):
        os.makedirs('data/wikipedia/')

    WIKIPEDIA_CLEAN_DIR = 'data/wikipedia/clean/'
    TEMP_WIKIPEDIA_CLEAN_DIR = tempfile.mkdtemp()

    for root, dirnames, filenames in os.walk(wiki_raw_dir):
        for filename in filenames:
          filepath = os.path.join(root, filename)

          # get number for language file and in case of option=firstfile
          # skip all files that are not in a AA folder
          count = re.search('wiki_([\d]+).bz2', filepath).group(1)
          if option == "firstfile" and not 'AA/wiki' in filepath:
              if count == '00' and 'AB/wiki' in filepath:
                  print('[option=firstfile] More files available ' + str(skippedcount) + ': ' + filepath)
                  skippedcount += 1
              continue
          
          language = get_iso(filepath)
          if language == None:
              continue            
  
          if not os.path.exists('data/wikipedia/clean/' + language):
              os.makedirs('data/wikipedia/clean/' + language)

          print('cleaning file ' + str(c) + ': ' + filepath)
          c += 1
          with bz2.BZ2File(filepath, 'r') as openZip:
              f = openZip.read()
              
              # closing ref tags without a corresponding opening tag are a 
              # problem for BeautifulSoup3
              #uni_f = re.sub('</[^d]+.*?>', '', f)
              #uni_f = re.sub('</br', '', uni_f)
              
              uni_f = re.sub('<!\[', '', f)
              soup = BeautifulSoup('<docs>' + uni_f + '</docs>')
              doclist = soup.findAll('doc')

          with codecs.open(TEMP_WIKIPEDIA_CLEAN_DIR + '/' + language + '_'
                               + str(count), 'a', 'utf-8') as out:

              for doc in doclist:
                  content = doc.getText()
                  cleancontent = clean(content.strip())
                  out.write(cleancontent.strip() + '\n')

              make_tarfile(WIKIPEDIA_CLEAN_DIR + language + '/' + language 
                  + '_' + str(count) + '.tar', TEMP_WIKIPEDIA_CLEAN_DIR + '/'
                  + language + '_' + str(count))
Example #11
def get_phrases(with_mp3=False,testing=False):
  """ Gets phrases list from Omniglot. """
  # Downloads and open the phrases index.htm on Omniglot.
  phrase_lang = urllib2.urlopen(MULTILING_URLS['phrase_lang']).read()
  
  # Makes a temp output directory to the phrases files.
  outputdir= DATADIR+'omniglot-temp/'
  if not os.path.exists(outputdir):
    os.makedirs(outputdir)
    
  for link in re.findall(AHREF_REGEX,phrase_lang):
    # Finds all link for the phrases page for each language.
    if '/language/phrases/' in link and not link.endswith('index.htm'):
      # Get name of language in English.
      langname = link.rpartition('/')[2].strip().rpartition('.')[0]
      # Create a textfile for the output.
      outfile = codecs.open(outputdir+'omniglotphrases-'+langname+'.txt', \
                            'w','utf8')
      # Finds the section that starts with <div id="unicode">
      soup = bs(urllib2.urlopen(OMNIGLOT+link).read()).findAll(id='unicode')[0]
      # Get name of language in the particular language.
      langname2 = bs(str(soup.findAll('th')[1])).text
      all_phrases = defaultdict(list)
      
      # Each <tr>...</tr> is a phrase in the table.
      phrasetable = soup.findAll('tr')
      for phrases in phrasetable:
        try:
          # Each <td>...</td> is a column in the <tr/>.
          eng,phrase =  bs(unicode(phrases)).findAll('td')
          eng = str(eng.text)
          if with_mp3:
            # Maps the phrase to the corresponding mp3.
            phrase_mp3 = zip([i.strip() for i in \
                              unicode(phrase.text).split('\n') if i != ''],
                             re.findall(AHREF_REGEX,str(phrase)))
            all_phrases[eng]+=phrase_mp3
          else:
            all_phrases[eng]+=[i.strip() for i in \
                          unicode(phrase.text).split('\n') if i.strip() != '']
        except ValueError:
          pass
        
      # Outputs to file.
      for gloss in all_phrases:
        eng = gloss.replace('\n  ',' ').strip()
        repls ={'todance':'to dance', 'Christmasand':'Christmas and',
                'ladywill':'lady will','hovercraftis':'hovercraft is',
                'languageis':'language is'}
        eng = reduce(lambda a, kv: a.replace(*kv), repls.iteritems(), eng)
        
        
        for trg in all_phrases[gloss]:
          if type(trg) is tuple:
            trg = "\t".join(trg)
          print>>outfile, eng+"\t"+trg+"\t"+OMNIGLOT+link
          print eng+"\t"+trg+"\t"+OMNIGLOT+link
      if testing: # only process one page if testing.
        break        
      time.sleep(random.randrange(5,10))
  
  if testing:
    # Compresses the omniglot phrases files into the tarfile in the test dir.
    try:
      make_tarfile(TESTDIR+'omniglot-phrases.tar',outputdir)
    except IOError:
      make_tarfile("../"+TESTDIR+'omniglot-phrases.tar',outputdir)
  else:
    # Compresses the omniglot phrases files into a single tarfile.    
    try:
      make_tarfile(DATADIR+'omniglot/omniglot-phrases.tar',outputdir)
    except IOError:
      make_tarfile("../"+DATADIR+'omniglot/omniglot-phrases.tar',outputdir)
    
  # Remove the temp phrases directory.
  try:
    shutil.rmtree(outputdir) 
  except WindowsError:
    # If windows complain, glob through and remove file individually.
    import glob
    for f in glob.glob(outputdir):
      os.remove(f)
Example #12
def get_phrases(with_mp3=False, testing=False):
    """ Gets phrases list from Omniglot. """
    # Downloads and open the phrases index.htm on Omniglot.
    phrase_lang = urllib2.urlopen(MULTILING_URLS['phrase_lang']).read()

    # Makes a temp output directory to the phrases files.
    outputdir = DATADIR + 'omniglot-temp/'
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    for link in re.findall(AHREF_REGEX, phrase_lang):
        # Finds all link for the phrases page for each language.
        if '/language/phrases/' in link and not link.endswith('index.htm'):
            # Get name of language in English.
            langname = link.rpartition('/')[2].strip().rpartition('.')[0]
            # Create a textfile for the output.
            outfile = codecs.open(outputdir+'omniglotphrases-'+langname+'.txt', \
                                  'w','utf8')
            # Finds the section that starts with <div id="unicode">
            soup = bs(urllib2.urlopen(OMNIGLOT +
                                      link).read()).findAll(id='unicode')[0]
            # Get name of language in the particular language.
            langname2 = bs(str(soup.findAll('th')[1])).text
            all_phrases = defaultdict(list)

            # Each <tr>...</tr> is a phrase in the table.
            phrasetable = soup.findAll('tr')
            for phrases in phrasetable:
                try:
                    # Each <td>...</td> is a column in the <tr/>.
                    eng, phrase = bs(unicode(phrases)).findAll('td')
                    eng = str(eng.text)
                    if with_mp3:
                        # Maps the phrase to the corresponding mp3.
                        phrase_mp3 = zip([i.strip() for i in \
                                          unicode(phrase.text).split('\n') if i != ''],
                                         re.findall(AHREF_REGEX,str(phrase)))
                        all_phrases[eng] += phrase_mp3
                    else:
                        all_phrases[eng]+=[i.strip() for i in \
                                      unicode(phrase.text).split('\n') if i.strip() != '']
                except ValueError:
                    pass

            # Outputs to file.
            for gloss in all_phrases:
                eng = gloss.replace('\n  ', ' ').strip()
                repls = {
                    'todance': 'to dance',
                    'Christmasand': 'Christmas and',
                    'ladywill': 'lady will',
                    'hovercraftis': 'hovercraft is',
                    'languageis': 'language is'
                }
                eng = reduce(lambda a, kv: a.replace(*kv), repls.iteritems(),
                             eng)

                for trg in all_phrases[gloss]:
                    if type(trg) is tuple:
                        trg = "\t".join(trg)
                    print >> outfile, eng + "\t" + trg + "\t" + OMNIGLOT + link
                    print eng + "\t" + trg + "\t" + OMNIGLOT + link
            if testing:  # only process one page if testing.
                break
            time.sleep(random.randrange(5, 10))

    if testing:
        # Compresses the omniglot phrases files into the tarfile in the test dir.
        try:
            make_tarfile(TESTDIR + 'omniglot-phrases.tar', outputdir)
        except IOError:
            make_tarfile("../" + TESTDIR + 'omniglot-phrases.tar', outputdir)
    else:
        # Compresses the omniglot phrases files into a single tarfile.
        try:
            make_tarfile(DATADIR + 'omniglot/omniglot-phrases.tar', outputdir)
        except IOError:
            make_tarfile("../" + DATADIR + 'omniglot/omniglot-phrases.tar',
                         outputdir)

    # Remove the temp phrases directory.
    try:
        shutil.rmtree(outputdir)
    except WindowsError:
        # If windows complain, glob through and remove file individually.
        import glob
        for f in glob.glob(outputdir):
            os.remove(f)
Example #13
from utils import make_tarfile, upload_file, file_exists_in_bucket, purge_backups
import os
import sys
import datetime as dt

log = open("backup.log", "a")
sys.stdout = log

DIRECTORY = os.environ['BACKUP_DIR']

output_path = make_tarfile(DIRECTORY)
print('Starting job @ {}'.format(dt.datetime.now()))
print('Successfully compressed {} into {}'.format(DIRECTORY, output_path))
print('Beginning upload...')

upload_file(output_path)

if not file_exists_in_bucket(output_path):
    print('\nVerification of file upload to s3 failed...exiting')
    exit()

print('\nVerified file upload to s3...Cleaning up')
print('Removing backup archive {}'.format(output_path))
os.remove(output_path)

purge_backups()
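Unlike the earlier examples, this backup script expects a one-argument make_tarfile that chooses the archive name itself and returns the resulting path. A minimal sketch of such a variant (the timestamped naming scheme is an assumption, not taken from the script's actual utils module) might be:

import datetime as dt
import os
import tarfile

def make_tarfile(source_dir):
    # Archive source_dir into a timestamped .tar.gz in the working directory and return its path.
    base = os.path.basename(os.path.normpath(source_dir))
    output_path = '{}_{}.tar.gz'.format(base, dt.datetime.now().strftime('%Y%m%d-%H%M%S'))
    with tarfile.open(output_path, 'w:gz') as tar:
        tar.add(source_dir, arcname=base)
    return output_path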
Example #14
def pickle2plaintext(testing=False, option='cleanest'):
    """ Converted ODIN IGTs from the .pk file into tab-delimited plaintexts."""
    # Makes a temp output directory for the individual files.
    TEMPODIN_DIR = './tmpodin/'  # for saving the temp udhr files.
    if not os.path.exists(TEMPODIN_DIR):
        os.makedirs(TEMPODIN_DIR)

    for language, documents in sorted(load_odin_pickle()):
        tab_igts = []
        for d in documents:
            if d[0].strip() == "": continue
            src = remove_tags(d[0])
            # Removes heading bullets, e.g. (1)... | 1) | ( 12 ) | i. ... | A2. ...
            src = re.sub(r'^\(?\s?\w{1,5}\s*[):.]\s*', '', src)
            src = re.sub(r'^\(?\w{1,5}\s*[):.]\s*', '', src)
            src = re.sub(r'^\(?\w{1,5}\s*[):.]\s*', '', src)
            morphemes = src
            # Joins the morphemes up into words.
            words = re.sub(' *- *', '', src)

            if option == 'cleanest':  # Accepts only IGTs without punctuation.
                if src == '' or any(i for i in string.punctuation if i in src):
                    continue
            elif option == 'cleaner':  # Removes the example number at the end.
                patterns = [r"\(.{1,}\)", r"[\(\)]"]
                for pat in patterns:
                    src = re.sub(pat, '', src)
            else:  # Accepts IGTs as they are.
                if src == '':
                    continue

            # src, eng, gloss, cite = d[0], d[1], d[2], d[3]
            tab_igts.append([words, morphemes, remove_tags(d[1]), \
                  remove_tags(d[2]), d[3]])
        if len(tab_igts) > 0:
            with codecs.open(TEMPODIN_DIR + 'odin-' + language + '.txt', 'w',
                             'utf8') as fout:
                for igt in tab_igts:
                    print >> fout, "\t".join(igt)

        if testing:
            break

    if testing:
        # Compress the ODIN plaintext files into a single tarfile in the test dir.
        try:
            make_tarfile('test/odin-' + option + '.tar', TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile('../test/odin-' + option + '.tar', TEMPODIN_DIR)
    else:
        # Compresses the ODIN plaintext files into a single tarfile.
        try:
            make_tarfile('../data/odin/odin-' + option + '.tar', TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile('../data/odin/odin-' + option + '.tar', TEMPODIN_DIR)
    # Remove the temporary ODIN directory.
    shutil.rmtree(TEMPODIN_DIR)