コード例 #1
0
ファイル: generate.py プロジェクト: bhack/Dali
def encrypt_archive(archive_name, directory_name):
    """Encrypt the data directory unless a plausible archive already exists.

    An existing archive file smaller than 100 bytes is reported and an
    ``rm`` command for it is handed to ``execute_bash``. Afterwards, if no
    archive file is present and ``directory_name`` is an existing directory,
    ``sh <ENCRYPT_SCRIPT> encrypt`` is handed to ``execute_bash``.

    Args:
        archive_name: Path of the archive file to inspect.
        directory_name: Path of the directory that must exist for the
            encryption step to be attempted.
    """
    min_plausible_size = 100  # bytes; smaller archives are discarded

    if exists(archive_name) and isfile(archive_name):
        size_in_bytes = stat(archive_name).st_size
        if size_in_bytes < min_plausible_size:
            warning = "Present archive has wrong size (%d bytes). Removing." % (size_in_bytes,)
            print(yellowify(warning))
            execute_bash("rm %s" % (archive_name,))

    # The archive may have been removed just above, so check it again.
    if exists(archive_name) and isfile(archive_name):
        return
    if not (exists(directory_name) and isdir(directory_name)):
        return

    print(boldify("Encrypting data"))
    execute_bash("sh %s encrypt" % (ENCRYPT_SCRIPT))
    print(greenify("Successfully Encrypted data"))
コード例 #2
0
def encrypt_archive(archive_name, directory_name):
    """Create the encrypted archive when no usable one is present.

    If ``archive_name`` is an existing file of fewer than 100 bytes, a
    warning is printed and ``rm <archive_name>`` is passed to
    ``execute_bash``. If, after that, ``archive_name`` is not an existing
    file and ``directory_name`` is an existing directory,
    ``sh <ENCRYPT_SCRIPT> encrypt`` is passed to ``execute_bash``.

    Args:
        archive_name: Path of the archive file to inspect.
        directory_name: Path of the directory whose presence gates the
            encryption step.
    """

    def archive_is_file():
        # Evaluated twice on purpose: the archive can vanish in between.
        return exists(archive_name) and isfile(archive_name)

    if archive_is_file():
        archive_bytes = stat(archive_name).st_size
        if archive_bytes < 100:
            message = "Present archive has wrong size (%d bytes). Removing." % (archive_bytes,)
            print(yellowify(message))
            execute_bash("rm %s" % (archive_name,))

    needs_encryption = not archive_is_file()
    if needs_encryption and exists(directory_name) and isdir(directory_name):
        print(boldify("Encrypting data"))
        execute_bash("sh %s encrypt" % (ENCRYPT_SCRIPT))
        print(greenify("Successfully Encrypted data"))
コード例 #3
0
ファイル: generate.py プロジェクト: bhack/Dali
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath


# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script; the paths below are built inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Remote archive, the local path it is downloaded to, and the directory it
# is unzipped into.
ZIP_URL = 'http://cs224d.stanford.edu/assignment2/assignment2.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'assignment2.zip')
UNZIPPED_LOCAL = join(THIS_DATA_DIR, "assignment2")

def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted.

    Args:
        paths: Iterable of paths to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)


if __name__ == '__main__':
    # (file name inside the unzipped "data/ner" folder, name it receives in
    # THIS_DATA_DIR)
    ner_splits = [
        ("train", "train.txt"),
        ("test.masked", "test.txt"),
        ("dev", "dev.txt"),
    ]
    local_files = [join(THIS_DATA_DIR, local_name) for _, local_name in ner_splits]

    # Remove the archive, a "trees" entry and the previous output files.
    leftovers = [ZIP_LOCAL, join(THIS_DATA_DIR, "trees")]
    leftovers.extend(local_files)
    delete_paths(leftovers)

    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, UNZIPPED_LOCAL))

    for extracted_name, local_name in ner_splits:
        source = join(UNZIPPED_LOCAL, "data", "ner", extracted_name)
        destination = join(THIS_DATA_DIR, local_name)
        execute_bash('mv %s %s' % (source, destination))

    # The archive and the extracted tree are no longer needed.
    delete_paths([ZIP_LOCAL, UNZIPPED_LOCAL])
コード例 #4
0
ファイル: generate.py プロジェクト: bzcheeseman/Dali
from utils import print_progress, execute_bash

# Directory containing this script; the paths below are built inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Remote archive, the local path it is downloaded to, and the directory it
# is unzipped into.
ZIP_URL = 'http://cs224d.stanford.edu/assignment2/assignment2.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'assignment2.zip')
UNZIPPED_LOCAL = join(THIS_DATA_DIR, "assignment2")


def delete_paths(paths):
    """Pass an ``rm -rf`` command to ``execute_bash`` for every given path.

    Paths are placed into the command string without any quoting.

    Args:
        paths: Iterable of paths to remove.
    """
    for doomed in paths:
        command = 'rm -rf %s' % (doomed, )
        execute_bash(command)


if __name__ == '__main__':
    # Folder inside the unzipped archive that holds the NER files.
    ner_dir = join(UNZIPPED_LOCAL, "data", "ner")
    # extracted file name -> final file name inside THIS_DATA_DIR
    moves = (
        ("train", "train.txt"),
        ("test.masked", "test.txt"),
        ("dev", "dev.txt"),
    )
    local_files = [join(THIS_DATA_DIR, final_name) for _, final_name in moves]

    # Clear the archive, a "trees" entry and earlier outputs first.
    delete_paths([ZIP_LOCAL, join(THIS_DATA_DIR, "trees")] + local_files)

    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, UNZIPPED_LOCAL))

    for extracted_name, final_name in moves:
        execute_bash('mv %s %s' % (join(ner_dir, extracted_name),
                                   join(THIS_DATA_DIR, final_name)))

    # Drop the download and the extracted tree once the files are moved.
    delete_paths([ZIP_LOCAL, UNZIPPED_LOCAL])
コード例 #5
0
def decrypt_archive():
    """Print a banner and pass ``sh <ENCRYPT_SCRIPT> decrypt`` to ``execute_bash``."""
    banner = boldify("Decrypting data")
    print(banner)
    decrypt_command = "sh %s decrypt" % (ENCRYPT_SCRIPT)
    execute_bash(decrypt_command)
コード例 #6
0
ファイル: generate.py プロジェクト: bhack/Dali
import random
import sys

from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script; the download is stored inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Remote archive and the local path it is downloaded to.
ZIP_URL = 'http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'trainDevTestTrees_PTB.zip')

def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted.

    Args:
        paths: Iterable of paths to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)


if __name__ == '__main__':
    # The archive unpacks into a "trees" folder inside THIS_DATA_DIR.
    trees_dir = join(THIS_DATA_DIR, "trees")
    split_files = [join(THIS_DATA_DIR, split_name)
                   for split_name in ('train.txt', 'test.txt', 'dev.txt')]

    # Remove the archive, the extracted folder and earlier outputs.
    delete_paths([ZIP_LOCAL, trees_dir] + split_files)

    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, THIS_DATA_DIR))

    # Move everything out of "trees" so the files sit next to this script.
    extracted_entries = join(trees_dir, "*")
    execute_bash('mv %s %s' % (extracted_entries, THIS_DATA_DIR))

    delete_paths([ZIP_LOCAL, trees_dir])
コード例 #7
0
ファイル: generate.py プロジェクト: bzcheeseman/Dali
def delete_paths(paths):
    """Pass an ``rm -rf`` command to ``execute_bash`` for every given path.

    Paths are placed into the command string without any quoting.

    Args:
        paths: Iterable of paths to remove.
    """
    for doomed in paths:
        command = 'rm -rf %s' % (doomed, )
        execute_bash(command)
コード例 #8
0
ファイル: generate.py プロジェクト: bhack/Dali
]

# Scratch directory ("tmp" inside THIS_DATA_DIR) that temporarily receives
# each dataset's *.txt files before they are moved to their final directory.
temp_dir = join(THIS_DATA_DIR, "tmp")

def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted.

    Args:
        paths: Iterable of paths to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)

if __name__ == '__main__':
    # Make sure the scratch directory exists before anything is moved into it.
    if not exists(temp_dir):
        makedirs(temp_dir)

    # delete existing downloads
    extracted_dirs = [out for _, _, out, _ in urls]
    archives = [path for _, path, _, _ in urls]
    final_dirs = [newdir for _, _, _, newdir in urls]
    delete_paths(extracted_dirs + archives + final_dirs)

    # download new ones
    for url, path, out, newdir in urls:
        execute_bash("wget -O {path} {url}".format(path=path, url=url))
        execute_bash("tar -xf %s -C %s" % (path, THIS_DATA_DIR))

        # The readme is removed first so the "*.txt" move below skips it.
        readme = join(out, "00-readme.txt")
        execute_bash("rm %s" % readme)

        extracted_texts = join(out, "*.txt")
        execute_bash("mv {dataset_files} {tmp_dir}".format(
            dataset_files=extracted_texts, tmp_dir=temp_dir))
        delete_paths([out])

        if not exists(newdir):
            makedirs(newdir)
        staged_texts = join(temp_dir, "*.txt")
        execute_bash("mv {tmp_files} {newdir}".format(
            tmp_files=staged_texts, newdir=newdir))

    # Remove the scratch directory and every downloaded archive.
    delete_paths([temp_dir] + [path for _, path, _, _ in urls])
コード例 #9
0
def cleanup():
    """Pass an ``rm -rf`` for ``wikianswer.tar.gz`` and ``<DATASET_DIR>*`` to ``execute_bash``."""
    stale_targets = 'wikianswer.tar.gz %s*' % (DATASET_DIR,)
    execute_bash('rm -rf ' + stale_targets)
コード例 #10
0
# Name the extracted "Question_Answer_Dataset*" directory is renamed to; it
# is a relative path, so it resolves against the working directory.
DATASET_DIR = 'Question_Answer_Dataset'
# File opened inside each subdirectory of DATASET_DIR whose name starts with 'S'.
DATASET_FILE = 'question_answer_pairs.txt'
# NOTE(review): not referenced in the visible part of this script; presumably
# the minimum answer length kept when filtering -- confirm at the use site.
MIN_ANSWER_LENGTH = 1

# Encoding passed to open() when reading each dataset file.
WINDOWS_ENCODING = 'latin-1'

def cleanup():
    """Pass an ``rm -rf`` for ``wikianswer.tar.gz`` and ``<DATASET_DIR>*`` to ``execute_bash``.

    Both targets are relative, so they resolve against the working
    directory of the executed command.
    """
    stale_targets = 'wikianswer.tar.gz %s*' % (DATASET_DIR,)
    execute_bash('rm -rf ' + stale_targets)


if __name__ == '__main__':
    cleanup()

    execute_bash('wget -O wikianswer.tar.gz %s' % (TARBALL,))
    execute_bash('tar -xz -f wikianswer.tar.gz')
    execute_bash('mv Question_Answer_Dataset* %s' % (DATASET_DIR,))

    directories = [d for d in listdir(DATASET_DIR)
                   if isdir(join(DATASET_DIR, d)) and d.startswith('S')]

    assert len(directories) > 0

    num_nonascii = 0
    num_too_short = 0
    output_content = []
    for d in [join(DATASET_DIR, d) for d in directories]:
        dataset_file = join(d, DATASET_FILE)
        assert isfile(dataset_file)
        with open(dataset_file, newline='', encoding=WINDOWS_ENCODING) as f:
コード例 #11
0

def delete_paths(paths):
    """Pass an ``rm -rf`` command to ``execute_bash`` for every given path.

    Paths are placed into the command string without any quoting, so shell
    patterns contained in them reach the command unchanged.

    Args:
        paths: Iterable of paths or shell patterns to remove.
    """
    for doomed in paths:
        command = 'rm -rf %s' % (doomed, )
        execute_bash(command)


if __name__ == '__main__':
    # Remove everything an earlier run may have left behind: both downloaded
    # archives, both extracted folders and the flattened *.tsv / *.ans files.
    # Each target is listed exactly once and the answers archive is cleaned
    # up symmetrically with the main archive.
    # NOTE: the "{tsv,ans}" pattern needs a shell with brace expansion (such
    # as bash) behind execute_bash.
    delete_paths([
        ZIP_LOCAL,
        ZIP_TEST_LOCAL,
        join(THIS_DATA_DIR, "MCTest"),
        join(THIS_DATA_DIR, "MCTestAnswers"),
        join(THIS_DATA_DIR, '*.{tsv,ans}'),
    ])

    # Download and unpack the dataset archive, then the answers archive.
    for archive, url in ((ZIP_LOCAL, ZIP_URL), (ZIP_TEST_LOCAL, ZIP_URL_TEST)):
        execute_bash('wget -O %s %s' % (archive, url))
        execute_bash('unzip %s -d %s' % (archive, THIS_DATA_DIR))

    # Flatten: pull the data files out of the extracted "MCTest*" folders.
    execute_bash(
        'mv %s %s' %
        (join(THIS_DATA_DIR, "MCTest*", "*.{tsv,ans}"), THIS_DATA_DIR))
    delete_paths([ZIP_LOCAL, ZIP_TEST_LOCAL, join(THIS_DATA_DIR, "MCTest*")])

    # Drop previously generated outputs before regenerating them.
    delete_paths(
        [join(THIS_DATA_DIR, TRAIN_FILE),
         join(THIS_DATA_DIR, TEST_FILE)])
    parse(THIS_DATA_DIR, TRAIN_FILE, TEST_FILE)
コード例 #12
0
ファイル: generate.py プロジェクト: bhack/Dali
# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script; all paths below are built inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Output files handed to parse(); both are already full paths.
TRAIN_FILE = join(THIS_DATA_DIR, "mc_train.txt")
TEST_FILE = join(THIS_DATA_DIR, "mc_test.txt")

# Remote archives (dataset and answers) and the local paths they are
# downloaded to.
ZIP_URL      = 'http://research.microsoft.com/en-us/um/redmond/projects/mctest/data/MCTest.zip'
ZIP_URL_TEST = 'http://research.microsoft.com/en-us/um/redmond/projects/mctest/data/MCTestAnswers.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'MCTest.zip')
ZIP_TEST_LOCAL = join(THIS_DATA_DIR, 'MCTestAnswers.zip')

def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted, so shell
    patterns contained in it reach the command unchanged.

    Args:
        paths: Iterable of paths or shell patterns to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)

if __name__ == '__main__':
    # Remove everything an earlier run may have left behind: both downloaded
    # archives, both extracted folders and the flattened *.tsv / *.ans files.
    # Each target is listed exactly once and the answers archive is cleaned
    # up symmetrically with the main archive.
    # NOTE: the "{tsv,ans}" pattern needs a shell with brace expansion (such
    # as bash) behind execute_bash.
    delete_paths([
        ZIP_LOCAL,
        ZIP_TEST_LOCAL,
        join(THIS_DATA_DIR, "MCTest"),
        join(THIS_DATA_DIR, "MCTestAnswers"),
        join(THIS_DATA_DIR, '*.{tsv,ans}'),
    ])

    # Download and unpack the dataset archive, then the answers archive.
    for archive, url in ((ZIP_LOCAL, ZIP_URL), (ZIP_TEST_LOCAL, ZIP_URL_TEST)):
        execute_bash('wget -O %s %s' % (archive, url))
        execute_bash('unzip %s -d %s' % (archive, THIS_DATA_DIR))

    # Flatten: pull the data files out of the extracted "MCTest*" folders.
    execute_bash('mv %s %s' % (join(THIS_DATA_DIR, "MCTest*", "*.{tsv,ans}"), THIS_DATA_DIR))
    delete_paths([ZIP_LOCAL, ZIP_TEST_LOCAL, join(THIS_DATA_DIR, "MCTest*")])

    # Drop previously generated outputs before regenerating them. TRAIN_FILE
    # and TEST_FILE are passed through join() unchanged from the original.
    delete_paths([join(THIS_DATA_DIR, TRAIN_FILE), join(THIS_DATA_DIR, TEST_FILE)])
    parse(THIS_DATA_DIR, TRAIN_FILE, TEST_FILE)
コード例 #13
0
ファイル: generate.py プロジェクト: bhack/Dali
import random
import sys

from os import listdir, makedirs, stat
from os.path import isdir, isfile, join, dirname, realpath, exists

# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script.
THIS_DATA_DIR = dirname(realpath(__file__))
# Local path of the downloaded file and the URL it is fetched from.
DOWNLOADED_FILE = join(THIS_DATA_DIR, "wikianswers.paraphrases.tsv.gz")
FILE_URL="https://www.dropbox.com/s/td3ionbuj80hrkb/wikianswers.paraphrases.tsv.gz?dl=0"

if __name__ == '__main__':
    # Download only when the file is not already present locally.
    if not exists(DOWNLOADED_FILE):
        download_command = "wget -O {path} {url}".format(path=DOWNLOADED_FILE,
                                                         url=FILE_URL)
        execute_bash(download_command)
        print("Downloaded file")
    else:
        print("Found file.")

コード例 #14
0
ファイル: generate.py プロジェクト: bhack/Dali
# DATA_DIR is appended to sys.path before the project imports below
# (presumably because `utils` and `midi` live there -- confirm against the
# repository layout).
sys.path.append(DATA_DIR)
from utils import execute_bash, collect_files_with_ext
from midi.utils import midiread


from scipy.io import wavfile
import numpy as np

# Directory containing this script; the .npy exports are written into it.
THIS_DATA_DIR  = dirname(realpath(__file__))
# Local path of the downloaded archive and the directory it is unzipped into.
DOWNLOADED_ZIP = join(THIS_DATA_DIR, "dataset.zip")
DOWNLOADED_DIR = join(THIS_DATA_DIR, "dataset")
# URL the archive is downloaded from.
FILE_URL="http://c4dm.eecs.qmul.ac.uk/rdr/bitstream/handle/123456789/13/Score-informed%20Piano%20Transcription%20Dataset.zip?sequence=1"

if __name__ == '__main__':
    # Fetch the archive only if an earlier run has not already done so.
    if not exists(DOWNLOADED_ZIP):
        execute_bash("wget -O {path} {url}".format(path=DOWNLOADED_ZIP, url=FILE_URL))

    # Start from a fresh extraction directory and without old .npy exports.
    if exists(DOWNLOADED_DIR) and isdir(DOWNLOADED_DIR):
        execute_bash("rm -rf %s" % (DOWNLOADED_DIR))
    stale_arrays = join(THIS_DATA_DIR, "*.npy")
    execute_bash("rm %s " % (stale_arrays))
    makedirs(DOWNLOADED_DIR)
    execute_bash("unzip %s -d %s" % (DOWNLOADED_ZIP, DOWNLOADED_DIR))

    files = collect_files_with_ext(DOWNLOADED_DIR, ".wav")

    for subpath, name in files:
        # Only .wav files whose name does not contain "Chromatic" are exported.
        if not name.endswith(".wav") or "Chromatic" in name:
            continue

        audio_target = join(THIS_DATA_DIR, name.replace(".wav", ".npy"))
        roll_target = join(THIS_DATA_DIR, name.replace(".wav", ".mid.npy"))
        # The matching MIDI file sits next to the recording as "<stem>_correct.mid".
        midi_path = str(subpath).replace(".wav", "_correct.mid")

        _rate, music = wavfile.read(subpath)
        np.save(audio_target, music)
        piece = midiread(midi_path)
        np.save(roll_target, piece.piano_roll)
コード例 #15
0
# Scratch directory ("tmp" inside THIS_DATA_DIR) that temporarily receives
# each dataset's *.txt files before they are moved to their final directory.
temp_dir = join(THIS_DATA_DIR, "tmp")


def delete_paths(paths):
    """Pass an ``rm -rf`` command to ``execute_bash`` for every given path.

    Paths are placed into the command string without any quoting.

    Args:
        paths: Iterable of paths to remove.
    """
    for doomed in paths:
        command = 'rm -rf %s' % (doomed, )
        execute_bash(command)


if __name__ == '__main__':
    # Make sure the scratch directory exists before anything is moved into it.
    if not exists(temp_dir):
        makedirs(temp_dir)

    # delete existing downloads
    extracted_dirs = [out for _, _, out, _ in urls]
    archives = [path for _, path, _, _ in urls]
    final_dirs = [newdir for _, _, _, newdir in urls]
    delete_paths(extracted_dirs + archives + final_dirs)

    # download new ones
    for url, path, out, newdir in urls:
        execute_bash("wget -O {path} {url}".format(path=path, url=url))
        execute_bash("tar -xf %s -C %s" % (path, THIS_DATA_DIR))

        # The readme is removed first so the "*.txt" move below skips it.
        readme = join(out, "00-readme.txt")
        execute_bash("rm %s" % readme)

        extracted_texts = join(out, "*.txt")
        execute_bash("mv {dataset_files} {tmp_dir}".format(
            dataset_files=extracted_texts, tmp_dir=temp_dir))
        delete_paths([out])

        if not exists(newdir):
            makedirs(newdir)
        staged_texts = join(temp_dir, "*.txt")
        execute_bash("mv {tmp_files} {newdir}".format(
            tmp_files=staged_texts, newdir=newdir))

    # Remove the scratch directory and every downloaded archive.
    delete_paths([temp_dir] + [path for _, path, _, _ in urls])
コード例 #16
0
ファイル: generate.py プロジェクト: bhack/Dali
import random
import sys

from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script; the download is stored inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Remote text file and the local path it is downloaded to.
URL = 'http://norvig.com/big.txt'
LOCAL = join(THIS_DATA_DIR, 'big.txt')

def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted.

    Args:
        paths: Iterable of paths to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)

if __name__ == '__main__':
    # Remove any earlier copy, then download the file again.
    delete_paths([LOCAL])
    fetch_command = 'wget -O %s %s' % (LOCAL, URL)
    execute_bash(fetch_command)
コード例 #17
0
ファイル: generate.py プロジェクト: NorthStar/Dali
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script; the download is stored inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Remote archive and the local path it is downloaded to.
ZIP_URL = 'http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'trainDevTestTrees_PTB.zip')


def delete_paths(paths):
    """Pass an ``rm -rf`` command to ``execute_bash`` for every given path.

    Paths are placed into the command string without any quoting.

    Args:
        paths: Iterable of paths to remove.
    """
    for doomed in paths:
        command = 'rm -rf %s' % (doomed, )
        execute_bash(command)


if __name__ == '__main__':
    # The archive unpacks into a "trees" folder inside THIS_DATA_DIR.
    trees_dir = join(THIS_DATA_DIR, "trees")
    split_files = [join(THIS_DATA_DIR, split_name)
                   for split_name in ('train.txt', 'test.txt', 'dev.txt')]

    # Remove the archive, the extracted folder and earlier outputs.
    delete_paths([ZIP_LOCAL, trees_dir] + split_files)

    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, THIS_DATA_DIR))

    # Move everything out of "trees" so the files sit next to this script.
    extracted_entries = join(trees_dir, "*")
    execute_bash('mv %s %s' % (extracted_entries, THIS_DATA_DIR))

    delete_paths([ZIP_LOCAL, trees_dir])
コード例 #18
0
                with open(dataset_fname, "rt")                                         as finputs:
                    with open(dataset_fname.replace(".input.", ".gs."), "rt")          as flabels:
                        label_lines = (line for line in flabels)
                        input_lines = (line for line in finputs)

                        for label, sentences in zip(label_lines, input_lines):
                            fout.write(sentences.strip() + "\t" + label)
                            if tokenizer_available:
                                ftokenized.write(tokenize_sentences(sentences) + "\t" + label)

def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted.

    Args:
        paths: Iterable of paths to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)

if __name__ == '__main__':
    execute_bash("rm -rf %s" % (ZIP_LOCAL,))
    execute_bash("rm -rf %s" % (UNZIPPED_LOCAL,))
    execute_bash("wget -O {path} {url}".format(url=ZIP_URL, path=ZIP_LOCAL))
    execute_bash("unzip {zipfile} -d {target}".format(zipfile=ZIP_LOCAL, target=UNZIPPED_LOCAL))


    # create test set:
    test_input_names = collect_text_files(UNZIPPED_LOCAL)
    transform_files_into_one(test_input_names, TEST_TOKENIZED_FILE)
    if not tokenizer_available:
        execute_bash("rm %s" % (TEST_TOKENIZED_FILE,))
    delete_paths([path for path, name in test_input_names])

    # untar train files:
    tar_files = collect_files_with_ext(UNZIPPED_LOCAL, ".tgz")
    for tar_file, tar_file_name in tar_files:
コード例 #19
0
ファイル: generate.py プロジェクト: bzcheeseman/Dali
import random
import sys

from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script; the download is stored inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Remote text file and the local path it is downloaded to.
URL = 'http://norvig.com/big.txt'
LOCAL = join(THIS_DATA_DIR, 'big.txt')


def delete_paths(paths):
    """Pass an ``rm -rf`` command to ``execute_bash`` for every given path.

    Paths are placed into the command string without any quoting.

    Args:
        paths: Iterable of paths to remove.
    """
    for doomed in paths:
        command = 'rm -rf %s' % (doomed, )
        execute_bash(command)


if __name__ == '__main__':
    # Remove any earlier copy, then download the file again.
    delete_paths([LOCAL])
    fetch_command = 'wget -O %s %s' % (LOCAL, URL)
    execute_bash(fetch_command)
コード例 #20
0
ファイル: generate.py プロジェクト: bzcheeseman/Dali
import random
import sys

from os import listdir
from os.path import isdir, isfile, join, dirname, realpath


# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script; the download is stored inside it.
THIS_DATA_DIR = dirname(realpath(__file__))

# Remote annotated text file and the local path used for it. Despite its
# name, UNZIPPED_LOCAL is the path of a plain "train.txt" file.
TEXT_URL = 'https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt'
UNZIPPED_LOCAL = join(THIS_DATA_DIR, "train.txt")

def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted.

    Args:
        paths: Iterable of paths to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)

if __name__ == '__main__':
    # Remove any earlier copy of the output file before downloading.
    delete_paths([UNZIPPED_LOCAL])
    # wget's -O option takes the output file as its argument; the URL to
    # fetch comes after it. The local path therefore goes first and the URL
    # second, matching the other download scripts in this project.
    execute_bash('wget -O %s %s' % (UNZIPPED_LOCAL, TEXT_URL))
コード例 #21
0
ファイル: generate.py プロジェクト: bzcheeseman/Dali
import random
import sys

from os import listdir, makedirs, stat
from os.path import isdir, isfile, join, dirname, realpath, exists

# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the `utils` import below (presumably because
# `utils` lives there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import print_progress, execute_bash

# Directory containing this script.
THIS_DATA_DIR = dirname(realpath(__file__))
# Local path of the downloaded file and the URL it is fetched from.
DOWNLOADED_FILE = join(THIS_DATA_DIR, "wikianswers.paraphrases.tsv.gz")
FILE_URL = "https://www.dropbox.com/s/td3ionbuj80hrkb/wikianswers.paraphrases.tsv.gz?dl=0"

if __name__ == '__main__':
    # Download only when the file is not already present locally.
    if not exists(DOWNLOADED_FILE):
        download_command = "wget -O {path} {url}".format(path=DOWNLOADED_FILE,
                                                         url=FILE_URL)
        execute_bash(download_command)
        print("Downloaded file")
    else:
        print("Found file.")
コード例 #22
0
# DATA_DIR is the parent of the directory that contains this script. It is
# appended to sys.path before the project imports below (presumably because
# `utils` and `midi` live there -- confirm against the repository layout).
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)
from utils import execute_bash, collect_files_with_ext
from midi.utils import midiread

from scipy.io import wavfile
import numpy as np

# Directory containing this script; the .npy exports are written into it.
THIS_DATA_DIR = dirname(realpath(__file__))
# Local path of the downloaded archive and the directory it is unzipped into.
DOWNLOADED_ZIP = join(THIS_DATA_DIR, "dataset.zip")
DOWNLOADED_DIR = join(THIS_DATA_DIR, "dataset")
# URL the archive is downloaded from.
FILE_URL = "http://c4dm.eecs.qmul.ac.uk/rdr/bitstream/handle/123456789/13/Score-informed%20Piano%20Transcription%20Dataset.zip?sequence=1"

if __name__ == '__main__':
    # Fetch the archive only if an earlier run has not already done so.
    if not exists(DOWNLOADED_ZIP):
        execute_bash("wget -O {path} {url}".format(path=DOWNLOADED_ZIP, url=FILE_URL))

    # Start from a fresh extraction directory and without old .npy exports.
    if exists(DOWNLOADED_DIR) and isdir(DOWNLOADED_DIR):
        execute_bash("rm -rf %s" % (DOWNLOADED_DIR))
    stale_arrays = join(THIS_DATA_DIR, "*.npy")
    execute_bash("rm %s " % (stale_arrays))
    makedirs(DOWNLOADED_DIR)
    execute_bash("unzip %s -d %s" % (DOWNLOADED_ZIP, DOWNLOADED_DIR))

    files = collect_files_with_ext(DOWNLOADED_DIR, ".wav")

    for subpath, name in files:
        # Only .wav files whose name does not contain "Chromatic" are exported.
        if not name.endswith(".wav") or "Chromatic" in name:
            continue

        audio_target = join(THIS_DATA_DIR, name.replace(".wav", ".npy"))
        roll_target = join(THIS_DATA_DIR, name.replace(".wav", ".mid.npy"))
        # The matching MIDI file sits next to the recording as "<stem>_correct.mid".
        midi_path = str(subpath).replace(".wav", "_correct.mid")

        _rate, music = wavfile.read(subpath)
        np.save(audio_target, music)
        piece = midiread(midi_path)
        np.save(roll_target, piece.piano_roll)
コード例 #23
0
ファイル: generate.py プロジェクト: bhack/Dali
def delete_paths(paths):
    """Hand one ``rm -rf <path>`` command per entry of ``paths`` to ``execute_bash``.

    Each entry is interpolated into the command string unquoted.

    Args:
        paths: Iterable of paths to remove.
    """
    for target in paths:
        removal_command = 'rm -rf %s' % (target,)
        execute_bash(removal_command)
コード例 #24
0
ファイル: generate.py プロジェクト: bhack/Dali
    with open(fname, "w") as f:
        for line in all_file.split('\n'):
            line = line.replace('\t', ' ')
            if '?' in line:
                no_and_question, rest = line.split('?')
                rest = rest.strip()
                rest = rest.split(' ')
                answer = rest[0].replace(',', ' ')
                rest = ' '.join(rest[1:])
                line = '%s ?\t%s\t%s' % (no_and_question.strip(),
                                           answer.strip()+ ' .',
                                           rest.strip())
            else:
                line = ' .'.join(line.split('.'))
            f.write(line + '\n')
        f.flush()

if __name__ == '__main__':
    # Drop the tarball and output directory left by a previous run.
    delete_paths([TARBALL_LOCAL, OUTPUT_DIR])

    # Shell steps, executed strictly in this order.
    shell_steps = (
        'rm -rf %s' % (NORMALIZED_OUTPUT_DIR,),
        'wget -O %s %s' % (TARBALL_LOCAL, TARBALL_URL),
        'mkdir %s' % (OUTPUT_DIR,),
        'tar -xz -f %s -C %s' % (TARBALL_LOCAL, THIS_DATA_DIR),
        'mv %s %s' % (OUTPUT_DIR, NORMALIZED_OUTPUT_DIR),
    )
    for step in shell_steps:
        execute_bash(step)

    # The tarball is no longer needed once it has been extracted.
    delete_paths([TARBALL_LOCAL])

    for babi_file in findfiles(NORMALIZED_OUTPUT_DIR, "qa*.txt"):
        print("Converting %s to common QA format..." % (babi_file,))
        fix_babi_file(babi_file)
コード例 #25
0
ファイル: generate.py プロジェクト: bhack/Dali
def decrypt_archive():
    """Print a banner and pass ``sh <ENCRYPT_SCRIPT> decrypt`` to ``execute_bash``."""
    banner = boldify("Decrypting data")
    print(banner)
    decrypt_command = "sh %s decrypt" % (ENCRYPT_SCRIPT)
    execute_bash(decrypt_command)
コード例 #26
0
ファイル: generate.py プロジェクト: bzcheeseman/Dali
        for line in all_file.split('\n'):
            line = line.replace('\t', ' ')
            if '?' in line:
                no_and_question, rest = line.split('?')
                rest = rest.strip()
                rest = rest.split(' ')
                answer = rest[0].replace(',', ' ')
                rest = ' '.join(rest[1:])
                line = '%s ?\t%s\t%s' % (no_and_question.strip(),
                                         answer.strip() + ' .', rest.strip())
            else:
                line = ' .'.join(line.split('.'))
            f.write(line + '\n')
        f.flush()


if __name__ == '__main__':
    # Drop the tarball and output directory left by a previous run.
    delete_paths([TARBALL_LOCAL, OUTPUT_DIR])

    # Shell steps, executed strictly in this order.
    shell_steps = (
        'rm -rf %s' % (NORMALIZED_OUTPUT_DIR, ),
        'wget -O %s %s' % (TARBALL_LOCAL, TARBALL_URL),
        'mkdir %s' % (OUTPUT_DIR, ),
        'tar -xz -f %s -C %s' % (TARBALL_LOCAL, THIS_DATA_DIR),
        'mv %s %s' % (OUTPUT_DIR, NORMALIZED_OUTPUT_DIR),
    )
    for step in shell_steps:
        execute_bash(step)

    # The tarball is no longer needed once it has been extracted.
    delete_paths([TARBALL_LOCAL])

    for babi_file in findfiles(NORMALIZED_OUTPUT_DIR, "qa*.txt"):
        print("Converting %s to common QA format..." % (babi_file, ))
        fix_babi_file(babi_file)