def build(opt):
    """Download and preprocess the Twitter corpus unless it is already built.

    The data directory is ``opt['datapath']/Twitter``. When that directory is
    not marked as built for the current version, two archive part files are
    downloaded, joined into one gzip file with ``build_data.cat``, read and
    decoded, and the resulting lines are handed to ``create_fb_format``.
    The joined gzip file is removed afterwards and the directory is marked
    as built.

    Args:
        opt: mapping that provides the ``'datapath'`` entry used as the
            parent directory of the dataset.

    Returns:
        None.
    """
    version = 'v1.1'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    # Nothing to do when this exact version has already been marked as built.
    if build_data.built(dpath, version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the data: the archive is fetched as two part files, in order.
    url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
    part_names = (
        "twitter_en_big.txt.gz.partaa",
        "twitter_en_big.txt.gz.partab",
    )
    for part_name in part_names:
        build_data.download(url + part_name, dpath, part_name)

    # Join both parts into a single gzip file inside the data directory.
    first_part, second_part = [
        os.path.join(dpath, part_name) for part_name in part_names
    ]
    outzipfile = os.path.join(dpath, "twitter_en_big.txt.gz")
    build_data.cat(first_part, second_part, outzipfile)

    import gzip

    with gzip.open(outzipfile, 'r') as archive:
        raw_bytes = archive.read()
    # Decoding uses the default codec of ``bytes.decode`` (UTF-8).
    text = raw_bytes.decode()

    # The first two lines are skipped before conversion
    # (NOTE(review): reason is not visible in this block -- confirm).
    data = text.split('\n')[2:]
    create_fb_format(data, dpath)
    os.remove(outzipfile)

    # Mark the data as built.
    build_data.mark_done(dpath, version)
def build(opt):
    """Fetch the Twitter chat corpus and convert it, if not done already.

    Works inside ``opt['datapath']/Twitter``. If ``build_data.built`` does
    not report that directory as built for this version, any previously
    built copy is removed, the two archive part files are downloaded and
    joined via ``build_data.cat``, the gzip content is read and decoded,
    and all lines after the first two are passed to ``create_fb_format``.
    Finally the joined gzip file is deleted and the directory is marked
    as done.

    Args:
        opt: mapping whose ``'datapath'`` entry is the parent directory
            for the dataset.

    Returns:
        None.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
        fname1 = "twitter_en_big.txt.gz.partaa"
        fname2 = "twitter_en_big.txt.gz.partab"
        sources = [fname1, fname2]
        targets = [os.path.join(dpath, fname) for fname in sources]
        for fname in sources:
            build_data.download(url + fname, dpath, fname)

        # Combine the two downloaded parts into one gzip file.
        outzipfile = os.path.join(dpath, "twitter_en_big.txt.gz")
        build_data.cat(targets[0], targets[1], outzipfile)

        import gzip

        with gzip.open(outzipfile, 'r') as gz_handle:
            # ``read`` yields bytes; decode with the default codec (UTF-8).
            file_content = gz_handle.read().decode()

        # Everything after the first two lines is converted
        # (NOTE(review): why two lines are dropped is not shown here).
        lines = file_content.split('\n')
        create_fb_format(lines[2:], dpath)
        os.remove(outzipfile)

        # Mark the data as built.
        build_data.mark_done(dpath, version)