def get_analogy_data():
    """ Get SAT-type dataset: a list of (answer: int, prompts: list, stem: list, choice: list) """
    cache_dir = './cache'
    os.makedirs(cache_dir, exist_ok=True)
    root_url_analogy = 'https://github.com/asahi417/AnalogyTools/releases/download/0.0.0/analogy_test_dataset.tar.gz'
    if not os.path.exists('{}/analogy_test_dataset'.format(cache_dir)):
        wget(root_url_analogy, cache_dir)
    data = {}
    for d in ['bats', 'sat', 'u2', 'u4', 'google']:
        with open('{}/analogy_test_dataset/{}/test.jsonl'.format(cache_dir, d), 'r') as f:
            test_set = list(filter(None, map(
                lambda x: json.loads(x) if len(x) > 0 else None,
                f.read().split('\n'))))
        with open('{}/analogy_test_dataset/{}/valid.jsonl'.format(cache_dir, d), 'r') as f:
            val_set = list(filter(None, map(
                lambda x: json.loads(x) if len(x) > 0 else None,
                f.read().split('\n'))))
        data[d] = (val_set, test_set)
    return data

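
# Usage sketch for get_analogy_data (assumes `os`, `json`, and the project's
# `wget` helper are in scope, as in the snippet above): each dataset name maps
# to a (validation, test) pair of question lists.
data = get_analogy_data()
for name, (val_set, test_set) in data.items():
    print(name, len(val_set), 'valid /', len(test_set), 'test questions')
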
def get_lexical_relation_data():
    """ Get lexical relation classification datasets. """
    cache_dir = 'cache'
    os.makedirs(cache_dir, exist_ok=True)
    root_url = 'https://github.com/asahi417/AnalogyTools/releases/download/0.0.0/lexical_relation_dataset.tar.gz'
    if not os.path.exists('{}/lexical_relation_dataset'.format(cache_dir)):
        wget(root_url, cache_dir)
    full_data = {}
    for i in glob('{}/lexical_relation_dataset/*'.format(cache_dir)):
        if not os.path.isdir(i):
            continue
        full_data[os.path.basename(i)] = {}
        label = {}
        for t in glob('{}/*tsv'.format(i)):
            with open(t) as f:
                data = [line.split('\t') for line in f.read().split('\n') if len(line) > 0]
            x = [d[:2] for d in data]
            y = [d[-1] for d in data]
            # map each relation label string to a stable integer id
            for _y in y:
                if _y not in label:
                    label[_y] = len(label)
            y = [label[_y] for _y in y]
            full_data[os.path.basename(i)][os.path.basename(t).replace('.tsv', '')] = {'x': x, 'y': y}
        full_data[os.path.basename(i)]['label'] = label
    return full_data

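
# Usage sketch for get_lexical_relation_data: each dataset maps split names
# (derived from the .tsv filenames) to {'x': word pairs, 'y': integer labels},
# plus a 'label' entry holding the label-to-id mapping.
full_data = get_lexical_relation_data()
for dataset_name, splits in full_data.items():
    print(dataset_name, 'has', len(splits['label']), 'relation types')
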
def update_bug_data(update_anyway, cache_dir, bug_types,
                    bugs_update_period_in_days, verbose=False):
    """Download bug data if what's available is too old.

    ``update_anyway`` download data regardless of how old it is
    ``cache_dir`` where to store the downloaded files
    ``bug_types`` which bug types to download data for (full names)
    ``bugs_update_period_in_days`` update bug data if the previously
        downloaded data is older than this period
    ``verbose`` whether to print diagnostic messages
    """
    assert os.path.isdir(cache_dir)
    # see which bug files are missing or have to be updated, if any
    all_files = ["%s/%s.html" % (cache_dir, bt) for bt in bug_types]
    if update_anyway:
        files_to_update = all_files
    else:
        bugs_mtime_threshold = bugs_update_period_in_days * 86400  # days -> seconds
        files_to_update = [f for f in all_files
                           if not younger_than(f, bugs_mtime_threshold)]
    if not files_to_update:
        if verbose:
            print("using previously downloaded bug data")
        return
    url_paths = ["/devel/wnpp/%s" % os.path.basename(f) for f in files_to_update]
    wget("www.debian.org", url_paths, cache_dir, verbose)

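
# `younger_than` is referenced here and in update_popcon_data below but is not
# shown in these snippets. A plausible minimal sketch, assuming it checks a
# file's modification time against a maximum age in seconds (hypothetical
# reconstruction, not the original implementation):
import os
import time

def younger_than(path, max_age_seconds):
    """Return True if `path` exists and was modified within the last
    `max_age_seconds` seconds."""
    return (os.path.exists(path)
            and time.time() - os.path.getmtime(path) < max_age_seconds)
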
def get_pair_relative(cache_dir: str = './cache'):
    """ Get the list of word pairs in RELATIVE pretrained model """
    path = '{}/relative-init_wikipedia_en_300d.bin'.format(cache_dir)
    if not os.path.exists(path):
        url = 'https://drive.google.com/u/0/uc?id=1HVJnTjcaQ3aCLdwTZwiGLpMDyEylx-zS&export=download'
        wget(url, cache_dir, gdrive_filename='relative-init_wikipedia_en_300d.bin')
    model = KeyedVectors.load_word2vec_format(path, binary=True)
    # vocabulary entries are pair tokens joined by '__' with '_' for spaces;
    # split each entry back into a two-word pair
    return list(map(lambda x: [i.replace('_', ' ') for i in x.split('__')],
                    model.vocab.keys()))

def getInstallScripts(installDir, version="CURRENT"):
    cwd = os.getcwd()
    os.chdir(installDir)
    url = BASE_URL + version + "/install-scripts.tar.gz"
    info("Fetching install scripts")
    wget(url, "install-scripts.tar.gz", CONSOLE)
    os.chdir(cwd)
    return os.path.lexists(os.path.join(installDir, "install-scripts.tar.gz"))

def update_popcon_data(update_anyway, cache_dir, popcon_update_period_in_days,
                       verbose=False):
    filename = "%s/%s" % (cache_dir, POPCON_FNAME)
    max_age = popcon_update_period_in_days * 86400  # days -> seconds
    if update_anyway or not younger_than(filename, max_age):
        url_paths = ["/%s.gz" % POPCON_FNAME]
        wget("popcon.debian.org", url_paths, cache_dir, verbose)
        decompress_gzip("%s.gz" % filename)
    elif verbose:
        print("using previously downloaded popcon data")

def __call__(self, source, path):
    try:
        url = source['url']
        print("start downloading model from http url (%s) to (%s)" % (url, path))
        util.wget(path, url)
        print("finished downloading model from http url (%s) to (%s)" % (url, path))
    except Exception as e:
        print(e)
        raise RuntimeError('Invalid url (wget from url failed, reason: %s)' % e)

def get_binary(platform):
    url = ARGS.BIN_URL
    url = url.replace("$PLATFORM", platform)
    if ARGS.is_development:
        url = url.replace("$TYPE", "D")
        url = url.replace("CURRENT", "Development")
    else:
        url = url.replace("$TYPE", ARGS.NEWEST_VERSION)
    print_console("Downloading binaries for platform \"" + platform + "\" to "
                  + ARGS.install_dir + " from " + url)
    wget(url, platform + ".tar.gz", CONSOLE)

def get_binary(platform):
    url = BIN_URL
    url = url.replace("$PLATFORM", platform)
    if is_development:
        url = url.replace("$TYPE", "D")
        url = url.replace("CURRENT", "Development")
    else:
        from detect import detect_latest_version
        url = url.replace("$TYPE", detect_latest_version())
    info("Downloading binaries for platform \"" + platform + "\" to "
         + installDir + " from " + url)
    wget(url, platform + ".tar.gz", CONSOLE)

def download_stock(args):
    stock, output_dir = args
    from_date = datetime.date(1900, 1, 1)
    to_date = datetime.date(2099, 1, 1)
    # months are passed zero-based in this URL scheme (hence `month - 1`)
    url = DATA_BASE_URL.format(stock, to_date.month - 1, to_date.day, to_date.year,
                               from_date.month - 1, from_date.day, from_date.year)
    print(url)
    output_path = os.path.join(output_dir, stock + '.csv')
    wget(url, output_path)

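
# The single-tuple argument suggests download_stock is meant for
# multiprocessing.Pool.map, which passes exactly one argument per call.
# A usage sketch under that assumption (the tickers and output directory
# below are illustrative, not from the source):
from multiprocessing import Pool

if __name__ == '__main__':
    tickers = ['AAPL', 'MSFT', 'GOOG']
    with Pool(4) as pool:
        pool.map(download_stock, [(t, './data') for t in tickers])
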
def get_albums(ting_uid):
    whole = []
    # page through results 10 at a time until an empty page is returned
    for start in itertools.count(step=10):
        url = URL % (start, ting_uid)
        page = wget(url)
        albums = parse(page)
        if not albums:
            break
        whole.extend(albums)
    return whole

def get_framework():
    url = None
    os.chdir(ARGS.install_dir)
    if ARGS.isGitDev:
        ARGS.GITDEV_HASHMAP["h"] = ARGS.selectedTag[1]
        params = urllib.parse.urlencode(ARGS.GITDEV_HASHMAP)
        ARGS.FRAMEWORK_URL = ARGS.GITDEV_URL + params
        url = ARGS.FRAMEWORK_URL
        print_console("Using Git: Downloading development framework to "
                      + ARGS.install_dir + " from " + url)
        print_console("Please wait... a fresh archive is being cooked up...")
    else:
        if not ARGS.is_mini and not ARGS.is_development:
            url = ARGS.FRAMEWORK_URL.replace("$TYPE", ARGS.NEWEST_VERSION)
            print_console("Downloading birch framework to "
                          + ARGS.install_dir + " from " + url)
        elif not ARGS.is_development and ARGS.is_mini:
            url = ARGS.MINI_URL.replace("$TYPE", ARGS.NEWEST_VERSION)
            print_console("Downloading miniBirch framework to "
                          + ARGS.install_dir + " from " + url)
        elif ARGS.is_development and not ARGS.is_mini:
            url = ARGS.FRAMEWORK_URL.replace("$TYPE", "D")
            url = url.replace("CURRENT", "Development")
            print_console("Downloading development framework to "
                          + ARGS.install_dir + " from " + url)
        elif ARGS.is_development and ARGS.is_mini:
            url = ARGS.MINI_URL.replace("$TYPE", "D")
            url = url.replace("CURRENT", "Development")
            print_console("Downloading miniBirch development framework to "
                          + ARGS.install_dir + " from " + url)
    wget(url, "framework.tar.gz", CONSOLE)

def fetch_album(url):
    page = wget(url)
    alb = parse_album(page)
    path = os.path.join(esc(alb['singer']), esc(alb['name']))
    alb['path'] = path
    for link in alb['links']:
        fname = '%s_%s.mp3' % (link['idx'], esc(link['title']))
        link['fname'] = fname
    # queue the cover image for download alongside the tracks
    alb['links'].append({'url': alb['cover']})
    return alb

def getFramework(installDir, is_development):
    url = None
    os.chdir(installDir)
    if not is_development:
        from detect import detect_latest_version
        url = FRAMEWORK_URL.replace("$TYPE", detect_latest_version())
        info("Downloading birch framework to " + installDir + " from " + url)
    else:
        url = FRAMEWORK_URL.replace("$TYPE", "D")
        url = url.replace("CURRENT", "Development")
        info("Downloading development framework to " + installDir + " from " + url)
    wget(url, "framework.tar.gz", CONSOLE)
    return os.path.lexists(os.path.join(installDir, "framework.tar.gz"))

def setup_taxonomy_data():
    taxonomy_directory = os.path.join(env.data_files, "taxonomy")
    env.safe_sudo("mkdir -p '%s'" % taxonomy_directory, user=env.user)
    with cd(taxonomy_directory):
        taxonomy_url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
        gi_taxid_nucl = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz"
        gi_taxid_prot = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz"
        wget(taxonomy_url)
        wget(gi_taxid_nucl)
        wget(gi_taxid_prot)
        run("gunzip -c taxdump.tar.gz | tar xvf -")
        run("gunzip gi_taxid_nucl.dmp.gz")
        run("gunzip gi_taxid_prot.dmp.gz")
        # merge the nucleotide and protein GI-to-taxid mappings and sort by GI
        run("cat gi_taxid_nucl.dmp gi_taxid_prot.dmp > gi_taxid_all.dmp")
        run("sort -n -k 1 gi_taxid_all.dmp > gi_taxid_sorted.txt")
        run("rm gi_taxid_nucl.dmp gi_taxid_prot.dmp gi_taxid_all.dmp")
        # replace characters in names.dmp that break downstream parsers
        run("cat names.dmp | sed s/[\\(\\)\\'\\\"]/_/g > names.temporary")
        run("mv names.dmp names.dmp.orig")
        run("mv names.temporary names.dmp")

import os
import logging
from itertools import groupby
from typing import Dict

from tqdm import tqdm
from gensim.models import KeyedVectors

from util import wget, get_word_embedding_model

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')

# Corpus
URL_CORPUS = 'https://drive.google.com/u/0/uc?id=17EBy4GD4tXl9G4NTjuIuG5ET7wfG4-xa&export=download'
PATH_CORPUS = './cache/wikipedia_en_preprocessed.txt'
CORPUS_LINE_LEN = 104000000  # 53709029
if not os.path.exists(PATH_CORPUS):
    logging.info('downloading wikidump')
    wget(url=URL_CORPUS, cache_dir='./cache',
         gdrive_filename='wikipedia_en_preprocessed.zip')
OVERWRITE_CACHE = False

# Stopwords
with open('./stopwords_en.txt', 'r') as f:
    STOPWORD_LIST = list(set(filter(len, f.read().split('\n'))))


def get_wiki_vocab(minimum_frequency: int, word_vocabulary_size: int = None):
    """ Get word distribution over Wikidump (lowercased and tokenized) """
    dict_freq = {}
    bar = tqdm(total=CORPUS_LINE_LEN)
    with open(PATH_CORPUS, 'r', encoding='utf-8') as corpus_file:
        for _line in corpus_file:
            bar.update()
            tokens = _line.strip().split(" ")

def fetch_download_url(songid, force=False):
    dlink = '%s%s/download' % (BASE, songid)
    page = wget(dlink, force)
    return BASE + parse_download(page)

def set4chan():
    hdr = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"}
    boardslist = json.loads(urllib.request.urlopen(urllib.request.Request(
        "http://api.4chan.org/boards.json", headers=hdr)).read().decode('utf8'))['boards']
    boards = []
    for board in boardslist:
        if hasattr(Settings, "BOARD"):
            if board['board'] == Settings.BOARD or board['title'] == Settings.BOARD:
                boards.append(board)
                continue
        if hasattr(Settings, "BOARDS"):
            if board['board'] in Settings.BOARDS or board['title'] in Settings.BOARDS:
                boards.append(board)
                continue
    if len(boards) == 0:
        boards.append({'board': "w", 'pages': 11})
    set = False
    random.shuffle(boards)
    for board in boards:
        pages = list(range(0, board['pages']))
        random.shuffle(pages)
        for page in pages:
            req = urllib.request.Request(
                "http://api.4chan.org/" + board['board'] + "/" + str(page) + ".json",
                headers=hdr)
            page = json.loads(urllib.request.urlopen(req).read().decode('utf8'))
            threads = copy.copy(page['threads'])
            random.shuffle(threads)
            for thread in threads:
                op = thread['posts'][0]
                if 'sticky' in op:
                    break
                req = urllib.request.Request(
                    "http://api.4chan.org/" + board['board'] + "/res/" + str(op['no']) + ".json",
                    headers=hdr)
                posts = json.loads(urllib.request.urlopen(req).read().decode('utf8'))['posts']
                random.shuffle(posts)
                for post in posts:
                    if not ('w' in post and 'h' in post and 'ext' in post and 'tim' in post):
                        break
                    width = int(post['w'])
                    height = int(post['h'])
                    ext = post['ext']
                    if (ext.lower() in exts and width >= Settings.WIDTH
                            and height >= Settings.HEIGHT):
                        fname = str(post['tim']) + ext
                        path = util.wget("http://images.4chan.org/" + board['board']
                                         + "/src/" + fname, Settings.DIRECTORY)
                        set = fromstr(Settings.WALLMANAGER)(path, width, height)
                        if set:
                            print("Set: " + fname + " from /" + board['board'] + "/"
                                  + str(op['no']) + ": " + str(width) + "x" + str(height))
                            break
                if set:
                    break
            if set:
                break
        if set:
            break
    return True

rc = ReplicaCatalog()
for entry in os.listdir(str(Path(BASE_DIR / 'inputs'))):
    infile = File(entry)
    inputs.append(infile)
    chksum = sha256(str(BASE_DIR / 'inputs/{}'.format(entry)))
    pfn = 'http://{}/~{}/inputs/{}'.format(staging_site, username, entry)
    urls.append(pfn)
    rc.add_replica('origin', infile, pfn, checksum={"sha256": chksum})

if args.populate:
    # pre-populate the caches
    for site in ["syr", "unl", "ucsd", "uc"]:
        proxy = "http://{}-cache:8000".format(site)
        log.info("populating cache at {}".format(site))
        for url in urls:
            util.wget(url, http_proxy=proxy)

for i in range(args.num_jobs):
    j = Job(script).add_args(i).add_inputs(*inputs)
    wf.add_jobs(j)

wf.add_transformation_catalog(tc)
wf.add_replica_catalog(rc)

if args.timestamps_file:
    with open(args.timestamps_file, "w") as f:
        f.write(datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + " START")

# start workflow
try:
    log.info("planning and submitting workflow with {} jobs".format(