Example #1
import os
import json
from glob import glob

from util import wget  # project-local helper, as imported in Example #17


def get_analogy_data():
    """ Get SAT-type dataset: a list of (answer: int, prompts: list, stem: list, choice: list)"""
    cache_dir = './cache'
    os.makedirs(cache_dir, exist_ok=True)
    root_url_analogy = 'https://github.com/asahi417/AnalogyTools/releases/download/0.0.0/analogy_test_dataset.tar.gz'
    if not os.path.exists('{}/analogy_test_dataset'.format(cache_dir)):
        wget(root_url_analogy, cache_dir)
    data = {}
    for d in ['bats', 'sat', 'u2', 'u4', 'google']:
        with open('{}/analogy_test_dataset/{}/test.jsonl'.format(cache_dir, d),
                  'r') as f:
            test_set = list(
                filter(
                    None,
                    map(lambda x: json.loads(x) if len(x) > 0 else None,
                        f.read().split('\n'))))
        with open(
                '{}/analogy_test_dataset/{}/valid.jsonl'.format(cache_dir, d),
                'r') as f:
            val_set = list(
                filter(
                    None,
                    map(lambda x: json.loads(x) if len(x) > 0 else None,
                        f.read().split('\n'))))
        data[d] = (val_set, test_set)
    return data
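A minimal usage sketch (not part of the original snippet), assuming the function above has run against its ./cache directory; each JSON line carries the fields named in the docstring:

data = get_analogy_data()
for name, (val_set, test_set) in data.items():
    # every item is a dict with 'stem', 'choice' and 'answer' keys per the docstring
    print(name, len(val_set), len(test_set))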
def get_lexical_relation_data():
    """ get dataset """
    cache_dir = 'cache'
    os.makedirs(cache_dir, exist_ok=True)
    root_url_analogy = 'https://github.com/asahi417/AnalogyTools/releases/download/0.0.0/lexical_relation_dataset.tar.gz'
    if not os.path.exists('{}/lexical_relation_dataset'.format(cache_dir)):
        wget(root_url_analogy, cache_dir)
    full_data = {}
    for i in glob('{}/lexical_relation_dataset/*'.format(cache_dir)):
        if not os.path.isdir(i):
            continue
        full_data[os.path.basename(i)] = {}
        label = {}
        for t in glob('{}/*tsv'.format(i)):
            with open(t) as f:
                data = [
                    line.split('\t') for line in f.read().split('\n')
                    if len(line) > 0
                ]
            x = [d[:2] for d in data]
            y = [d[-1] for d in data]
            for _y in y:
                if _y not in label:
                    label[_y] = len(label)
            y = [label[_y] for _y in y]
            full_data[os.path.basename(i)][os.path.basename(t).replace(
                '.tsv', '')] = {
                    'x': x,
                    'y': y
                }
        full_data[os.path.basename(i)]['label'] = label
    return full_data
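A similar hypothetical usage sketch for the lexical-relation loader, showing how the per-dataset label map relates to the encoded 'y' values:

full_data = get_lexical_relation_data()
for dataset_name, splits in full_data.items():
    label_map = splits['label']  # relation name -> integer id, as built above
    for split_name, split in splits.items():
        if split_name == 'label':
            continue
        # 'x' holds the word pairs, 'y' the integer-encoded relation labels
        print(dataset_name, split_name, len(split['x']), len(set(split['y'])))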
Example #3
def update_bug_data(update_anyway, cache_dir, bug_types,
        bugs_update_period_in_days, verbose=False):
    """Download bug if what's available is too old.

    ``update_anyway'' will download data regardless of how old they are

    ``cache_dir'' where to store the downloaded files

    ``bug_types'' what bug types to download data for (full names)

    ``bugs_update_period_in_days'' update bug data if the previously
        downloaded data is older than this period

    ``verbose'' whether to print diagnostic messages

    """
    assert os.path.isdir(cache_dir)
    # see which bug files are missing or have to be updated, if any
    all_files = ["%s/%s.html" % (cache_dir, bt) for bt in bug_types]
    if update_anyway:
        files_to_update = all_files
    else:
        bugs_mtime_threshold = bugs_update_period_in_days * 86400
        files_to_update = [f for f in all_files \
                if not younger_than(f, bugs_mtime_threshold)]

        if not files_to_update:
            if verbose:
                print "using previously downloaded bug data"
            return

    url_paths = ["/devel/wnpp/%s" % os.path.basename(f) for f in files_to_update]
    wget("www.debian.org", url_paths, cache_dir, verbose)
def get_pair_relative(cache_dir: str = './cache'):
    """ Get the list of word pairs in RELATIVE pretrained model """
    path = '{}/relative-init_wikipedia_en_300d.bin'.format(cache_dir)
    if not os.path.exists(path):
        url = 'https://drive.google.com/u/0/uc?id=1HVJnTjcaQ3aCLdwTZwiGLpMDyEylx-zS&export=download'
        wget(url, cache_dir, gdrive_filename='relative-init_wikipedia_en_300d.bin')
    model = KeyedVectors.load_word2vec_format(path, binary=True)
    return list(map(lambda x: [i.replace('_', ' ') for i in x.split('__')], model.vocab.keys()))
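A short, hypothetical check of the returned pair list; note that model.vocab is the gensim < 4.0 attribute (gensim 4.x exposes key_to_index instead):

pairs = get_pair_relative()
print(len(pairs))   # number of word pairs in the RELATIVE vocabulary
print(pairs[:3])    # each entry is a pair such as ['new york', 'big apple'] (illustrative)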
Example #5
def getInstallScripts(installDir,version="CURRENT"):
    cwd=os.getcwd()
    os.chdir(installDir)

    url=BASE_URL+version+"/install-scripts.tar.gz"
    info("Fetching install scripts")
    wget(url,"install-scripts.tar.gz",CONSOLE)
    os.chdir(cwd)

    return os.path.lexists( os.path.join(installDir, "install-scripts.tar.gz") )
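A hypothetical follow-up, assuming the fetched archive is unpacked with the standard-library tarfile module; the install path is illustrative:

import os
import tarfile

install_dir = '/opt/app'  # hypothetical install location
if getInstallScripts(install_dir):
    with tarfile.open(os.path.join(install_dir, 'install-scripts.tar.gz')) as tar:
        tar.extractall(install_dir)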
Example #6
def update_popcon_data(update_anyway, cache_dir, popcon_update_period_in_days,
        verbose=False):

    filename = "%s/%s" % (cache_dir, POPCON_FNAME)
    max_age = popcon_update_period_in_days * 86400
    if update_anyway or not younger_than(filename, max_age):
        url_paths = ["/%s.gz" % POPCON_FNAME]
        wget("popcon.debian.org", url_paths, cache_dir, verbose)
        decompress_gzip("%s.gz" % filename)
    elif verbose:
        print "using previously downloaded popcon data"
Example #7
def __call__(self, source, path):
    try:
        url = source['url']
        print("start to download model from http url(%s) to (%s)" %
              (url, path))
        util.wget(path, url)
        print("end to download model from http url(%s) to (%s)" %
              (url, path))
    except Exception as e:
        print(e)
        raise RuntimeError('Invalid url(wget from url fail reason(%s))' %
                           str(e))
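A hypothetical invocation, assuming the method belongs to a downloader class (the class itself is not shown) and that util.wget takes the destination path first and the URL second, as in the call above:

downloader = HttpDownloader()  # hypothetical class name
downloader({'url': 'http://example.com/model.tar.gz'}, '/tmp/model.tar.gz')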
Example #8
def get_binary(platform):
    url = ARGS.BIN_URL
    url = url.replace("$PLATFORM", platform)

    if ARGS.is_development:
        url = url.replace("$TYPE", "D")
        url = url.replace("CURRENT", "Development")
    else:
        url = url.replace("$TYPE", ARGS.NEWEST_VERSION)

    print_console("Downloading binaries for platform \"" + platform + "\" to " + ARGS.install_dir + " from " + url)
    wget(url, platform + ".tar.gz", CONSOLE)
Example #9
def get_binary(platform):
    url = BIN_URL
    url = url.replace("$PLATFORM", platform)

    if is_development:
        url = url.replace("$TYPE", "D")
        url = url.replace("CURRENT", "Development")
    else:
        from detect import detect_latest_version
        url = url.replace("$TYPE", detect_latest_version())

    info("Downloading binaries for platform \"" + platform + "\" to " + installDir + " from " + url)
    wget(url, platform + ".tar.gz", CONSOLE)
Example #10
def download_stock(args):
    stock, output_dir = args
    from_date = datetime.date(1900, 1, 1)
    to_date = datetime.date(2099, 1, 1)
    url = DATA_BASE_URL.format(stock,
                               to_date.month - 1,
                               to_date.day,
                               to_date.year,
                               from_date.month - 1,
                               from_date.day,
                               from_date.year)
    print(url)
    output_path = os.path.join(output_dir, stock + '.csv')
    wget(url, output_path)
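The (stock, output_dir) tuple argument suggests the function is meant to be mapped over a worker pool; a hypothetical driver using multiprocessing:

from multiprocessing import Pool

stocks = ['AAPL', 'GOOG', 'MSFT']  # hypothetical ticker list
with Pool(4) as pool:
    pool.map(download_stock, [(s, './data') for s in stocks])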
Example #11
def get_albums(ting_uid):
    whole = []
    for start in itertools.count(step=10):
        url = URL % (start, ting_uid)
        page = wget(url)
        albums = parse(page)
        if not albums:
            break
        whole.extend(albums)
    return whole
Example #12
def get_framework():

    url = None

    os.chdir(ARGS.install_dir)
    if ARGS.isGitDev:
        ARGS.GITDEV_HASHMAP["h"] = ARGS.selectedTag[1]
        params = urllib.urlencode(ARGS.GITDEV_HASHMAP)
        ARGS.FRAMEWORK_URL = ARGS.GITDEV_URL + "%s" % params
        url = ARGS.FRAMEWORK_URL
        print_console("Using Git: Downloading development framework to " + ARGS.install_dir + " from " + url)
        print_console("Please wait... a fresh archive is being cooked up...")
    else:
        if not ARGS.is_mini and not ARGS.is_development:
            url = ARGS.FRAMEWORK_URL
            url = url.replace("$TYPE", ARGS.NEWEST_VERSION)
            print_console("Downloading birch framework to " + ARGS.install_dir + " from " + url)

        elif not ARGS.is_development and ARGS.is_mini:
            url = ARGS.MINI_URL
            url = url.replace("$TYPE", ARGS.NEWEST_VERSION)
            print_console("Downloading miniBirch framework to " + ARGS.install_dir + " from " + url)

        elif ARGS.is_development and not ARGS.is_mini:
            url = ARGS.FRAMEWORK_URL
            url = url.replace("$TYPE", "D")
            url = url.replace("CURRENT", "Development")
            print_console("Downloading development framework to " + ARGS.install_dir + " from " + url)

        elif ARGS.is_development and ARGS.is_mini:
            url = ARGS.MINI_URL
            url = url.replace("$TYPE", "D")
            url = url.replace("CURRENT", "Development")
            print_console("Downloading miniBirch development framework to " + ARGS.install_dir + " from " + url)

    wget(url, "framework.tar.gz", CONSOLE)
Example #13
def fetch_album(url):
    page = wget(url)
    alb = parse_album(page)

    path = os.path.join(esc(alb['singer']), esc(alb['name']))
    alb['path'] = path

    for link in alb['links']:
        fname = '%s_%s.mp3' % (link['idx'], esc(link['title']))
        link['fname'] = fname

    alb['links'].append({'url': alb['cover']})
    return alb
Example #14
def getFramework(installDir, is_development):

    url = None

    os.chdir(installDir)

    if not is_development:
        from detect import detect_latest_version

        url = FRAMEWORK_URL
        url = url.replace("$TYPE", detect_latest_version())
        info("Downloading birch framework to " + installDir + " from " + url)

    # elif (not is_development and is_mini):
    #     url = ARGS.MINI_URL
    #     url = url.replace("$TYPE", ARGS.NEWEST_VERSION)
    #     info("Downloading miniBirch framework to " + installDir + " from " + url)

    elif is_development:

        url = FRAMEWORK_URL
        url = url.replace("$TYPE", "D")
        url = url.replace("CURRENT", "Development")
        info("Downloading development framework to " + installDir + " from " + url)

    # elif (is_development and is_mini):
    #     url = ARGS.MINI_URL
    #     url = url.replace("$TYPE", "D")
    #     url = url.replace("CURRENT", "Development")
    #     info("Downloading miniBirch development framework to " + installDir + " from " + url)

    wget(url, "framework.tar.gz", CONSOLE)

    return os.path.lexists(os.path.join(installDir, "framework.tar.gz"))
Example #15
def setup_taxonomy_data():
    taxonomy_directory = os.path.join(env.data_files, "taxonomy")
    env.safe_sudo("mkdir -p '%s'" % taxonomy_directory, user=env.user)
    with cd(taxonomy_directory):
        taxonomy_url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
        gi_taxid_nucl = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz"
        gi_taxid_prot = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz"
        wget(taxonomy_url)
        wget(gi_taxid_nucl)
        wget(gi_taxid_prot)
        run("gunzip -c taxdump.tar.gz | tar xvf -")
        run("gunzip gi_taxid_nucl.dmp.gz")
        run("gunzip gi_taxid_prot.dmp.gz")
        run("cat gi_taxid_nucl.dmp gi_taxid_prot.dmp > gi_taxid_all.dmp")
        run("sort -n -k 1 gi_taxid_all.dmp > gi_taxid_sorted.txt")
        run("rm gi_taxid_nucl.dmp gi_taxid_prot.dmp gi_taxid_all.dmp")
        run("cat names.dmp | sed s/[\\(\\)\\'\\\"]/_/g > names.temporary")
        run("mv names.dmp names.dmp.orig")
        run("mv names.temporary names.dmp")
Example #17
import os
import logging
from itertools import groupby
from typing import Dict
from tqdm import tqdm

from gensim.models import KeyedVectors
from util import wget, get_word_embedding_model

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')

# Corpus
URL_CORPUS = 'https://drive.google.com/u/0/uc?id=17EBy4GD4tXl9G4NTjuIuG5ET7wfG4-xa&export=download'
PATH_CORPUS = './cache/wikipedia_en_preprocessed.txt'
CORPUS_LINE_LEN = 104000000  # 53709029
if not os.path.exists(PATH_CORPUS):
    logging.info('downloading wikidump')
    wget(url=URL_CORPUS, cache_dir='./cache', gdrive_filename='wikipedia_en_preprocessed.zip')
OVERWRITE_CACHE = False

# Stopwords
with open('./stopwords_en.txt', 'r') as f:
    STOPWORD_LIST = list(set(list(filter(len, f.read().split('\n')))))


def get_wiki_vocab(minimum_frequency: int, word_vocabulary_size: int = None):
    """ Get word distribution over Wikidump (lowercased and tokenized) """
    dict_freq = {}
    bar = tqdm(total=CORPUS_LINE_LEN)
    with open(PATH_CORPUS, 'r', encoding='utf-8') as corpus_file:
        for _line in corpus_file:
            bar.update()
            tokens = _line.strip().split(" ")
Example #18
def fetch_download_url(songid, force=False):
    dlink = '%s%s/download' % (BASE, songid)
    page = wget(dlink, force)
    return BASE + parse_download(page)
Example #19
def set4chan():
	hdr = { 'User-Agent' : "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0" }

	boardslist = json.loads(urllib.request.urlopen(urllib.request.Request("http://api.4chan.org/boards.json", headers=hdr)).read().decode('utf8'))['boards']
	numpages = 0
	boards = []
	for board in boardslist:
		if hasattr(Settings, "BOARD"):
			if board['board'] == Settings.BOARD or board['title'] == Settings.BOARD:
				boards.append(board)
				continue
		if hasattr(Settings, "BOARDS"):
			if board['board'] in Settings.BOARDS or board['title'] in Settings.BOARDS:
				boards.append(board)
				continue


	if len(boards) == 0:
		boards.append({'board': "w", 'pages': 11})

	set = False

	random.shuffle(boards)

	for board in boards:
		pages = list(range(0, board['pages']))
		random.shuffle(pages)

		for page in pages:
			req = urllib.request.Request("http://api.4chan.org/" + board['board'] + "/" + str(page) + ".json", headers=hdr)
			page = json.loads(urllib.request.urlopen(req).read().decode('utf8'))

			threads = copy.copy(page['threads'])
			random.shuffle(threads)

			for thread in threads:
				op = thread['posts'][0]
				if 'sticky' in op:
					break
				req = urllib.request.Request("http://api.4chan.org/" + board['board'] + "/res/" + str(op['no']) + ".json", headers=hdr)
				posts = json.loads(urllib.request.urlopen(req).read().decode('utf8'))['posts']
				random.shuffle(posts)

				for post in posts:
					if not ('w' in post and 'h' in post and 'ext' in post and 'tim' in post):
						break
					width = int(post['w'])
					height = int(post['h'])
					ext = post['ext']
					if ext.lower() in exts and width >= Settings.WIDTH and height >= Settings.HEIGHT:
						fname = str(post['tim']) + ext
						path = util.wget("http://images.4chan.org/" + board['board'] + "/src/" + fname, Settings.DIRECTORY)
						set = fromstr(Settings.WALLMANAGER)(path, width, height)

						if set:
							print("Set: " + fname + " from /" + board['board'] + "/" + str(op['no'])  + ": " + str(width) + "x" + str(height))
							break

				if set:
					break

			if set:
				break

		if set:
			break

	return True
Example #20
    rc = ReplicaCatalog()
    for entry in os.listdir(str(Path(BASE_DIR / 'inputs'))):
        infile = File(entry)
        inputs.append(infile)
        chksum = sha256(str(BASE_DIR / 'inputs/{}'.format(entry)))
        pfn = 'http://{}/~{}/inputs/{}'.format(staging_site, username, entry)
        urls.append(pfn)
        rc.add_replica('origin', infile, pfn, checksum={"sha256": chksum})

    if args.populate:
        # pre populate the caches
        for site in ["syr", "unl", "ucsd", "uc"]:
            proxy = "http://{}-cache:8000".format(site)
            log.info("populating cache at {}".format(site))
            for url in urls:
                util.wget(url, http_proxy=proxy)

    for i in range(args.num_jobs):
        j = Job(script).add_args(i).add_inputs(*inputs)
        wf.add_jobs(j)

    wf.add_transformation_catalog(tc)
    wf.add_replica_catalog(rc)

    if args.timestamps_file:
        with open(args.timestamps_file, "w") as f:
            f.write(datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + " START")

    # start workflow
    try:
        log.info("planning and submitting workflow with {} jobs".format(