import numpy as np
from math import ceil
from sys import stderr
from collections import OrderedDict

# dists2neighbours and parallel_map are helpers defined elsewhere in the
# original module; they are not reproduced in this excerpt.


def similar_top_opt3(vec, words, topn=200, nthreads=4, freq=None):
    # Normalise the vectors once so the dot products below are cosine similarities.
    vec.init_sims()
    indices = [vec.vocab[w].index for w in words if w in vec.vocab]
    vecs = vec.syn0norm[indices]
    dists = np.dot(vecs, vec.syn0norm.T)
    if freq is not None:
        # Optional log-frequency weighting of the similarity rows.
        dists = dists * np.log(freq)
    if nthreads == 1:
        res = dists2neighbours(vec, dists, indices, topn)
    else:
        # Split the query words into one batch of rows per thread.
        batchsize = int(ceil(1. * len(indices) / nthreads))
        print >> stderr, "dists2neighbours for %d words in %d threads, batchsize=%d" % (
            len(indices), nthreads, batchsize)

        def ppp(i):
            return dists2neighbours(vec, dists[i:i + batchsize],
                                    indices[i:i + batchsize], topn)

        lres = parallel_map(ppp, range(0, len(indices), batchsize), threads=nthreads)
        res = OrderedDict()
        for lr in lres:
            res.update(lr)
    return res
# Variant of the same function with debug printing of the distance-matrix shape.
def similar_top_opt3(vec, words, topn=200, nthreads=4, freq=None):
    vec.init_sims()
    indices = [vec.vocab[w].index for w in words if w in vec.vocab]
    vecs = vec.syn0norm[indices]
    dists = np.dot(vecs, vec.syn0norm.T)
    print "Shape before freq: ", dists.shape
    if freq is not None:
        print "Using freq weighting"
        dists = dists * np.log(freq)
    print "Shape after freq: ", dists.shape
    if nthreads == 1:
        res = dists2neighbours(vec, dists, indices, topn)
    else:
        batchsize = int(ceil(1. * len(indices) / nthreads))
        print >> stderr, "dists2neighbours for %d words in %d threads, batchsize=%d" % (
            len(indices), nthreads, batchsize)

        def ppp(i):
            return dists2neighbours(vec, dists[i:i + batchsize],
                                    indices[i:i + batchsize], topn)

        lres = parallel_map(ppp, range(0, len(indices), batchsize), threads=nthreads)
        res = OrderedDict()
        for lr in lres:
            res.update(lr)
    return res
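# Hypothetical usage sketch (not part of the original code): assumes a
# pre-1.0 gensim Word2Vec model that still exposes .vocab / .syn0norm, and a
# model file name invented for illustration.
from gensim.models import Word2Vec

model = Word2Vec.load("vectors.model")
res = similar_top_opt3(model, ["king", "queen"], topn=10, nthreads=2)
print len(res)  # presumably one entry per query word found in the vocabulary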
# Variant that ranks word vectors against a separate set of context vectors.
def similar_top_opt3(wvectors, cvectors, words, topn=200, nthreads=4):
    wvectors.init_sims()
    cvectors.init_sims()
    indices = [wvectors.vocab[w].index for w in words if w in wvectors.vocab]
    wvecs = wvectors.syn0norm[indices]
    dists = np.dot(wvecs, cvectors.syn0norm.T)
    if nthreads == 1:
        res = dists2neighbours(wvectors, cvectors, dists, indices, topn)
    else:
        batchsize = int(ceil(1. * len(indices) / nthreads))
        print >> stderr, "dists2neighbours for %d words in %d threads, batchsize=%d" % (
            len(indices), nthreads, batchsize)

        def ppp(i):
            return dists2neighbours(wvectors, cvectors, dists[i:i + batchsize],
                                    indices[i:i + batchsize], topn)

        lres = parallel_map(ppp, range(0, len(indices), batchsize), threads=nthreads)
        res = OrderedDict()
        for lr in lres:
            res.update(lr)
    return res
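# None of the snippets above include the parallel_map helper they rely on.
# A minimal thread-pool sketch matching the call signature they use,
# parallel_map(func, iterable, threads=N), could look like this (an
# assumption, not the original implementation):
from multiprocessing.pool import ThreadPool

def parallel_map(func, items, threads=4):
    # Run func over items in a fixed-size thread pool, preserving input order.
    pool = ThreadPool(threads)
    try:
        return pool.map(func, items)
    finally:
        pool.close()
        pool.join()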
import argparse
from termcolor import colored

import parallel  # project-local helper module providing parallel_map

# Tail of the per-article download routine (its earlier lines are not part of
# this excerpt), followed by the command-line entry point.
print(pmid, colored("already downloaded", "yellow"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("entrada", help="Input file", type=open)
    parser.add_argument("-p", "--parallel", help="Run in parallel", action="store_true")
    parser.add_argument("--super1", help="Use Super Scraper 1 (Sci-hub)", action="store_true")
    parser.add_argument("--super2", help="Use Super Scraper 2 (Libgen)", action="store_true")
    args = parser.parse_args()

    downloaded = load_downloaded()
    in_parallel = args.parallel
    use_super1 = args.super1
    use_super2 = args.super2
    f = args.entrada

    if in_parallel:
        parallel.parallel_map(process_line, f.readlines())
    else:
        for line in f:
            process_line(line)
import socket

# `timeout` is set earlier in the original script; parallel_map here is a
# callback-style helper whose assumed interface is sketched below.
socket.setdefaulttimeout(timeout)


def _do_crawl(url, args):
    # Fetch one URL, returning (success_flag, page_data).
    try:
        import urllib2
        data = urllib2.urlopen(url).read()
        return True, data
    except Exception:
        return False, None


def callback(url, ret, args, data):
    # Write each successful fetch as a "url<TAB>flattened page" line.
    if ret:
        fp = args['output']
        data = ' '.join(data.split('\n'))
        print >> fp, url + '\t' + data


if __name__ == '__main__':
    import sys
    if len(sys.argv) != 4:
        print 'usage: prog urllist outputfile threadnum'
        sys.exit(1)
    urllist = sys.argv[1]
    outputfile = sys.argv[2]
    threadnum = int(sys.argv[3])

    l = [x.strip() for x in open(urllist)]
    fp = open(outputfile, 'w')
    args = {}
    args['output'] = fp
    parallel_map(l, _do_crawl, callback, args, threadnum)
    fp.close()
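# The crawler above assumes a different parallel_map interface:
# parallel_map(items, worker, callback, args, threadnum), where
# worker(item, args) returns (ok, data) and callback(item, ok, args, data)
# handles the result. A minimal threaded sketch of that interface (an
# assumption, since the real helper is not shown) might be:
import threading
import Queue

def parallel_map(items, worker, callback, args, threadnum):
    q = Queue.Queue()
    for item in items:
        q.put(item)
    lock = threading.Lock()

    def run():
        while True:
            try:
                item = q.get_nowait()
            except Queue.Empty:
                return
            ok, data = worker(item, args)
            with lock:  # serialise callbacks so writes to the output file do not interleave
                callback(item, ok, args, data)

    threads = [threading.Thread(target=run) for _ in range(threadnum)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()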