def main(proc_num, queue, out_dir, download_dir, context_size):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()
        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)
        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index)
                         for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")
        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
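# Each input line is assumed to follow the Google Books Ngrams v2 record layout:
# ngram, year, match count, and volume count, separated by tabs. A minimal
# illustration of the field splitting and POS-tag stripping done above, on a
# made-up record (the id assignment itself is handled by indexing.word_to_id):
sample = "cat_NOUN sat on the mat\t1987\t42\t17"
ngram_field, year, match_count, volume_count = sample.split('\t')
tokens = [w.split("_")[0] for w in ngram_field.split()]  # ['cat', 'sat', 'on', 'the', 'mat']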
def split_main(proc_num, queue, download_dir):
    print(proc_num, "Start loop")
    while True:
        if queue.empty():
            break
        url = queue.get()
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        dirs = set(os.listdir(download_dir))
        if name in [file.split("-")[0] for file in dirs]:
            continue
        print(proc_num, "Name", name)
        loc_dir = download_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)
        print(proc_num, "Downloading", name)
        success = False
        while not success:
            # binary mode: urlopen().read() returns bytes in Python 3
            with open(loc_dir + name + '.gz', 'wb') as f:
                try:
                    f.write(urllib.request.urlopen(url, timeout=60).read())
                    success = True
                except:
                    print("Fail!!")
                    continue
        print(proc_num, "Unzipping", name)
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        print(proc_num, "Splitting", name)
        subprocess.call(["split", "-l", str(LINE_SPLIT), loc_dir + name,
                         download_dir + "/" + name + "-"])
        os.remove(loc_dir + name)
        os.rmdir(loc_dir)
def split_main(proc_num, queue, download_dir):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        url = queue.get()
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        dirs = set(os.listdir(download_dir))
        if name in [file.split("-")[0] for file in dirs]:
            continue
        print proc_num, "Name", name
        loc_dir = download_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)
        print proc_num, "Downloading", name
        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    print "Fail!!"
                    continue
        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        print proc_num, "Splitting", name
        subprocess.call(["split", "-l", str(LINE_SPLIT), loc_dir + name,
                         download_dir + "/" + name + "-"])
        os.remove(loc_dir + name)
        os.rmdir(loc_dir)
def worker(proc_num, queue, dir, count_dir, min_count, checkpoints):
    while True:
        if queue.empty():
            break
        year = queue.get()
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        for n in checkpoints:
            out_dir = dir + '{:03d}'.format(n) + "/"
            mkdir(out_dir)
            subprocess.call(['mv', dir + str(year) + '-w.' + '{:03d}'.format(n),
                             out_dir + str(year) + '-w'])
            print "Loading data..", year, "iterations", n
            text2numpy(out_dir, freqs, year)
def run_parallel(num_procs, out_dir, in_dir, years):
    ioutils.mkdir(out_dir)
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, out_dir + "/", in_dir + "/", years])
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
def run_parallel(num_processes, root_dir, source, context_size):
    queue = Queue()
    download_dir = root_dir + '/' + source + '/raw/'
    out_dir = root_dir + '/' + source + '/c' + str(context_size) + '/raw/'
    ioutils.mkdir(out_dir)
    for name in os.listdir(download_dir):
        queue.put(name)
    procs = [Process(target=main, args=[i, queue, out_dir, download_dir, context_size])
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
def run_parallel(num_processes, in_dir, out_dir):
    queue = Queue()
    ioutils.mkdir(out_dir)
    for zipped_file in os.listdir(in_dir):
        if not os.path.isfile(in_dir + "/" + zipped_file) or not zipped_file.endswith(".gz"):
            continue
        queue.put(zipped_file)
    procs = [Process(target=split_main, args=[i, queue, in_dir, out_dir])
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
def run_parallel(num_processes, root_dir, out_dir, context_size, is_zipped):
    queue = Queue()
    download_dir = root_dir + '/'
    out_dir = out_dir + '/c' + str(context_size) + '/raw/'
    ioutils.mkdir(out_dir)
    for name in os.listdir(download_dir):
        if name == ".DS_Store":
            continue
        queue.put(name)
    procs = [Process(target=main, args=[i, queue, out_dir, download_dir, context_size, is_zipped])
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
def split_main(proc_num, queue, download_dir, out_dir_g, context_size):
    print proc_num, "Start loop"
    while True:  # iterates through the urls of the datafiles
        if queue.empty():
            break
        url = queue.get()
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)  # gets the name of the individual file
        dirs = set(os.listdir(download_dir))
        if name in [file.split("-")[0] for file in dirs]:
            continue
        print proc_num, "Name", name
        # creates the directory for the file being downloaded
        loc_dir = download_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)
        print proc_num, "Downloading", name
        # downloads the actual compressed file
        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    print "Fail!!"
                    continue
        print proc_num, "Unzipping", name
        # unzips and splits the datafile into files of at most LINE_SPLIT lines
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        print proc_num, "Splitting", name
        subprocess.call(["split", "-l", str(LINE_SPLIT), loc_dir + name,
                         download_dir + "/" + name + "-"])
        os.remove(loc_dir + name)
        os.rmdir(loc_dir)
        # runs gram_grab on the split files
        print proc_num, "gram_grab", name
        queue_g = Queue()
        for item in os.listdir(download_dir + "/"):
            if name + "-" in item:
                queue_g.put(item)
                time.sleep(0.01)  # needed to avoid instability
        main(proc_num, queue_g, out_dir_g, download_dir, context_size)
        for item in os.listdir(download_dir + "/"):
            if name + "-" in item:
                os.remove(download_dir + "/" + item)
def run_parallel(num_processes, out_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    queue = Queue()
    for url in urls:
        queue.put(url)
    ioutils.mkdir(out_dir + '/' + source + '/raw')
    download_dir = out_dir + '/' + source + '/raw/'
    ioutils.mkdir(download_dir)
    procs = [Process(target=split_main, args=[i, queue, download_dir])
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
def main(proc_num, queue, out_dir, download_dir, context_size):
    print proc_num, "Start loop"
    while True:  # iterates through the downloaded ngram files
        if queue.empty():
            break
        name = queue.get()
        loc_dir = out_dir + "/" + name + "/"
        # creates a directory for each downloaded file where the yearly co-occurrence matrices will be put
        ioutils.mkdir(loc_dir)
        print proc_num, "Going through", name
        # index: bijection between words and integers (the integers serve as indices of the co-occurrence matrix)
        index = collections.OrderedDict()
        # year_counters: for every year, the co-occurrence matrix as a Counter mapping word pairs to occurrence counts
        year_counters = collections.defaultdict(collections.Counter)
        # time.sleep(120 * random.random())  # sometimes needed to avoid instability
        with open(download_dir + name) as f:
            for i, l in enumerate(f):  # iterates through the individual ngram file
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                # lists the indices of the words in the ngram
                ngram = [indexing.word_to_id(word.split("_")[0], index)
                         for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                # updates the co-occurrence matrix with the new information
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")
        print proc_num, "Writing", name
        # writes the yearly co-occurrence matrices into .bin files
        # time.sleep(120 * random.random())  # sometimes needed to avoid instability
        for year, counter in year_counters.iteritems():
            sparse_io.export_mat_from_dict(counter, loc_dir + year + ".bin")
        ioutils.write_pickle(index, loc_dir + "index.pkl")  # saves the index
def run(out_dir, in_dir, years, language):
    index = collections.OrderedDict()
    stop_set = set(stopwords.words(language))
    for year in years:  # iterates through the years
        print "Merging year", year
        year_list = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for word in year_list:  # iterates through the words in the individual year
            if word.isalpha() and word not in stop_set:
                # puts every word in the common index
                indexing.word_to_cached_id(word, index)
    ioutils.mkdir(out_dir)
    ioutils.write_pickle(index, out_dir + "merged_index.pkl")
    ioutils.write_pickle(list(index), out_dir + "merged_list.pkl")
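# A minimal sketch of what indexing.word_to_cached_id is assumed to do above:
# assign each unseen word the next free integer id in the OrderedDict and return
# the cached id on later calls. This is an illustrative reconstruction, not the
# repo's own implementation.
def word_to_cached_id_sketch(word, index):
    if word not in index:
        index[word] = len(index)  # next unused integer id
    return index[word]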
def run_parallel(num_processes, out_dir, source):
    ioutils.mkdir(out_dir)
    ioutils.mkdir(out_dir + '/' + source)
    ioutils.mkdir(out_dir + '/' + source + '/' + VERSION)
    download_dir = out_dir + '/' + source + '/' + VERSION + '/' + TYPE + '/'
    ioutils.mkdir(download_dir)
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, download_dir, source])
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
def main(proc_num, queue, out_dir, download_dir, context_size, is_zipped):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()
        if is_zipped:
            if not name.endswith(".gz"):
                continue
            print "Unzipping " + name + " ..."
            subprocess.call(['gunzip', '-f', download_dir + name, '-d'])
            name = name.split(".gz")[0]
        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)
        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index)
                         for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")
        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io_ref.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
        os.remove(download_dir + name)
def main(proc_num, queue, out_dir, in_dir, context_size):
    ioutils.mkdir(out_dir)
    print proc_num, "Start loop"
    while True:  # iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "- Loading mat for year", year
        year_mat = load_matrix(in_dir + str(year) + ".bin")
        index = ioutils.load_pickle(in_dir + str(year) + "-index.pkl")
        print proc_num, "- Processing data for year", year
        counts = year_mat.sum(1) / (2 * context_size)  # sums up the occurrences per word
        counts = {word: int(counts[index[word]]) for word in index
                  if index[word] < len(counts)}
        ioutils.write_pickle(counts, out_dir + "/" + str(year) + "-counts.pkl")  # writes the counts to a file
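# Illustrative check of the division above (this is an assumption about why the
# factor 2 * context_size is used): every occurrence of a word contributes one
# pair count per context position, i.e. 2 * context_size counts per occurrence,
# so dividing the row sum by that factor should recover the raw occurrence count.
context_size = 4
pair_counts_per_occurrence = 2 * context_size   # 8
row_sum = 8 * 42                                # a word seen 42 times in this year
print row_sum / pair_counts_per_occurrence      # -> 42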
def run_parallel(num_processes, out_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    ioutils.mkdir(out_dir)
    ioutils.mkdir(out_dir + '/' + source)
    ioutils.mkdir(out_dir + '/' + source + '/' + VERSION)
    download_dir = out_dir + '/' + source + '/' + VERSION + '/' + TYPE + '/'
    ioutils.mkdir(download_dir)
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, page, download_dir, source])
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
def run_parallel(num_processes, out_dir, source, context_size):
    # gets the URL addresses of the data files from the dataset webpage
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    queue = Queue()
    for url in urls:  # puts the urls into a queue
        queue.put(url)
    ioutils.mkdir(out_dir + '/' + source + '/raw')
    download_dir = out_dir + '/' + source + '/raw/'
    out_dir_g = out_dir + '/' + source + '/c' + str(context_size) + '/raw/'
    ioutils.mkdir(download_dir)
    procs = [Process(target=split_main, args=[i, queue, download_dir, out_dir_g, context_size])
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
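# A small illustration of the name extraction performed in split_main on the
# URLs collected above. The archive URL and VERSION value below are made up for
# the example; the real filenames depend on source, TYPE, and VERSION.
import re

VERSION = "20120701"  # illustrative value
url = "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-ab.gz"
name = re.search('%s-(.*).gz' % VERSION, url).group(1)  # -> "ab", the shard suffix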
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")

def run_parallel(num_procs, in_dir, years):
    queue = Queue()
    random.shuffle(years)
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=[i, queue, in_dir, years])
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Makes and stores second order matrices from first order PPMI data.")
    parser.add_argument("in_dir", help="path to first order data")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    ioutils.mkdir(args.in_dir + "/second")
    run_parallel(args.num_procs, args.in_dir + "/", years)
        if not work_left:
            print proc_num, "Finished"
            break
        print proc_num, "Making knn net for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d = make_knn_mat(old_mat, k)
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, tmp_pref)

def run_parallel(num_procs, in_dir, years, k):
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, in_dir, years, k])
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("in_dir", help="path to unmerged data")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    parser.add_argument("--k", type=int, help="k nn thresh", default=K)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    ioutils.mkdir(args.in_dir + "/dknn-" + str(args.k))
    run_parallel(args.num_procs, args.in_dir + "/", years, args.k)
def main(in_dir, out_dir, years):
    name_list = []
    print "Start loop"
    for zipped_file in os.listdir(in_dir):
        if not os.path.isfile(in_dir + "/" + zipped_file) or not zipped_file.endswith(".gz"):
            continue
        tmp_year_counts = {}
        tmp_year_doc_counts = {}
        tmp_year_pos = {}
        for year in years:
            tmp_year_counts[year] = {}
            tmp_year_doc_counts[year] = {}
            tmp_year_pos[year] = {}
        name = zipped_file.split(".gz")[0]
        name_list.append(name)
        print "Unzipping", name
        subprocess.call(['gunzip', '-f', in_dir + "/" + name + '.gz', '-d'])
        print "Going through", name
        with open(in_dir + "/" + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    if not POS.match(split[0]):
                        continue
                    count = int(split[2])
                    if count < 10:
                        continue
                    word_info = split[0].split("_")
                    pos = word_info[-1]
                    word = word_info[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    word = word.split("\'s")[0]
                    year = int(split[1])
                    doc_count = int(split[3])
                    if not year in years:
                        continue
                    if not word in tmp_year_counts[year]:
                        tmp_year_counts[year][word] = 0
                        tmp_year_doc_counts[year][word] = 0
                        tmp_year_pos[year][word] = collections.Counter()
                    tmp_year_counts[year][word] += count
                    tmp_year_doc_counts[year][word] += doc_count
                    tmp_year_pos[year][word][pos] += count
                except UnicodeDecodeError:
                    pass
        print "Writing tmp " + name
        ioutils.mkdir(out_dir + "/" + name)
        for year in years:
            ioutils.write_pickle(tmp_year_counts[year], out_dir + "/" + name + "/" + str(year) + "-counts.pkl")
            ioutils.write_pickle(tmp_year_doc_counts[year], out_dir + "/" + name + "/" + str(year) + "-doc_counts.pkl")
            ioutils.write_pickle(tmp_year_pos[year], out_dir + "/" + name + "/" + str(year) + "-pos.pkl")
        print "Deleting", name
        try:
            os.remove(in_dir + "/" + name)
            os.remove(in_dir + "/" + name + '.gz')
        except:
            pass
    print "Merging..."
    merge_year_counts(out_dir, name_list, years)
def main(proc_num, lock, download_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue
            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        print proc_num, "Downloading", name
        success = False
        while not success:
            with open(loc_dir + name + '.csv.zip', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue
        print proc_num, "Unzipping", name
        subprocess.call(['unzip', '-o', loc_dir + name + '.csv.zip', '-d', loc_dir])
        subprocess.call(['mv', loc_dir + 'googlebooks-' + source + '-' + TYPE + '-' + VERSION + '-' + name + '.csv',
                         loc_dir + name])
        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        n = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id, indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    pass
        print proc_num, "Writing", name, n
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.csv.zip')
        except:
            pass
    export_mat_from_dict(pair_counts, decade, OUT.format(type=type, window_size=window_size))

def _process_context(context, pair_counts, window_size):
    if len(context) < window_size + 1:
        return pair_counts
    target = context[window_size]
    indices = range(0, window_size)
    indices.extend(range(window_size + 1, 2 * window_size + 1))
    for i in indices:
        if i >= len(context):
            break
        pair_counts[(target, context[i])] += 1
    return pair_counts

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("type")
    parser.add_argument("window_size", type=int)
    parser.add_argument("--workers", type=int, default=25)
    args = parser.parse_args()
    mkdir(OUT.format(type=args.type, window_size=args.window_size))
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    id_map = load_pickle(DICT.format(type=args.type))
    procs = [Process(target=worker, args=[i, queue, args.window_size, args.type, id_map])
             for i in range(args.workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
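# A small usage illustration of _process_context defined above: with
# window_size 2, the token at position 2 of the sliding context is the target
# and is paired with the two tokens on either side. The token ids are made up
# for the example.
from collections import Counter

pair_counts = Counter()
_process_context([11, 12, 13, 14, 15], pair_counts, 2)
# pair_counts now holds {(13, 11): 1, (13, 12): 1, (13, 14): 1, (13, 15): 1}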
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes various frequency statistics.")
    parser.add_argument("out_dir")
    parser.add_argument("in_dir")
    parser.add_argument("count_dir")
    parser.add_argument("wordlist")
    parser.add_argument("--num_sam", type=int, default=10000000)
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000)
    parser.add_argument("--year-inc", type=int, help="year increment", default=1)
    parser.add_argument("--sample", type=float, default=1e-5)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    ioutils.mkdir(args.out_dir)
    wordlist = ioutils.load_pickle(args.wordlist)
    run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/", years,
                 wordlist, args.count_dir, args.num_sam, args.sample)
    years = year_index_infos.keys()
    random.shuffle(years)
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=[i, queue, out_pref, in_dir, year_index_infos, knn, thresh])
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    merge(out_pref, years, get_full_word_list(year_index_infos))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes network statistics for second order data.")
    parser.add_argument("dir", help="path to directory with co-occurrence data and index")
    parser.add_argument("word_file", help="path to sorted word file(s).", default=None)
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--num-words", type=int,
                        help="Number of words (of decreasing average frequency) to include. Must also specify word file and index.",
                        default=-1)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1900)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000)
    parser.add_argument("--thresh", type=float, help="optional threshold", default=0)
    parser.add_argument("--knn", type=int, help="optional number of nearest neighbours", default=None)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    year_index_infos = ioutils.load_year_index_infos(args.dir, years, args.word_file, num_words=args.num_words)
    outpref = args.dir + "/secondnetstats-" + str(args.thresh) + "-" + str(args.knn) + "/"
    ioutils.mkdir(outpref)
    outpref += args.word_file.split("/")[-1].split(".")[0]
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    run_parallel(args.num_procs, outpref, args.dir + "/", year_index_infos, knn=args.knn, thresh=args.thresh)
def main(proc_num, lock, page, download_dir, source):
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            if EXCLUDE_PATTERN.match(url):
                continue
            name = re.search('%s-(.*).gz' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue
            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        print proc_num, "Downloading", name
        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue
        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        # subprocess.call(['mv', loc_dir + 'googlebooks-' + source + '-' + TYPE + '-' + VERSION + '-' + name + '.csv', loc_dir + name])
        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        skipped = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id, indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    skipped += 1
        print proc_num, "Writing", name, "Skipped", skipped
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.gz')
        except:
            pass
            new_row = get_index(merged_index, year_list, mat.row[i])
            new_col = get_index(merged_index, year_list, mat.col[i])
            counts[(new_row, new_col)] += mat.data[i]
        print "Done year ", decade + year
    export_mat_from_dict(counts, decade, out_dir)
    write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
    write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")

def run_parallel(num_procs, out_dir, in_dir, decades):
    queue = Queue()
    for decade in decades:
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue, out_dir, in_dir])
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("out_dir", help="path to network data (also where output goes)")
    parser.add_argument("in_dir", help="path to network data (also where output goes)")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)")
    parser.add_argument("--end-year", type=int, help="end year (inclusive)")
    args = parser.parse_args()
    decades = range(args.start_year, args.end_year + 1, 10)
    decades.reverse()
    mkdir(args.out_dir)
    run_parallel(args.num_procs, args.out_dir + "/", args.in_dir + "/", decades)
if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("dir") parser.add_argument("rep_type") parser.add_argument("num_emb", help="number of different embedding-series", type=int) parser.add_argument("wordlist") parser.add_argument("--start-year", type=int, default=1800) parser.add_argument("--end-year", type=int, default=2000) parser.add_argument("--year-inc", type=int, default=1) parser.add_argument("--dim", type=int, default=300) args = parser.parse_args() kwargs = dict() if (args.rep_type.lower() == "sgns"): kwargs["normalize"] = False wordlist = load_pickle(args.wordlist) years = range(args.start_year, args.end_year + 1, args.year_inc) mkdir(args.dir + "/embedding_avg/") mkdir(args.dir + "/embedding_avg/aligned") for i in range(1, args.num_emb + 1): mkdir(args.dir + "/embedding_" + str(i) + "/noinit/" + str(args.dim) + "/aligned/") for year in years: align_cloud(year, args.rep_type, args.dir, args.num_emb, args.dim, wordlist, **kwargs) align_years(years, args.rep_type, args.dir, args.num_emb, args.dim, **kwargs)
parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1900) parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=2000) parser.add_argument("--year-inc", type=int, help="year increment", default=1) parser.add_argument("--thresh", type=float, help="optional threshold", default=None) args = parser.parse_args() years = list(range(args.start_year, args.end_year + 1, args.year_inc)) year_index_infos = ioutils.load_year_index_infos(args.dir, years, args.word_file, num_words=args.num_words) outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0] if args.num_words != -1: outpref += "-top" + str(args.num_words) if args.thresh != None: outpref += "-" + str(args.thresh) ioutils.mkdir(args.dir + "/netstats") run_parallel(args.num_procs, args.dir + outpref, args.dir + "/", year_index_infos, args.thresh)
    for p in procs:
        p.join()
    print "Merging"
    full_word_set = set([])
    for year_words in word_list.itervalues():
        full_word_set = full_word_set.union(set(year_words))
    merge(out_pref, years, list(full_word_set))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes semantic change statistics for words.")
    parser.add_argument("dir", help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include", default=-1)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    parser.add_argument("--disp-year", type=int, help="year to measure displacement from", default=END_YEAR)
    parser.add_argument("--thresh", type=float, help="relevance threshold", default=THRESH)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    word_lists = ioutils.load_year_words(args.word_file, years)
    if args.num_words != -1:
        for year in years:
            word_lists[year] = word_lists[year][:args.num_words]
    ioutils.mkdir(args.dir + "/volstats")
    outpref = "/volstats/" + args.word_file.split("/")[-1].split(".")[0] + "-" + str(args.thresh)
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    displacement_base = simple_create_representation(REP_TYPE, args.dir + "/" + str(args.disp_year) + ".bin",
                                                     restricted_context=word_lists[args.end_year], thresh=args.thresh)
    run_parallel(args.num_procs, args.dir + outpref, args.dir + "/", years[1:],
                 word_lists, displacement_base, args.thresh)
if __name__ == '__main__': parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.") parser.add_argument("dir", help="path to directory with count data and index") parser.add_argument("word_file", help="path to sorted word file(s).", default=None) parser.add_argument("sample_file", help="path to file with sample sizes.", default=None) parser.add_argument("num_procs", type=int, help="number of processes to spawn") parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include. Must also specifiy word file and index.", default=-1) parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1900) parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=2000) parser.add_argument("--num-boots", type=int, help="Number of bootstrap samples", default=10) parser.add_argument("--smooth", type=int, help="laplace smoothing factor", default=10) parser.add_argument("--alpha", type=float, help="confidence threshold for edges", default=0.05) parser.add_argument("--fwer-control", action='store_true', help="use Bonferroni") parser.add_argument("--id", type=int, help="run id", default=0) args = parser.parse_args() sample_sizes = ioutils.load_pickle(args.sample_file) eff_sample_size = np.percentile(np.array(sample_sizes.values()), 10) if args.smooth == 0: smooth = 0 else: smooth = 10.0**(-1*float(args.smooth)) years = range(args.start_year, args.end_year + 1) index = ioutils.load_pickle(args.dir + "/index.pkl") year_index_infos = ioutils.load_year_index_infos_common(index, years, args.word_file, num_words=args.num_words) outpref = "/bootstats-" + str(args.alpha) + "-" + str(args.fwer_control) + "/" + args.word_file.split("/")[-1].split(".")[0] if args.num_words != -1: outpref += "-top" + str(args.num_words) ioutils.mkdir(args.dir + "/" + outpref.split("/")[1]) run_parallel(args.num_procs, args.dir + outpref, args.dir + "/", year_index_infos, args.num_boots, smooth, eff_sample_size, args.alpha, args.fwer_control, args.id)
def main(proc_num, lock, download_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).gz' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue
            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        print proc_num, "Downloading", name
        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue
        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        print proc_num, "Going through", name
        year_grams = collections.defaultdict(dict)
        n = 0
        with open(loc_dir + name) as f:
            for l in f:
                l = l.decode('utf-8').lower()
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    if (not item.isalpha()) or item in STOPWORDS:
                        continue
                    year = split[1]
                    count = int(split[2])
                    if item not in year_grams[year]:
                        year_grams[year][item] = [(l, count)]
                    else:
                        year_grams[year][item].append((l, count))
                except:
                    # print "!", l.strip().split()
                    pass
        print proc_num, "Writing", name, n
        for year in year_grams:
            ioutils.write_pickle(year_grams[year], loc_dir + str(year) + ".pkl")
        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name + '.gz')
        except:
            pass
parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=2000) parser.add_argument("--thresh", type=float, help="optional threshold", default=0) parser.add_argument("--knn", type=int, help="optional number of nearest neighbours", default=None) args = parser.parse_args() years = range(args.start_year, args.end_year + 1) year_index_infos = ioutils.load_year_index_infos(args.dir, years, args.word_file, num_words=args.num_words) outpref = args.dir + "/secondnetstats-" + str(args.thresh) + "-" + str( args.knn) + "/" ioutils.mkdir(outpref) outpref += args.word_file.split("/")[-1].split(".")[0] if args.num_words != -1: outpref += "-top" + str(args.num_words) run_parallel(args.num_procs, outpref, args.dir + "/", year_index_infos, knn=args.knn, thresh=args.thresh)
import argparse

import ioutils
from cooccurrence.laplaceppmigen import run_parallel

SMOOTH = 10
START_YEAR = 1900
END_YEAR = 2000

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes laplace smoothed normalized PPMI values.")
    parser.add_argument("out_dir", help="directory where data will be stored")
    parser.add_argument("in_dir", help="path to unmerged data")
    parser.add_argument("word_file", help="file of restricted word set", default=None)
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--conf-dir", help="optional file of restricted word set", default=None)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    parser.add_argument("--num-words", type=int, help="size of vocabulary", default=20000)
    parser.add_argument("--smooth", type=int, help="smoothing factor", default=SMOOTH)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    if args.smooth == 0:
        smooth = 0
    else:
        smooth = 10.0**(-1 * float(args.smooth))
    index = ioutils.load_pickle(args.in_dir + "/index.pkl")
    year_index_infos = ioutils.load_year_index_infos_common(index, years, args.word_file, num_words=args.num_words)
    out_dir = args.out_dir + "/lsmooth" + str(args.smooth)
    ioutils.mkdir(out_dir)
    run_parallel(args.num_procs, out_dir + "/", args.in_dir + "/", smooth, year_index_infos, args.conf_dir)
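# Note that the --smooth flag is an exponent, not the smoothing constant itself.
# A quick illustration of the arithmetic in the script above; what the constant
# is applied to downstream is an assumption here:
smooth_flag = 10
smooth = 10.0 ** (-1 * float(smooth_flag))  # 1e-10, presumably added to each co-occurrence cell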
        if year == 0:
            merged_year_counts = year_counts
        for word, count in year_counts.iteritems():
            if not word in merged_year_counts:
                merged_year_counts[word] = 0
            merged_year_counts[word] += year_counts[word]
    write_pickle(merged_year_counts, out_dir + str(decade) + "-counts.pkl")

def run_parallel(num_procs, out_dir, in_dir, decades):
    queue = Queue()
    for decade in decades:
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue, out_dir, in_dir])
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merge counts for 1gram data.")
    parser.add_argument("base_dir", help="base directory. /counts should be a subdir")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)")
    parser.add_argument("--end-year", type=int, help="end year (inclusive)")
    args = parser.parse_args()
    decades = range(args.start_year, args.end_year + 1, 10)
    decades.reverse()
    out_dir = args.base_dir + "/decades/counts/"
    mkdir(out_dir)
    run_parallel(args.num_procs, out_dir, args.base_dir + "/counts/", decades)
os.remove(out_dir + str(year) + ".tmp.txt") def run_parallel(num_procs, out_dir, in_dir, count_dir, years, words, num_words, min_count, sample): queue = Queue() for year in years: queue.put(year) procs = [Process(target=worker, args=[i, queue, out_dir, in_dir, count_dir, words, num_words, min_count, sample]) for i in range(num_procs)] for p in procs: p.start() for p in procs: p.join() if __name__ == '__main__': parser = argparse.ArgumentParser(description="Computes various frequency statistics.") parser.add_argument("out_dir") parser.add_argument("in_dir") parser.add_argument("count_dir") parser.add_argument("word_file") parser.add_argument("--workers", type=int, default=10) parser.add_argument("--num-words", type=int, default=None) parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000) parser.add_argument("--year-inc", type=int, help="end year (inclusive)", default=1) parser.add_argument("--min-count", type=int, default=100) parser.add_argument("--sample", type=float, default=1e-5) args = parser.parse_args() years = range(args.start_year, args.end_year + 1, args.year_inc) words = ioutils.load_year_words(args.word_file, years) ioutils.mkdir(args.out_dir) run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/", args.count_dir + "/", years, words, args.num_words, args.min_count, args.sample)
    if args.word_file != None:
        if args.index_dir == None:
            print >> sys.stderr, "Must specify index dir with word file!"
            sys.exit()
        word_pickle = ioutils.load_pickle(args.word_file)
        if not args.start_year in word_pickle:
            word_lists = {}
            for year in years:
                word_lists[year] = word_pickle
        else:
            word_lists = word_pickle
        word_infos = {}
        for year, word_list in word_lists.iteritems():
            year_index = ioutils.load_pickle(args.index_dir + "/" + str(year) + "-index.pkl")
            if args.num_words != -1:
                word_list = word_list[:args.num_words]
            word_list, word_indices = get_word_indices(word_list, year_index)
            word_infos[year] = (word_list, word_indices)
        outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0]
        if args.num_words != -1:
            outpref += "-top" + str(args.num_words)
    else:
        word_infos = None
        outpref = "/netstats/net"
    if args.thresh != None:
        outpref += "-" + str(args.thresh)
    ioutils.mkdir(args.dir + "/netstats")
    run_parallel(args.num_procs, args.dir + outpref, args.dir + "/netstats/", args.dir + "/",
                 years, word_infos, args.thresh)
help="start year (inclusive)", default=START_YEAR) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR) parser.add_argument("--thresh", type=float, help="relevance threshold", default=THRESH) args = parser.parse_args() years = range(args.start_year + 1, args.end_year + 1) word_list = ioutils.load_pickle(args.word_file) index = ioutils.load_pickle(args.index_file) if args.num_words != -1: word_list = word_list[:args.num_words] ioutils.mkdir(args.dir + "/volstats") word_list, word_indices = get_word_indices(word_list, index) outpref = "/volstats/" + args.word_file.split("/")[-1].split( ".")[0] + "-" + str(args.thresh) if args.num_words != -1: outpref += "-top" + str(args.num_words) displacement_base = matstore.retrieve_mat_as_binary_coo_thresh( args.dir + "/" + str(args.end_year) + ".bin", args.thresh, min_size=MIN_SIZE) displacement_base = displacement_base.tocsr() run_parallel(args.num_procs, args.dir + outpref, args.dir + outpref + "-tmp", args.dir + "/", years, word_list, word_indices, displacement_base, args.thresh)
                         '-c-init-file', out_dir + SAVE_FILE.format(year=years[i-1]) + "-c.bin",
                         '-threads', str(workers),
                         '-train', in_dir + INPUT_FILE.format(year=year),
                         '-size', str(dim),
                         '-sample', '0',
                         '-negative', '5',
                         '-wvocab', in_dir + VOCAB_FILE.format(year=year),
                         '-cvocab', in_dir + VOCAB_FILE.format(year=year),
                         '-verbose', '2'])

if __name__ == "__main__":
    parser = ArgumentParser("Runs sequential Glove embeddings for years")
    parser.add_argument("in_dir", help="Directory with cooccurrence information and vocab.")
    parser.add_argument("out_dir")
    parser.add_argument("--dim", type=int, default=300)
    parser.add_argument("--workers", type=int, default=50)
    parser.add_argument("--start-year", type=int, default=1800)
    parser.add_argument("--end-year", type=int, default=2000)
    parser.add_argument("--year-inc", type=int, default=1)
    parser.add_argument("--sequential", action="store_true")
    args = parser.parse_args()
    if not args.sequential:
        out_dir = args.out_dir + "/noinit/"
    else:
        out_dir = args.out_dir
    out_dir = out_dir + "/" + str(args.dim) + "/"
    mkdir(out_dir)
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    train_years(years, args.in_dir + "/", out_dir, args.dim, args.workers, args.sequential)