def worker(proc_num, queue):
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print proc_num, "On year", year
        # Top 5100 words by frequency, minus the 100 most frequent as stopwords
        words = vocab.top_words(year, 5100)
        stop_words = vocab.top_words(year, 100)
        words = words.difference(stop_words)
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        # Bootstrapped random-walk polarity induction over the sub-embedding
        polarities = polarity_induction_methods.bootstrap(
            embed.get_subembed(words.union(positive_seeds).union(negative_seeds)),
            positive_seeds, negative_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50, n_procs=20, return_all=True,
            beta=0.9, nn=25)
        util.write_pickle(polarities, constants.POLARITIES + year + '-coha-freq-boot.pkl')
def write_action_spaces(dataset_name, action_space_path, model_path, ltr=False):
    output_file = action_space_path + dataset_name + "_action_space.pkl"
    print "Writing candidate actions to " + output_file
    scores = util.load_pickle(model_path + dataset_name + "_scores.pkl")
    write_probable_pairs(dataset_name, action_space_path, scores)
    probable_pairs = util.load_pickle(
        action_space_path + dataset_name + '_probable_pairs.pkl')

    possible_pairs_total = 0
    action_spaces = []
    for did in scores:
        if did in probable_pairs:
            actions = defaultdict(list)
            for (m1, m2) in probable_pairs[did]:
                actions[m2].append(m1)
            if ltr:
                # Left-to-right: anaphors sorted into document order
                # ((ana1, ana2) is in scores only if ana1 precedes ana2)
                actions = sorted(actions.items(),
                                 cmp=lambda (ana1, ants1), (ana2, ants2):
                                 -1 if (ana1, ana2) in scores[did] else 1)
                for i in range(len(actions) - 1):
                    assert (actions[i][0], actions[i + 1][0]) in scores[did]
            else:
                # Otherwise order anaphors by their best antecedent's margin
                # over the "no antecedent" (-1) score
                actions = sorted(actions.items(),
                                 key=lambda (ana, ants): max(
                                     scores[did][(ant, ana)] - scores[did][(-1, ana)]
                                     for ant in ants))
            possible_pairs = get_possible_pairs(probable_pairs[did])
            possible_pairs_total += len(possible_pairs)
            action_spaces.append(ActionSpace(did, actions, possible_pairs))
    util.write_pickle(action_spaces, output_file)
def worker(proc_num, queue):
    while True:
        # time.sleep(random.random()*10)
        try:
            name = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        if name + ".pkl" in os.listdir(POLARITIES):
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
        # Drop words in more than 10% of documents or fewer than 100 documents,
        # then keep the 5000 remaining words with the highest document frequency
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs, key=lambda w: word_dict.dfs[w], reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation(
            "SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(
            set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(
            sub_vecs, pos_seeds, neg_seeds,
            return_all=True, nn=25, beta=0.9,
            num_boots=50, n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")
def write_docs(dataset_name):
    gold, mention_to_gold = load_gold(dataset_name)
    mentions = load_mentions(dataset_name)
    docs = []
    for did in gold:
        docs.append(Document(did, mentions[did], gold[did], mention_to_gold[did]))
    util.write_pickle(docs, directories.DOCUMENTS + dataset_name + '_docs.pkl')
def write_feature_names():
    util.write_pickle(
        {f: i for i, f in enumerate(
            next(util.load_json_lines(directories.RAW + 'train'))["pair_feature_names"])},
        directories.MISC + 'pair_feature_names.pkl')
def write_genres():
    sources = set()
    for dataset_name in ["train"]:
        print "Adding sources from", dataset_name
        for d in docs(dataset_name):
            sources.add(d["document_features"]["source"])
    print sources
    util.write_pickle({source: i for i, source in enumerate(sorted(sources))},
                      directories.MISC + 'genres.pkl')
def __call__(self, finished, annealable, vBest, v2p, rCount, aCount):
    if (v2p.i % self.moduloPrint == 0) or finished:
        now = t.time()
        speed = self.moduloPrint / (now - self.t)
        if self.verbosity > 1:
            print('i=%d, ratio : %.3f, bestVal :%.3g, T=%.3g, dt=%.3fs'
                  % (v2p.i, v2p.r, vBest, v2p.T, now - self.t))
        self.t = now
        matchL = self.config.getScheduleFromMat(annealable.bestSch)
        optState = OptState(finished, vBest, v2p, self.startTime, speed)
        write_pickle((self.config, matchL, optState), self.pklPath, lock_block=True)
def write_words():
    words = Counter()
    for dataset_name in ["train", "dev", "test"]:
        # Sentence words only count for train; dev/test words are added with
        # count 0 so they still appear in the vocabulary. Dependency relations
        # are always counted.
        inc = 1 if dataset_name == "train" else 0
        print "Adding words from", dataset_name
        for d in docs(dataset_name):
            for mention in d["mentions"].values():
                for w in mention["sentence"]:
                    words[word_vectors.normalize(w)] += inc
                words[word_vectors.normalize(mention["dep_relation"])] += 1
    util.write_pickle(words, directories.MISC + 'word_counts.pkl')
def run_evaluation(self):
    # Note: the "train" metrics below are actually computed on the dev set
    train_scores, train_loss, dev_pairs = evaluate(
        self, self.dev_docs, self.dev_data, "Evaluating on train")
    test_scores, test_loss, test_pairs = evaluate(
        self, self.test_docs, self.test_data, "Evaluating on test")
    epoch_stats = {
        "epoch": self.epoch,
        "n": self.n,
        "train_loss": train_loss,
        "test_loss": test_loss
    }
    epoch_stats.update({"train " + k: v for k, v in train_scores.iteritems()})
    epoch_stats.update({"test " + k: v for k, v in test_scores.iteritems()})
    self.history.append(epoch_stats)
    util.write_pickle(self.history, directories.CLUSTERER + 'history.pkl')
    timer.print_totals()

    test_conll = epoch_stats["test conll"]
    if self.epoch % self.write_every == 0:
        self.best_conll_window = 0
    if test_conll > self.best_conll:
        self.best_conll = test_conll
        print "New best CoNLL, saving model"
        self.save_progress(dev_pairs, test_pairs, "best")
    if test_conll > self.best_conll_window:
        self.best_conll_window = test_conll
        print "New best CoNLL in window, saving model"
        self.save_progress(dev_pairs, test_pairs,
                           str(self.write_every * int(self.epoch / self.write_every)))
    self.model.save_weights(directories.CLUSTERER + "weights.hdf5", overwrite=True)
def write_probable_pairs(dataset_name, action_space_path, scores):
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in util.logged_loop(scores):
        doc_scores = scores[did]
        # Rank candidate pairs by their score relative to the anaphor's
        # "no antecedent" (-1) baseline
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] - (-1 - 0.3 * doc_scores[(-1, pr[1])]),
                       reverse=True)
        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair],
                                          -1 - 0.3 * doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]], doc_scores[pair])
        # Margin filter: drop pairs scoring too far below the anaphor's best candidate
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [p for p in probable_pairs[did]
                               if doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print "num docs:", len(scores)
    print "avg size without filter: {:.1f}".format(total_pairs / float(len(scores)))
    print "avg size: {:.1f}".format(total_size / float(len(scores)))
    print "margin removals size: {:.1f}".format(margin_removals / float(len(scores)))
    util.write_pickle(probable_pairs,
                      action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
def write_document_vectors():
    vectors = word_vectors.WordVectors(load=True)
    for dataset_name in ["train", "dev", "test"]:
        print "Building document vectors for", dataset_name
        doc_vectors = {}
        for d in docs(dataset_name):
            sentences = {}
            did = None
            for mention_num in sorted(d["mentions"].keys(), key=int):
                m = d["mentions"][mention_num]
                did = m["doc_id"]
                if m['sent_num'] not in sentences:
                    sentences[m['sent_num']] = m['sentence']
            # Document vector = average word vector over every sentence
            # containing a mention
            v = np.zeros(vectors.vectors[0].size)
            n = 0
            for s in sentences.values():
                for w in s:
                    v += vectors.vectors[vectors[w]]
                    n += 1
            doc_vectors[did] = v / n
        util.write_pickle(doc_vectors,
                          directories.MISC + dataset_name + "_document_vectors.pkl")
def write(self, path=directories.RELEVANT_VECTORS):
    np.save(path + 'word_vectors', np.vstack(self.vectors))
    util.write_pickle(self.vocabulary, path + 'vocabulary.pkl')
def write(self, path):
    util.write_pickle(self.__dict__, path)
class FastqIndex(object):

    file_suffix = '.fqidx.p'

    @staticmethod
    def get_index_path(fq_path):
        return fq_path + FastqIndex.file_suffix

    @property
    def bcodes(self):
        if self._bcodes is None:
            self._bcodes = set(self._bcode_off_map.keys())
        return self._bcodes

    @property
    def num_bcodes(self):
        return len(self.bcodes)

    @property
    def num_se(self):
        return self._num_se

    @property
    def num_se_bcoded(self):
        return self._num_se_bcoded

    def __init__(self, fq_path, logger=None):
        self.logger = logger
        self.fq_path = fq_path
        self.index_path = self.get_index_path(fq_path)
        self._bcodes = None
        self._bcode_off_map = None
        self._num_se = 0
        self._num_se_bcoded = 0
        if not os.path.isfile(self.index_path):
            self.__build_index__()
        else:
            self.__load_index__()
        self.f_map = None
        self.open()

    def open(self):
        assert self.f_map is None, "fp map already populated"
        self.f_map = {}
        self.f_map[self.fq_path] = open(self.fq_path)
        return self

    def close(self):
        for f in self.f_map.values():
            f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __build_index__(self):
        # Walk the barcode-sorted fastq once, recording the byte offset of
        # each barcode's first read
        numbytes = 0
        self._bcode_off_map = {}
        _num_se = 0
        _num_se_bcoded = 0
        assert not self.fq_path.endswith('.gz'), "gzipped fq not supported"
        with open(self.fq_path) as f:
            seen_set = set()
            for bcode, reads_iter in groupby(
                util.fastq_iter(f),
                lambda(x): x[0],
            ):
                assert bcode is None or bcode not in seen_set, \
                    "fastq {} NOT in barcode sorted order. Ensure reads that " \
                    "share barcodes are in a block together".format(self.fq_path)
                seen_set.add(bcode)
                if bcode is not None and bcode not in self._bcode_off_map:
                    self._bcode_off_map[bcode] = numbytes
                bcode_num_se = 0
                for _, qname, lines in reads_iter:
                    bcode_num_se += 1
                    txt = ''.join(lines)
                    numbytes += len(txt)
                _num_se += bcode_num_se
                if bcode is not None:
                    _num_se_bcoded += bcode_num_se
        self._num_se = _num_se
        self._num_se_bcoded = _num_se_bcoded
        num_bcodes = len(filter(
            lambda(b): b.endswith('-1'),
            self._bcode_off_map.keys(),
        ))
        self.logger.log('fqinfo${},{},{}'.format(
            self.num_se,
            len(self._bcode_off_map),
            num_bcodes,
        ))
        print 'writing index for fqs'
        for fq_path in [self.fq_path]:
            print ' -', fq_path
        util.write_pickle(
            self.index_path,
            (self.num_se, self.num_se_bcoded, self._bcode_off_map),
        )
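# A minimal usage sketch (paths and my_logger are hypothetical): index a
# barcode-sorted, uncompressed fastq and report its summary counts. A logger
# object with a .log() method is assumed when the index is built for the
# first time.
with FastqIndex('/path/to/barcode_sorted.fq', logger=my_logger) as fq_idx:
    print fq_idx.num_bcodes, 'barcodes across', fq_idx.num_se, 'reads'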
class FastqIndex(object):

    file_suffix = '.fqidx.p'

    @staticmethod
    def get_index_path(fq_path):
        return fq_path + FastqIndex.file_suffix

    @property
    def bcode_set(self):
        if self._bcode_set is None:
            self._bcode_set = set(self._bcode_off_map.keys())
        return self._bcode_set

    def __init__(self, fq_path, logger=None):
        self.logger = logger
        self.fq_path = fq_path
        self.index_path = self.get_index_path(fq_path)
        self._bcode_set = None
        self._bcode_off_map = None
        if not os.path.isfile(self.index_path):
            self.__build_index__()
        else:
            self.__load_index__()
        self.f_map = None
        self.open()

    def open(self):
        assert self.f_map is None, "fp map already populated"
        self.f_map = {}
        if self.fq_path.endswith('.gz'):
            index_name = self.fq_path + "i"
            if not os.path.exists(index_name):
                raise Exception("Only BGZF compression is supported")
            handle = bgzf.BgzfReader(self.fq_path)
            self.gzipped = True
        else:
            handle = open(self.fq_path)
            self.gzipped = False
        self.f_map[self.fq_path] = handle
        return self

    def close(self):
        for f in self.f_map.values():
            f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __build_index__(self):
        # Walk the barcode-sorted fastq once, recording the file position of
        # each barcode's first read (BGZF virtual offsets for .gz inputs)
        self._bcode_off_map = {}
        num_pe = 0
        if self.fq_path.endswith('.gz'):
            index_name = self.fq_path + "i"
            if not os.path.exists(index_name):
                raise Exception("Only BGZF compression is supported")
            handle = bgzf.BgzfReader(self.fq_path)
        else:
            handle = open(self.fq_path)
        seen_set = set()
        for bcode, reads_iter in groupby(
            util.fastq_iter_pos(handle),
            lambda(x): x[0],
        ):
            assert bcode is None or bcode not in seen_set, \
                "fastq {} NOT in barcode sorted order. Ensure reads that " \
                "share barcodes are in a block together".format(self.fq_path)
            seen_set.add(bcode)
            for _, qname, file_pos, lines in reads_iter:
                if bcode is not None and bcode not in self._bcode_off_map:
                    self._bcode_off_map[bcode] = file_pos
                num_pe += 1
        handle.close()
        num_bcodes = len(filter(
            lambda(b): b.endswith('-1'),
            self._bcode_off_map.keys(),
        ))
        assert num_bcodes > 0, \
            "no barcodes specified in fastq {}".format(self.fq_path)
        self.logger.log('fqinfo${},{},{}'.format(
            num_pe,
            len(self._bcode_off_map),
            num_bcodes,
        ))
        print 'writing index for fqs'
        for fq_path in [self.fq_path]:
            print ' -', fq_path
        util.write_pickle(self.index_path, self._bcode_off_map)
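# The same sketch for this BGZF-aware variant (paths and my_logger are
# hypothetical): a plain fastq opens directly, while a ".gz" input must be
# BGZF-compressed with its block index file (fq_path + "i") alongside.
with FastqIndex('/path/to/barcode_sorted.fq.gz', logger=my_logger) as fq_idx:
    print len(fq_idx.bcode_set), 'barcodes indexed'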