Example #1
def worker(proc_num, queue):
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.top_words(year, 5100)
        stop_words = vocab.top_words(year, 100)
        words = words.difference(stop_words)
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)

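        # Induce word polarities by bootstrapping the random-walk method over the year's embedding (50 bootstrap samples, 25 nearest neighbors)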
        polarities = polarity_induction_methods.bootstrap(
                 embed.get_subembed(words.union(positive_seeds).union(negative_seeds)),
                 positive_seeds, negative_seeds,
                 score_method=polarity_induction_methods.random_walk,
                 num_boots=50, n_procs=20, return_all=True,
                 beta=0.9, nn=25)
        util.write_pickle(polarities, constants.POLARITIES + year + '-coha-freq-boot.pkl')
Example #2
def write_action_spaces(dataset_name, action_space_path, model_path, ltr=False):
    output_file = action_space_path + dataset_name + "_action_space.pkl"
    print "Writing candidate actions to " + output_file
    scores = util.load_pickle(model_path + dataset_name + "_scores.pkl")
    write_probable_pairs(dataset_name, action_space_path, scores)
    probable_pairs = util.load_pickle(action_space_path + dataset_name + '_probable_pairs.pkl')

    possible_pairs_total = 0
    action_spaces = []
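    # Build one action space per document: group each anaphor's probable antecedents, then order the anaphors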
    for did in scores:
        if did in probable_pairs:
            actions = defaultdict(list)
            for (m1, m2) in probable_pairs[did]:
                actions[m2].append(m1)
            if ltr:
                actions = sorted(actions.items(), cmp=lambda (ana1, ants1), (ana2, ants2):
                                 -1 if (ana1, ana2) in scores[did] else 1)
                for i in range(len(actions) - 1):
                    assert (actions[i][0], actions[i + 1][0]) in scores[did]
            else:
                actions = sorted(actions.items(), key=lambda (ana, ants):
                                 max(scores[did][(ant, ana)] - scores[did][(-1, ana)]
                                     for ant in ants))
            possible_pairs = get_possible_pairs(probable_pairs[did])
            possible_pairs_total += len(possible_pairs)
            action_spaces.append(ActionSpace(did, actions, possible_pairs))
    util.write_pickle(action_spaces, output_file)
Example #3
def worker(proc_num, queue):
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print proc_num, "On year", year
        words = vocab.top_words(year, 5100)
        stop_words = vocab.top_words(year, 100)
        words = words.difference(stop_words)
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)

        polarities = polarity_induction_methods.bootstrap(
                 embed.get_subembed(words.union(positive_seeds).union(negative_seeds)),
                 positive_seeds, negative_seeds,
                 score_method=polarity_induction_methods.random_walk,
                 num_boots=50, n_procs=20, return_all=True,
                 beta=0.9, nn=25)
        util.write_pickle(polarities, constants.POLARITIES + year + '-coha-freq-boot.pkl')
Example #4
def worker(proc_num, queue):
    while True:
        #        time.sleep(random.random()*10)
        try:
            name = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        if name + ".pkl" in os.listdir(POLARITIES):
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
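        # Prune the dictionary: drop tokens appearing in more than 10% or fewer than 100 documents, then keep the 5,000 with the highest document frequency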
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs,
                         key=lambda w: word_dict.dfs[w],
                         reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation(
            "SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(
            set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(sub_vecs,
                                                    pos_seeds,
                                                    neg_seeds,
                                                    return_all=True,
                                                    nn=25,
                                                    beta=0.9,
                                                    num_boots=50,
                                                    n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")
Example #5
def write_docs(dataset_name):
    gold, mention_to_gold = load_gold(dataset_name)
    mentions = load_mentions(dataset_name)
    docs = []
    for did in gold:
        docs.append(
            Document(did, mentions[did], gold[did], mention_to_gold[did]))
    util.write_pickle(docs, directories.DOCUMENTS + dataset_name + '_docs.pkl')
Example #6
def write_feature_names():
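    # Map each pair feature name (read from the first raw training document) to its index and pickle the mapping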
    util.write_pickle(
        {
            f: i
            for i, f in enumerate(
                next(util.load_json_lines(directories.RAW +
                                          'train'))["pair_feature_names"])
        }, directories.MISC + 'pair_feature_names.pkl')
Example #7
def write_genres():
    sources = set()
    for dataset_name in ["train"]:
        print "Adding sources from", dataset_name
        for d in docs(dataset_name):
            sources.add(d["document_features"]["source"])
    print sources
    util.write_pickle({source: i
                       for i, source in enumerate(sorted(sources))},
                      directories.MISC + 'genres.pkl')
Example #8
    def __call__(self, finished, annealable, vBest, v2p, rCount, aCount):
        if (v2p.i % self.moduloPrint == 0) or finished:
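            # Report progress (when verbose) and checkpoint the current best schedule and optimizer state to disk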
            now = t.time()
            speed = self.moduloPrint / (now-self.t)
            if self.verbosity > 1:
                print('i=%d, ratio : %.3f, bestVal :%.3g, T=%.3g, dt=%.3fs' % (v2p.i, v2p.r, vBest, v2p.T, now-self.t))
            self.t = now
            matchL = self.config.getScheduleFromMat(annealable.bestSch)

            optState = OptState(finished, vBest, v2p, self.startTime, speed)
            write_pickle((self.config, matchL, optState), self.pklPath, lock_block=True)
Example #9
def write_words():
    words = Counter()
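    # Count normalized sentence words from the training split only (inc is 0 for dev/test); dependency relations are counted in every split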
    for dataset_name in ["train", "dev", "test"]:
        inc = 1 if dataset_name == "train" else 0
        print "Adding words from", dataset_name
        for d in docs(dataset_name):
            for mention in d["mentions"].values():
                for w in mention["sentence"]:
                    words[word_vectors.normalize(w)] += inc
                words[word_vectors.normalize(mention["dep_relation"])] += 1
    util.write_pickle(words, directories.MISC + 'word_counts.pkl')
Example #10
    def __call__(self, finished, annealable, vBest, v2p, rCount, aCount):
        if (v2p.i % self.moduloPrint == 0) or finished:
            now = t.time()
            speed = self.moduloPrint / (now - self.t)
            if self.verbosity > 1:
                print('i=%d, ratio : %.3f, bestVal :%.3g, T=%.3g, dt=%.3fs' %
                      (v2p.i, v2p.r, vBest, v2p.T, now - self.t))
            self.t = now
            matchL = self.config.getScheduleFromMat(annealable.bestSch)

            optState = OptState(finished, vBest, v2p, self.startTime, speed)
            write_pickle((self.config, matchL, optState),
                         self.pklPath,
                         lock_block=True)
Example #11
def worker(proc_num, queue):
    while True:
#        time.sleep(random.random()*10)
        try:
            name = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        if name + ".pkl" in os.listdir(POLARITIES):
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs, key=lambda w : word_dict.dfs[w], reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation("SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(sub_vecs, pos_seeds, neg_seeds, return_all=True,
                nn=25, beta=0.9, num_boots=50, n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")
Example #12
    def run_evaluation(self):
        train_scores, train_loss, dev_pairs = evaluate(self, self.dev_docs,
                                                       self.dev_data,
                                                       "Evaluating on train")
        test_scores, test_loss, test_pairs = evaluate(self, self.test_docs,
                                                      self.test_data,
                                                      "Evaluating on test")
        epoch_stats = {
            "epoch": self.epoch,
            "n": self.n,
            "train_loss": train_loss,
            "test_loss": test_loss
        }
        epoch_stats.update(
            {"train " + k: v
             for k, v in train_scores.iteritems()})
        epoch_stats.update(
            {"test " + k: v
             for k, v in test_scores.iteritems()})
        self.history.append(epoch_stats)
        util.write_pickle(self.history, directories.CLUSTERER + 'history.pkl')
        timer.print_totals()

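        # Save the model whenever the test CoNLL score improves, both overall ("best") and within the current write window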
        test_conll = epoch_stats["test conll"]
        if self.epoch % self.write_every == 0:
            self.best_conll_window = 0
        if test_conll > self.best_conll:
            self.best_conll = test_conll
            print "New best CoNLL, saving model"
            self.save_progress(dev_pairs, test_pairs, "best")
        if test_conll > self.best_conll_window:
            self.best_conll_window = test_conll
            print "New best CoNLL in window, saving model"
            self.save_progress(
                dev_pairs, test_pairs,
                str(self.write_every * int(self.epoch / self.write_every)))
        self.model.save_weights(directories.CLUSTERER + "weights.hdf5",
                                overwrite=True)
Example #13
def write_probable_pairs(dataset_name, action_space_path, scores):
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in util.logged_loop(scores):
        doc_scores = scores[did]
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] - (-1 - 0.3*doc_scores[(-1, pr[1])]),
                       reverse=True)

        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3*doc_scores[(-1, pair[1])])
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

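        # Margin filter: for each anaphor, drop pairs that fall too far below its best-scoring candidate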
        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair], -1 - 0.3*doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]], doc_scores[pair])
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [p for p in probable_pairs[did] if
                               doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print "num docs:", len(scores)
    print "avg size without filter: {:.1f}".format(total_pairs / float(len(scores)))
    print "avg size: {:.1f}".format(total_size / float(len(scores)))
    print "margin removals size: {:.1f}".format(margin_removals / float(len(scores)))
    util.write_pickle(probable_pairs, action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
Example #14
def write_document_vectors():
    vectors = word_vectors.WordVectors(load=True)
    for dataset_name in ["train", "dev", "test"]:
        print "Building document vectors for", dataset_name
        doc_vectors = {}
        for d in docs(dataset_name):
            sentences = {}
            did = None
            for mention_num in sorted(d["mentions"].keys(), key=int):
                m = d["mentions"][mention_num]
                did = m["doc_id"]
                if m['sent_num'] not in sentences:
                    sentences[m['sent_num']] = m['sentence']

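            # Document vector: average of the word vectors over every sentence that contains a mention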
            v = np.zeros(vectors.vectors[0].size)
            n = 0
            for s in sentences.values():
                for w in s:
                    v += vectors.vectors[vectors[w]]
                    n += 1
            doc_vectors[did] = v / n
        util.write_pickle(
            doc_vectors,
            directories.MISC + dataset_name + "_document_vectors.pkl")
Example #15
    def write(self, path=directories.RELEVANT_VECTORS):
        np.save(path + 'word_vectors', np.vstack(self.vectors))
        util.write_pickle(self.vocabulary, path + 'vocabulary.pkl')
Example #16
    def write(self, path):
        util.write_pickle(self.__dict__, path)
Example #17
class FastqIndex(object):

  file_suffix = '.fqidx.p'

  @staticmethod
  def get_index_path(fq_path):
    return fq_path + FastqIndex.file_suffix

  @property
  def bcodes(self):
    if self._bcodes == None:
      self._bcodes = set(self._bcode_off_map.keys())
    return self._bcodes

  @property
  def num_bcodes(self): return len(self.bcodes)
  @property
  def num_se(self): return self._num_se
  @property
  def num_se_bcoded(self): return self._num_se_bcoded

  def __init__(
    self,
    fq_path,
    logger=None,
  ):
    
    self.logger = logger
    self.fq_path = fq_path
    self.index_path = self.get_index_path(fq_path)
    self._bcodes = None
    self._bcode_off_map = None
    self._num_se = 0
    self._num_se_bcoded = 0

    if not os.path.isfile(self.index_path):
      self.__build_index__()
    else:
      self.__load_index__()

    self.f_map = None
    self.open()

  def open(self):
    assert self.f_map == None, "fp map already populated"
    self.f_map = {}
    self.f_map[self.fq_path] = open(self.fq_path)
    return self

  def close(self):
    for f in self.f_map.values():
      f.close()
    return

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self.close()

  def __build_index__(self):  
    numbytes = 0
    self._bcode_off_map = {}
    _num_se = 0
    _num_se_bcoded = 0

    assert not self.fq_path.endswith('.gz'), \
      "gzipped fq not supported"
    with open(self.fq_path) as f:
      seen_set = set()
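      # Iterate reads grouped by barcode (input must be barcode-sorted) and record each barcode's starting byte offset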
      for bcode, reads_iter in groupby(
        util.fastq_iter(f),
        lambda(x): x[0],
      ):
        assert bcode == None or bcode not in seen_set, \
"fastq {} NOT in barcode sorted order. Ensure reads that share barcodes \
are in a block together".format(self.fq_path)
        seen_set.add(bcode)
        if bcode != None and bcode not in self._bcode_off_map:
          self._bcode_off_map[bcode] = numbytes
        bcode_num_se = 0
        for _, qname, lines in reads_iter:
          bcode_num_se += 1
          txt = ''.join(lines)
          numbytes += len(txt)
        _num_se += bcode_num_se
        if bcode != None:
          _num_se_bcoded += bcode_num_se

    self._num_se = _num_se
    self._num_se_bcoded = _num_se_bcoded
    num_bcodes = len(filter(
      lambda(b): b.endswith('-1'),
      self._bcode_off_map.keys(),
    ))

    self.logger.log('fqinfo${},{},{}'.format(
      self.num_se, len(self._bcode_off_map), num_bcodes,
    ))
    print 'writing index for fqs'
    for fq_path in [self.fq_path]:
      print '  -', fq_path
    util.write_pickle(
      self.index_path, 
      (self.num_se, self.num_se_bcoded, self._bcode_off_map),
    )
Example #18
class FastqIndex(object):

  file_suffix = '.fqidx.p'

  @staticmethod
  def get_index_path(fq_path):
    return fq_path + FastqIndex.file_suffix

  @property
  def bcode_set(self):
    if self._bcode_set == None:
      self._bcode_set = set(self._bcode_off_map.keys())
    return self._bcode_set

  def __init__(
    self,
    fq_path,
    logger=None,
  ):
    
    self.logger = logger
    self.fq_path = fq_path
    self.index_path = self.get_index_path(fq_path)
    self._bcode_set = None
    self._bcode_off_map = None

    if not os.path.isfile(self.index_path):
      self.__build_index__()
    else:
      self.__load_index__()

    self.f_map = None
    self.open()

  def open(self):
    assert self.f_map == None, "fp map already populated"
    self.f_map = {}

    if self.fq_path.endswith('.gz'):
      index_name = self.fq_path + "i"
      if not os.path.exists(index_name):
        raise Exception("Only BGZF compression is supported")

      handle = bgzf.BgzfReader(self.fq_path)
      self.gzipped = True
    else:
      handle = open(self.fq_path)
      self.gzipped = False

    self.f_map[self.fq_path] = handle
    return self

  def close(self):
    for f in self.f_map.values():
      f.close()
    return

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self.close()

  def __build_index__(self):  
    numbytes = 0
    self._bcode_off_map = {}
    num_pe = 0

    if self.fq_path.endswith('.gz'):
      index_name = self.fq_path + "i"
      if not os.path.exists(index_name):
        raise Exception("Only BGZF compression is supported")

      handle = bgzf.BgzfReader(self.fq_path)
    else:
      handle = open(self.fq_path)

    seen_set = set()
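    # Group reads by barcode and record the file position of each barcode's first read (plain or BGZF fastq)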
    for bcode, reads_iter in groupby(
      util.fastq_iter_pos(handle),
      lambda(x): x[0],
    ):
      assert bcode == None or bcode not in seen_set, \
"fastq {} NOT in barcode sorted order. Ensure reads that share barcodes \
are in a block together".format(self.fq_path)
      seen_set.add(bcode)
      for _, qname, file_pos, lines in reads_iter:
        if bcode != None and bcode not in self._bcode_off_map:
          self._bcode_off_map[bcode] = file_pos
        num_pe += 1
    handle.close()

    num_bcodes = len(filter(
      lambda(b): b.endswith('-1'),
      self._bcode_off_map.keys(),
    ))
    assert num_bcodes > 0, \
      "no barcodes specified in fastq {}".format(self.fq_path)
    self.logger.log('fqinfo${},{},{}'.format(
      num_pe, len(self._bcode_off_map), num_bcodes,
    ))
    print 'writing index for fqs'
    for fq_path in [self.fq_path]:
      print '  -', fq_path
    util.write_pickle(self.index_path, self._bcode_off_map)