Example #1
def print_cost_estimates(commons_path, corpora_path):
  from corpora import Corpora

  train = Corpora(corpora_path, commons_path, gold=True)
  actions = Actions()
  for document in train:
    for action in document.gold:
      actions.add(action)

  train.rewind()

  cascades = [cascade_class(actions) for cascade_class in \
    [FlatCascade, ShiftCascade, ShiftMarkCascade, ShiftPropbankEvokeCascade]]
  costs = [0] * len(cascades)
  counts = [[0] * cascade.size() for cascade in cascades]
  for document in train:
    gold = document.gold
    for index, cascade in enumerate(cascades):
      cascade_gold_sequence = cascade.translate(gold)
      delegate = 0
      cost = 0
      for cascade_gold in cascade_gold_sequence:
        cost += cascade.delegates[delegate].size()
        counts[index][delegate] += 1
        if cascade_gold.is_cascade():
          delegate = cascade_gold.delegate
        else:
          delegate = 0
      costs[index] += cost
  for c, cost, cascade in zip(counts, costs, cascades):
    print("\n", cascade.__class__.__name__, "cost =", cost, "\n",
          "Delegate invocations:", c, "\n", cascade)
Example #2
def initialize_corpora(data_path, data_prefix, dict_path, iterator, **kwargs):

    # Forward the iterator choice and any extra keyword arguments to Corpora.
    corpora_params = dict(data_path=data_path, prefix=data_prefix,
                          iterator=iterator, **kwargs)

    if os.path.exists(dict_path):
        corpora = Corpora(dictionary=dict_path, **corpora_params)
    else:
        corpora = Corpora(**corpora_params).build()
        corpora.dictionary.save_as_text(dict_path)

    if len(corpora) == 0:
        raise ValueError(
            f'Did not find any documents from path: {data_path} for given prefix {data_prefix}'
        )

    return corpora
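
A minimal usage sketch for the helper above; the paths and prefix are hypothetical, and passing iterator='bow' simply mirrors Example #12 further down:

# Hypothetical paths/prefix -- adjust to the actual corpus layout.
DATA_PATH = 'data/parsed'
DATA_PREFIX = 'CVPR'
DICT_PATH = 'data/cvpr_DICT.txt'  # created on the first run, reused afterwards

if __name__ == '__main__':
    corpora = initialize_corpora(DATA_PATH, DATA_PREFIX, DICT_PATH, iterator='bow')
    print('Loaded', len(corpora), 'documents')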
Example #3
def dev_accuracy(commons_path, dev_path, tmp_folder, caspar):
  dev = Corpora(dev_path, caspar.spec.commons)
  print("Annotating dev documents", now(), mem())
  test_path = os.path.join(tmp_folder, "dev.annotated.rec")
  writer = sling.RecordWriter(test_path)
  count = 0
  start_time = time.time()

  cascade = caspar.spec.cascade
  dev_total = [0] * cascade.size()
  dev_disallowed = [0] * cascade.size()
  for document in dev:
    state, disallowed, total, trace = \
      caspar.forward(document, train=False, debug=True)
    state.write()
    trace.write()
    writer.write(str(count), state.encoded())
    count += 1
    if count % 100 == 0:
      print("  Annotated", count, "documents", now(), mem())
    for i, c in enumerate(disallowed):
      dev_total[i] += total[i]
      dev_disallowed[i] += c
  writer.close()
  end_time = time.time()
  print("Annotated", count, "documents in", "%.1f" % (end_time - start_time),
        "seconds", now(), mem())
  print("Disallowed/Total leaf actions for", cascade.__class__.__name__)
  for i, c in enumerate(dev_disallowed):
    print("Delegate", i, "disallowed", c, "out of", dev_total[i])

  return utils.frame_evaluation(gold_corpus_path=dev_path, \
                                test_corpus_path=test_path, \
                                commons_path=commons_path)
Example #4
def train(args):
    check_present(
        args,
        ["train_corpus", "output_folder", "dev_corpus", "train_shuffle_seed"])

    train_corpus_path = args.train_corpus
    if args.train_shuffle_seed > 0:
        reader = sling.RecordReader(args.train_corpus)
        items = [(key, value) for key, value in reader]
        reader.close()
        r = random.Random(args.train_shuffle_seed)
        r.shuffle(items)
        train_corpus_path = os.path.join(args.output_folder,
                                         "train_shuffled.rec")
        writer = sling.RecordWriter(train_corpus_path)
        for key, value in items:
            writer.write(key, value)
        writer.close()
        print("Wrote shuffled train corpus to %s using seed %d" % \
              (train_corpus_path, args.train_shuffle_seed))

    # Setting an explicit seed for the sake of determinism.
    torch.manual_seed(1)

    # Make commons store if needed.
    if args.commons == '' or not os.path.exists(args.commons):
        if args.commons == '':
            fname = os.path.join(args.output_folder, "commons")
            print("Will create a commons store at", fname)
            args.commons = fname
        else:
            print("No commons found at", args.commons, ", creating it...")
        _, symbols = commons_builder.build(
            [train_corpus_path, args.dev_corpus], args.commons)
        print("Commons created at", args.commons, "with", len(symbols), \
            "symbols besides the usual ones.")

    # Make the training spec.
    spec = Spec()
    spec.build(args.commons, train_corpus_path)

    # Initialize the model with the spec and any word embeddings.
    caspar = Caspar(spec)
    embeddings_file = args.word_embeddings
    if embeddings_file == '': embeddings_file = None
    caspar.initialize(embeddings_file)

    tmp_folder = os.path.join(args.output_folder, "tmp")
    if not os.path.exists(tmp_folder):
        os.makedirs(tmp_folder)

    evaluator = partial(dev_accuracy, args.dev_corpus, tmp_folder)

    output_file_prefix = os.path.join(args.output_folder, "caspar")
    hyperparams = Hyperparams(args)
    print("Using hyperparameters:", hyperparams)

    trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix)
    train = Corpora(train_corpus_path, spec.commons, gold=True)
    trainer.train(train)
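
The entry point above only reads attributes off args; a sketch of a matching argparse setup, assuming the real flag parser (plus whatever extra flags Hyperparams consumes) lives elsewhere in the original tool:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train_corpus", default="", help="record file with gold training documents")
parser.add_argument("--dev_corpus", default="", help="record file with dev documents")
parser.add_argument("--output_folder", default="", help="output folder for the commons, tmp files, and the model")
parser.add_argument("--commons", default="", help="existing commons store; built from the corpora if empty or missing")
parser.add_argument("--word_embeddings", default="", help="optional pretrained word embeddings")
parser.add_argument("--train_shuffle_seed", type=int, default=0, help="shuffle the training corpus when > 0")
# train(parser.parse_args())  # Hyperparams(args) reads its own flags in addition to these.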
Example #5
    def update_query(self):
        results = [r["Description"] + r["Title"] for r in self.results]
        try:
            corpora = Corpora(self.query, results, self.selectedIDs)
            print(corpora)
        except Exception:
            # Bail out: query expansion needs the NLTK corpora to be installed.
            print("NLTK corpora not installed; query expansion requires the NLTK corpora")
            return
        print("Augmenting query...")

        # Filter: choose words that are not in the query already.
        candidates = [(w, s) for w, s in corpora.getUpdatedQuery()
                      if w not in set(self.query.split())]
        (w1, s1), (w2, s2) = candidates[0], candidates[1]
        newQueryWords = [w1, w2] if s1 == s2 else [w1]

        # Build the new query and restart the search.
        self.query = " ".join([self.query] + newQueryWords)
        print("Restarting search with query:", self.query)
        self.start()
Example #7
def train(args):
  check_present(args, ["train_corpus", "output_folder", "dev_corpus"])

  # Setting an explicit seed for the sake of determinism.
  torch.manual_seed(1)

  # Make commons store if needed.
  if args.commons == '' or not os.path.exists(args.commons):
    if args.commons == '':
      fname = os.path.join(args.output_folder, "commons")
      print("Will create a commons store at", fname)
      args.commons = fname
    else:
      print("No commons found at", args.commons, ", creating it...")
    _, symbols = commons_builder.build(
      [args.train_corpus, args.dev_corpus], args.commons)
    print("Commons created at", args.commons, "with", len(symbols),
          "symbols besides the usual ones.")

  # Make the training spec.
  spec = Spec()
  spec.build(args.commons, args.train_corpus)

  # Initialize the model with the spec and any word embeddings.
  caspar = Caspar(spec)
  embeddings_file = args.word_embeddings
  if embeddings_file == '': embeddings_file = None
  caspar.initialize(embeddings_file)

  tmp_folder = os.path.join(args.output_folder, "tmp")
  if not os.path.exists(tmp_folder):
    os.makedirs(tmp_folder)

  evaluator = partial(dev_accuracy,
                      args.commons,
                      args.dev_corpus,
                      tmp_folder)

  output_file_prefix = os.path.join(args.output_folder, "caspar")
  hyperparams = Hyperparams(args)
  print("Using hyperparameters:", hyperparams)

  trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix)
  train = Corpora(args.train_corpus, spec.commons, gold=True)
  trainer.train(train)
Example #8
def run(args):
    check_present(args, ["input", "parser", "output"])
    assert os.path.exists(args.input), args.input
    assert os.path.exists(args.parser), args.parser

    # Read parser flow.
    flow = Flow()
    flow.load(args.parser)

    # Initialize the spec from the flow.
    spec = Spec()
    spec.from_flow(flow)

    # Initialize the model from the flow.
    caspar = Caspar(spec)
    caspar.from_flow(flow)

    corpus = Corpora(args.input, caspar.spec.commons)
    writer = sling.RecordWriter(args.output)
    count = 0
    for document in corpus:
        state, _, _, trace = caspar.forward(document,
                                            train=False,
                                            debug=args.trace)
        state.write()
        if trace:
            trace.write()
        writer.write(str(count), state.encoded())
        count += 1
        if count % 100 == 0:
            print("Annotated", count, "documents", now(), mem())
    writer.close()
    print("Annotated", count, "documents", now(), mem())
    print("Wrote annotated documents to", args.output)

    if args.evaluate:
        f = tempfile.NamedTemporaryFile(delete=False)
        fname = f.name
        caspar.spec.commons.save(fname, binary=True)
        f.close()
        eval_result = frame_evaluation(gold_corpus_path=args.input, \
            test_corpus_path=args.output, commons=caspar.spec.commons)
        os.unlink(fname)
        return eval_result
Example #9
    def build(self, commons_path, corpora_path):
        # Prepare lexical dictionaries.
        self.words = Lexicon(self.words_normalize_digits)
        self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

        # Initialize training corpus.
        corpora = Corpora(corpora_path, commons_path)

        # Collect word and affix lexicons.
        for document in corpora:
            for token in document.tokens:
                word = token.word
                self.words.add(word)
                for s in self.get_suffixes(word):
                    assert type(s) is str
                    self.suffix.add(s)
        print("Words:", self.words.size(), "items in lexicon, including OOV")
        print("Suffix:", self.suffix.size(), "items in lexicon")

        # Load the commons store, but don't freeze it yet. We will add the action
        # table and cascade specification to it.
        self.commons_path = commons_path
        self.commons = sling.Store()
        self.commons.load(commons_path)
        schema = sling.DocumentSchema(self.commons)

        # Prepare action table and cascade.
        self._build_action_table(corpora)
        self.cascade = cascade.ShiftMarkCascade(self.actions)
        print(self.cascade)

        # Save cascade specification in commons.
        _ = self.cascade.as_frame(self.commons,
                                  delegate_cell_prefix="delegate")

        # Freeze the commons store.
        self.commons.freeze()

        # Add feature specs.
        self._specify_features()
Example #10
def beer_tax():
    return Corpora(TAX_SYSTEM_IN_US)
Example #11
def getty():
    return Corpora(GETTYSBURG)
Example #12
def main(model, alpha, gamma, kappa, n_topics, data_path, data_prefix,
         result_path, dictionary_path, stopwords, vectors_path, batch_size,
         iterations, passes, n_words, shuffle):

    if not os.path.exists(result_path):
        raise OSError(f'Provided path {result_path} does not exist.')

    corpora_params = dict(data_path=data_path,
                          prefix=data_prefix,
                          iterator='bow',
                          stopwords=stopwords)

    if os.path.exists(dictionary_path):
        corpora = Corpora(dictionary=dictionary_path, **corpora_params)
    else:
        corpora = Corpora(**corpora_params).build()
        corpora.dictionary.save_as_text(dictionary_path)

    if len(corpora) == 0:
        raise ValueError(
            f'Did not find any documents from path: {data_path} for given prefix {data_prefix}'
        )

    MAP = dict(lda=(LDAWrapper,
                    dict(n_topics=n_topics,
                         alpha=alpha,
                         iterations=iterations,
                         passes=passes,
                         batch_size=batch_size,
                         id2word=corpora.dictionary)),
               shdp=(SHDPWrapper,
                     dict(n_topics=n_topics,
                          alpha=alpha,
                          gamma=gamma,
                          passes=passes,
                          batch_size=batch_size,
                          batch_shuffle=shuffle,
                          vector_map=load_vectors(
                              vectors_path, dictionary=corpora.dictionary),
                          num_docs=len(corpora))))

    model_class, params = MAP[model]
    topic_model = model_class(**params)

    # Gather the data up front since the BOW representation is lightweight.
    data = [doc for doc, _ in corpora]
    for i, seq in enumerate(data):
        if len(seq) < 1:
            raise AssertionError(f'Empty seq at index {i}')

    if shuffle:
        np.random.shuffle(data)

    topic_model.fit(data)

    model_name = str(topic_model) + f'_{data_prefix}'
    years = corpora.years
    if years:
        model_name += f'{years[0]}-{years[-1]}'
    if shuffle:
        model_name += '_shuffled'

    path_dir = os.path.join(result_path, model_name)
    if not os.path.exists(path_dir):
        os.mkdir(path_dir)

    if hasattr(topic_model, 'save'):
        topic_model.save(os.path.join(path_dir, model_name))

    topic_df = document_topics(topic_model, corpora)

    dictionary = None
    if model == 'shdp':
        dictionary = corpora.dictionary

    words_df = model_words(topic_model, n=n_words, dictionary=dictionary)

    topics_path = os.path.join(path_dir, 'topics.csv')
    words_path = os.path.join(path_dir, 'words.csv')

    topic_df.to_csv(topics_path, index=False)
    words_df.to_csv(words_path, index=False)
Example #13
import numpy as np
import matplotlib.pyplot as plt

from discord.ext import commands

# last one sitting universe
#TODO: Add channel ID HERE
channels = [0000000]

VC = ""

bot = commands.Bot(command_prefix=">")

from corpora import Corpora

Corpora(bot)
"""
------------------------------
Members
------------------------------
"""


@bot.command("docs")
async def hello(ctx):
    with open("help.txt") as afile:
        docs = afile.read()
    await ctx.send(docs)


"""
Example #14
import os
import sys

import mxnet as mx
from matplotlib import style
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from bert_embedding import BertEmbedding

sys.path.append('..')

style.use('ggplot')

from corpora import Corpora

data_path = 'M:/Projects/KeyTopicDetection/parsed'
dict_path = '../../data/cvpr_13-18_DICT.txt'

if os.path.exists(dict_path):
    corpora = Corpora(data_path=data_path,
                      prefix='CVPR',
                      iterator='bow',
                      dictionary=dict_path)
else:
    corpora = Corpora(data_path=data_path,
                      prefix='CVPR',
                      iterator='bow',
                      word_up_limit=0.75,
                      word_low_limit=20).build()
    corpora.dictionary.save_as_text(dict_path)

ctx = mx.gpu(0)
bert = BertEmbedding(ctx=ctx)


def visualize_clusters(tw, data):
    db = DBSCAN(eps=0.5, min_samples=50).fit(data)