Example 1
import copy  # needed for copy.deepcopy below
# transpose, flatten and get_ngram come from a shared helper module
# (a plausible sketch is given after this example).

def precision_recall(golds, predictions, N=1):
  assert len(golds) == len(predictions)
  golds = transpose(golds)
  predictions = transpose(predictions)

  precisions, recalls = [], []
  for golds_per_col, preds_per_col in zip(golds, predictions):
    TP = []
    FP = []
    FN = []
    for g, p in zip(golds_per_col, preds_per_col):
      tp = []
      g_ngrams = flatten(get_ngram(g, 1, N))
      p_ngrams = flatten(get_ngram(p, 1, N))
      for pn in copy.deepcopy(p_ngrams):
        if pn in g_ngrams:
          tp.append(pn)
          g_ngrams.pop(g_ngrams.index(pn))
          p_ngrams.pop(p_ngrams.index(pn))
      fn = g_ngrams
      fp = p_ngrams
      TP.append(tp)
      FP.append(fp)
      FN.append(fn)
    TP = len(flatten(TP))
    FP = len(flatten(FP))
    FN = len(flatten(FN))
    precisions.append(1.0*TP/(TP+FP))
    recalls.append(1.0*TP/(TP+FN))
  return precisions, recalls
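The snippet above relies on transpose, flatten and get_ngram helpers that are not shown in this listing. Below is a minimal sketch of plausible implementations plus a toy call; these are assumptions for illustration, not the original common.* code.

# Assumed helper implementations (not the original module code).
def flatten(lists):
  return [item for sublist in lists for item in sublist]

def transpose(rows):
  return [list(col) for col in zip(*rows)]

def get_ngram(tokens, min_n, max_n):
  # One list of n-gram tuples per n in [min_n, max_n].
  return [[tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
          for n in range(min_n, max_n + 1)]

# Toy call: one row, one column; each cell is a token list.
golds = [[['the', 'cat', 'sat']]]
preds = [[['the', 'cat', 'ran']]]
print(precision_recall(golds, preds, N=1))  # ([0.666...], [0.666...])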
Example 2
    def test(self, data):
        inputs = []
        outputs = []
        e_predictions = []
        j_predictions = []
        num_steps = 0
        epoch_time = 0.0
        for i, batch in enumerate(data):
            feed_dict = self.get_input_feed(batch, False)
            # for x,resx in zip(self.debug, self.sess.run(self.debug, feed_dict)):
            #    print x
            #    print resx.shape
            # exit(1)

            t = time.time()
            inputs.append(batch.texts)
            batch_predictions = self.sess.run(self.e_predictions, feed_dict)
            batch_predictions = np.transpose(batch_predictions, (0, 2, 1))
            e_predictions.append(batch_predictions)
            batch_predictions = self.sess.run(self.j_predictions, feed_dict)
            batch_predictions = np.transpose(batch_predictions, (0, 2, 1))
            j_predictions.append(batch_predictions)

            epoch_time += time.time() - t
            num_steps += 1
        inputs = flatten(inputs)
        e_predictions = flatten(e_predictions)
        j_predictions = flatten(j_predictions)
        inputs = [self.vocab.e_word.id2sent(u, join=True) for u in inputs]
        outputs = inputs
        e_predictions = [[self.vocab.e_word.id2sent(r, join=True) for r in p]
                         for p in e_predictions]
        j_predictions = [[self.vocab.j_word.id2sent(r, join=True) for r in p]
                         for p in j_predictions]
        return (inputs, outputs, e_predictions, j_predictions), epoch_time
Example 3
  def create_vocab(self, texts, vocab_path, vocab_size=0):
    '''
    Args:
     - texts: List of words.
     - vocab_path: The path to which the vocabulary will be saved.
    '''
    start_vocab = self.start_vocab
    rev_vocab, freq = zip(*collections.Counter(texts).most_common())
    rev_vocab = common.flatten([self.tokenizer(w) for w in rev_vocab])
    if type(rev_vocab[0]) == list:
      rev_vocab = common.flatten(rev_vocab)
    rev_vocab = OrderedSet(start_vocab + rev_vocab)
    if vocab_size:
      rev_vocab = OrderedSet([w for i, w in enumerate(rev_vocab) if i < vocab_size])
    freq = [0 for _ in start_vocab] + list(freq)
    freq = freq[:len(rev_vocab)]
    vocab = collections.OrderedDict()
    for i,t in enumerate(rev_vocab):
      vocab[t] = i

    # Save the vocabulary to a file.
    if vocab_path is not None:
      with open(vocab_path, 'w') as f:
        for k, v in zip(rev_vocab, freq):
          if type(k) == unicode:
            k = k.encode('utf-8')
          f.write('%s\t%d\n' % (k,v))
    return vocab, rev_vocab
Example 4
 def test(self, data):
     inputs = []
     outputs = []
     speaker_changes = []
     predictions = []
     num_steps = 0
     epoch_time = 0.0
     for i, batch in enumerate(data):
         feed_dict = self.get_input_feed(batch, False)
         # for x,resx in zip(self.debug, self.sess.run(self.debug, feed_dict)):
         #    print x
         #    print resx.shape
         # exit(1)
         t = time.time()
         batch_predictions = self.sess.run(self.predictions, feed_dict)
         epoch_time += time.time() - t
         num_steps += 1
         inputs.append(batch.w_contexts)
         outputs.append(batch.responses)
         speaker_changes.append(batch.speaker_changes)
         predictions.append(batch_predictions)
     inputs = flatten(inputs)
     outputs = flatten(outputs)
     speaker_changes = flatten(speaker_changes)
     predictions = flatten(predictions)
     inputs = [[self.w_vocab.id2sent(u, join=True) for u in c]
               for c in inputs]
     outputs = [self.w_vocab.id2sent(r, join=True) for r in outputs]
      # [batch_size, utterance_max_len, beam_width] -> [batch_size, beam_width, utterance_max_len]
     predictions = [[self.w_vocab.id2sent(r, join=True) for r in zip(*p)]
                    for p in predictions]
     speaker_changes = [BooleanVocab.id2sent(sc) for sc in speaker_changes]
     return (inputs, outputs, speaker_changes, predictions), epoch_time
Example 5
 def oov_rate(self):
   if not self.load:
     return None
   context_tokens = common.flatten(self.symbolized.w_contexts, depth=2)
   response_tokens = common.flatten(self.symbolized.responses)
   context_tokens = Counter(context_tokens)
   response_tokens = Counter(response_tokens)
   context_unk_rate = 1.0 * context_tokens[UNK_ID] / sum(context_tokens.values())
   response_unk_rate = 1.0 * response_tokens[UNK_ID] / sum(response_tokens.values())
   return context_unk_rate, response_unk_rate
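For reference, a self-contained sketch of the unk-rate arithmetic used above; UNK_ID and the token IDs are made-up values, not taken from the corpus.

from collections import Counter

UNK_ID = 0                                   # assumed value
response_tokens = Counter([0, 5, 7, 0, 9])   # two UNK tokens out of five
unk_rate = 1.0 * response_tokens[UNK_ID] / sum(response_tokens.values())
print(unk_rate)  # 0.4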
Example 6
 def _matching(g, p, N):
   p_ngrams = common.get_ngram(p, 1, N)
   g_ngrams = common.get_ngram(g, 1, N)
   TP = []
   FP = []
   FN = []
   for gn, pn in zip(g_ngrams, p_ngrams):
     tp, fp, fn = exact_matching(gn, pn)
     TP.extend(tp)
     FP.extend(fp)
     FN.extend(fn)
   assert len(TP + FN) == len(common.flatten(g_ngrams))
   assert len(TP + FP) == len(common.flatten(p_ngrams))
   return TP, common.flatten(g_ngrams), common.flatten(p_ngrams)  # FP, FN
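exact_matching is not included in this listing; a plausible version consistent with the assertions above is a multiset intersection over n-grams, assuming each n-gram is a hashable tuple (an assumption, not the original implementation).

from collections import Counter

def exact_matching(g_ngrams, p_ngrams):
  g_counts = Counter(g_ngrams)
  p_counts = Counter(p_ngrams)
  tp = list((g_counts & p_counts).elements())  # n-grams found in both
  fp = list((p_counts - g_counts).elements())  # predicted but not in gold
  fn = list((g_counts - p_counts).elements())  # in gold but not predicted
  return tp, fp, fn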
Example 7
def ngram_matching(gold, pred, N):
  # pred, gold: list of Ngrams.
  
  gold = [x for x in gold if x]
  pred = [x for x in pred if x]
  def _matching(g, p, N):
    p_ngrams = common.get_ngram(p, 1, N)
    g_ngrams = common.get_ngram(g, 1, N)
    TP = []
    FP = []
    FN = []
    for gn, pn in zip(g_ngrams, p_ngrams):
      tp, fp, fn = exact_matching(gn, pn)
      TP.extend(tp)
      FP.extend(fp)
      FN.extend(fn)
    assert len(TP + FN) == len(common.flatten(g_ngrams))
    assert len(TP + FP) == len(common.flatten(p_ngrams))
    return TP, common.flatten(g_ngrams), common.flatten(p_ngrams)  # FP, FN

  # Example:
  if args.debug:
    pred = ['at $ __NUM__'.split(), "$ __NUM__ or".split()]
    gold = ["$ __NUM__ or more".split(), 'less $ __NUM__'.split(), "at least $ __NUM__".split()]
  f1 = np.zeros((len(gold), len(pred)))
  result_matrix = [[] for _ in xrange(len(gold))]
  for i in xrange(len(gold)):
    for j in xrange(len(pred)):
      tp, g_ngrams, p_ngrams = _matching(gold[i], pred[j], N)
      result_matrix[i].append(tp)
      prec = 1.0 * len(tp) / len(p_ngrams)
      recall = 1.0 * len(tp) / len(g_ngrams)
      f1[i][j] = 0.5 * (prec + recall)  # arithmetic mean of precision and recall
  matching = linear_assignment(-f1)
  if args.debug:
    print pred
    print gold
    print 
    for i, j in matching:
      print gold[i], pred[j]
      print common.get_ngram(gold[i], 1, N)
      print common.get_ngram(pred[j], 1, N)
    print matching
    exit(1)
  TP = common.flatten([result_matrix[i][j] for i, j in matching])
  gold_ngrams = common.flatten([common.flatten(common.get_ngram(g, 1, N)) for g in gold])
  pred_ngrams = common.flatten([common.flatten(common.get_ngram(p, 1, N)) for p in pred])
  return TP, gold_ngrams, pred_ngrams
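The tuple returned by ngram_matching can then be turned into micro precision/recall in the same way as Example 1. A hedged usage sketch follows; the gold/pred values are invented, and it assumes an args namespace with args.debug falsy plus the common and linear_assignment dependencies in the snippet's Python 2 environment.

gold = ['$ __NUM__ or more'.split(), 'at least $ __NUM__'.split()]
pred = ['at $ __NUM__'.split()]
tp, gold_ngrams, pred_ngrams = ngram_matching(gold, pred, N=2)
precision = 1.0 * len(tp) / len(pred_ngrams) if pred_ngrams else 0.0
recall = 1.0 * len(tp) / len(gold_ngrams) if gold_ngrams else 0.0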
Example 8
 def extract(self, indices, lines):
   
   # When indices are provided, the lengths of lines and indices can differ,
   # since indices (and cluster_ids) are assigned per NUM token in a line.
   patterns_with_scores = self.get_patterns_with_score()
   if indices is not None:
     # Align.
     idx_by_line = [[] for _ in xrange(len(lines))]
     for l_idx, t_idx in indices:
       idx_by_line[l_idx].append(t_idx)
     predictions = []
     for line, idxs in zip(lines, idx_by_line):
       spans = common.flatten([extract_around_target(line, t_idx, patterns_with_scores) for t_idx in idxs])
       spans = sorted(spans, key=lambda x:-x[1])
       accepted_spans = []
       for new_span, score in spans:
         existing_spans = [span for span, _ in accepted_spans]
         if common.no_overlaps(existing_spans, new_span):
           accepted_spans.append((new_span, score))
       accepted_spans = sorted([span for span, _ in accepted_spans], key=lambda x:x[0])
       exprs = spans2exprs(accepted_spans, line)
       predictions.append(exprs)
   else:
     predictions = []
     for i, line in enumerate(lines):
       exprs = spans2exprs(get_ngram_matches(line, patterns_with_scores), line)
       predictions.append(exprs)
     #predictions = [spans2exprs(get_ngram_matches(line, patterns_with_scores), line) for line in lines]
   return predictions, None
Example 9
def contain_synonym_around_num(sentence, num_indices, window_width=4):
    '''
    sentence: List of strings (a lemmatized and tokenized sentence).
    num_indices: List of integers (the indices where NUM is).
    '''
    # TODO: remove words that can be irrelevant.
    # synonyms = set([
    #   'amount', 'bill', 'cost', 'demand', 'discount', 'estimate', 'expenditure', 'expense', 'fare', 'fee', 'figure', 'output', 'pay', 'payment', 'premium', 'rate', 'return', 'tariff', 'valuation', 'worth', 'appraisal', 'assessment', 'barter', 'bounty', 'ceiling', 'charge', 'compensation', 'consideration', 'damage', 'disbursement', 'dues', 'exaction', 'hire', 'outlay', 'prize', 'quotation', 'ransom', 'reckoning', 'retail', 'reward', 'score', 'sticker', 'tab', 'ticket', 'toll', 'tune', 'wages', 'wholesale', 'appraisement',
    # ])

    # synonyms = [
    #   'amount', 'bill', 'cost', 'demand', 'discount', 'expenditure',
    #   'expense', 'fare', 'fee', 'pay', 'payment', 'premium',
    #   'tariff', 'valuation', 'worth', 'appraisal', 'assessment', 'barter', 'bounty',
    #   'ceiling', 'charge', 'compensation', 'disbursement', 'dues',
    #   'exaction', 'hire', 'outlay', 'prize', 'quotation', 'ransom', 'reckoning',
    #   'retail', 'reward', 'toll', 'tune', 'wages', 'wholesale',
    #   'appraisement'

    # ]
    synonyms = [
        'price', 'toll', 'cost', 'pay', 'worth', 'sell', 'charge', 'expend'
    ]

    # Whether the words to the left of NUM contain one of the synonyms.
    # (e.g. 'cost $ 30')
    words = common.flatten(
        [sentence[max(0, idx - window_width):idx] for idx in num_indices])
    return set(synonyms).intersection(set(words))
Example 10
 def _get_ngram(s, ngram_range):
     stop_words = set(['.', ',', '!', '?'])
     vocab_condition = lambda x: NUM in x and not stop_words.intersection(x)
     return flatten([[
         tuple(s[i:i + n]) for i in xrange(len(s) - n + 1)
         if vocab_condition(s[i:i + n])
     ] for n in xrange(ngram_range[0], ngram_range[1] + 1)])
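A self-contained illustration of the vocab condition above; the NUM placeholder value and the sentence are assumptions made for the demo.

NUM = '__NUM__'
stop_words = set(['.', ',', '!', '?'])
s = 'it costs $ __NUM__ .'.split()
ngrams = [tuple(s[i:i + n]) for n in range(1, 3) for i in range(len(s) - n + 1)]
kept = [g for g in ngrams if NUM in g and not stop_words.intersection(g)]
print(kept)  # [('__NUM__',), ('$', '__NUM__')]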
Example 11
def main(args):
    window_width = 3
    sents = [l.replace('\n', '').split(' ') for l in open(args.input_file)]
    indices_around_num = [[(max(0,
                                i - window_width), min(len(l),
                                                       i + window_width))
                           for i, x in enumerate(l) if x == NUM]
                          for l in sents]

    words = []
    for i, (idx, s) in enumerate(zip(indices_around_num, sents)):
        #print s, len(s)
        w = common.flatten([s[x[0]:x[1]] for x in idx])
        words.append(w)
        # for idxx in idx:
        #   print idxx, #print idxx, idx[idxx[0], idxx[1]],
        # print ''
    words = common.flatten(words)
    for x in sorted(Counter(words).items(), key=lambda x: -x[1])[:2000]:
        print x
    exit(1)
    #########################
    ### Count 'tokens' around NUM
    sents = common.flatten(sents)
    for x in sorted(Counter(sents).items(), key=lambda x: -x[1])[:10000]:
        print x
    exit(1)
    ########################

    vectorizer = NGramVectorizer(ngram_range=(1, 4), min_freq=5)
    ngrams = vectorizer.fit_transform(sents)

    def _get_ngram(s, ngram_range):
        stop_words = set(['.', ',', '!', '?'])
        vocab_condition = lambda x: NUM in x and not stop_words.intersection(x)
        return flatten([[
            tuple(s[i:i + n]) for i in xrange(len(s) - n + 1)
            if vocab_condition(s[i:i + n])
        ] for n in xrange(ngram_range[0], ngram_range[1] + 1)])

    ngram_range = (1, 4)
    ngrams = [_get_ngram(s, ngram_range) for s in sents]
    for ng, freq in Counter(flatten(ngrams)).most_common(10000):
        print ng, freq
Example 12
 def get_features(self, lines, input_filepath=None):
     return None, [
         common.flatten(
             common.get_ngram(s,
                              self.ngram_range[0],
                              self.ngram_range[1],
                              vocab_condition=self.vocab_condition))
         for s in lines
     ]
Example 13
 def output_training(self, features):
    counts = collections.Counter(common.flatten(features)).items()
    counts = sorted([(k, v) for k, v in counts
                     if not self.config.min_freq or v >= self.config.min_freq],
                    key=lambda x: -x[1])
   if self.config.vocab_size:
     counts = counts[:self.config.vocab_size]
   pickle.dump(counts, open(self.vocab_path, 'wb'))
   with open(self.vocab_path + '.txt', 'w') as f:
     for k,v in counts:
        l = '%s\t%s\n' % (" ".join(k), str(v))
       f.write(l)
Example 14
 def create_vocab(self, source, vocab_size=0):
      '''
      Args:
       - source: List of words.
      '''
      rev_vocab, freq = (zip(*collections.Counter(source).most_common())
                         if source else ([], None))
     rev_vocab = common.flatten([self.tokenizer(w) for w in rev_vocab])
     if rev_vocab and type(rev_vocab[0]) == list:
         rev_vocab = common.flatten(rev_vocab)
     rev_vocab = OrderedSet(self.start_vocab + list(rev_vocab))
     if vocab_size:
         rev_vocab = OrderedSet(
             [w for i, w in enumerate(rev_vocab) if i < vocab_size])
     vocab = collections.OrderedDict()
     for i, t in enumerate(rev_vocab):
         vocab[t] = i
     return vocab, rev_vocab
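The Counter + ordered-dedup pattern above, in a dependency-free toy form; OrderedDict.fromkeys stands in for OrderedSet, and the start vocab values are assumptions.

import collections

source = ['the', 'cat', 'the', 'dog']
start_vocab = ['<pad>', '<unk>']
rev_vocab, freq = zip(*collections.Counter(source).most_common())
rev_vocab = list(collections.OrderedDict.fromkeys(start_vocab + list(rev_vocab)))
vocab = collections.OrderedDict((t, i) for i, t in enumerate(rev_vocab))
# vocab -> {'<pad>': 0, '<unk>': 1, 'the': 2, 'cat': 3, 'dog': 4}
# (ties in frequency keep their first-seen order in recent Pythons)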
Example 15
 def preprocess(self, df):
   data = []
   for x in df.values:
     d = self.preprocess_dialogue(x, context_max_len=self.context_max_len)
     if d:
       data.append(d)
   data = common.flatten(data)
   dialogues, acts, emotions, speaker_changes, topics = list(zip(*data))
    contexts, responses, speaker_changes = zip(
        *[(d[:-1], d[-1], sc[:-1])
          for d, sc in zip(dialogues, speaker_changes) if sc[-1]])
   return contexts, responses, speaker_changes
Example 16
 def vec2tokens(self, vectors):
     tokens = []
     current_dim = 0
     for v in self.vectorizers:
         size = v.size
         vecs = vectors[:, current_dim:current_dim + size]
         current_dim += size
         tokens.append(v.vec2tokens(vecs))
     res = [common.flatten(t) for t in zip(*tokens)]
     return res
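A toy illustration of the zip/flatten recombination at the end of vec2tokens; the per-vectorizer token lists below are invented.

tokens = [[['a1', 'a2'], ['b1']],   # tokens from vectorizer A, per row
          [['x'], ['y', 'z']]]      # tokens from vectorizer B, per row
rows = [[t for part in row for t in part] for row in zip(*tokens)]
print(rows)  # [['a1', 'a2', 'x'], ['b1', 'y', 'z']]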
Example 17
 def init_vocab(self, emb_configs, vocab_size=0):
     start_vocab = START_VOCAB
     # if self.tokenizer.lowercase:
     #   start_vocab = [x.lower for x in lowercase]
     pretrained = [
         self.load_vocab(c['path'], c['format'] == 'vec')
         for c in emb_configs
     ]
     rev_vocab = common.flatten([e.keys() for e in pretrained])
     rev_vocab = OrderedSet(
         start_vocab +
         [self.tokenizer(w, flatten=True)[0] for w in rev_vocab])
     if vocab_size:
         rev_vocab = OrderedSet(
             [w for i, w in enumerate(rev_vocab) if i < vocab_size])
     vocab = collections.OrderedDict()
     for i, t in enumerate(rev_vocab):
         vocab[t] = i
     embeddings = [
         common.flatten([emb[w] for emb in pretrained]) for w in vocab
     ]
     embeddings = np.array(embeddings)
     return vocab, rev_vocab, embeddings
Example 18
 def create_vocab(self, ngrams):
     vocab = collections.Counter(common.flatten(ngrams))
     vocab = sorted(
         [(v, vocab[v])
          for v in vocab if not self.min_freq or vocab[v] >= self.min_freq],
         key=lambda x: -x[1])
     if self.vocab_size:
         vocab = vocab[:self.vocab_size]
     self.vocab = [v[0] for v in vocab]
     self.rev_vocab = collections.OrderedDict([
         (v, i) for i, v in enumerate(self.vocab)
     ])
     self._save_vocab()
Example 19
  def preprocess_dialogue(self, line, context_max_len=0, split_turn=False):
    idx, dialogue, act, emotion, topic = line
    dialogue = [self.preprocess_turn(x.strip(), split_turn) 
                for x in dialogue.split(_EOU) if x.strip()]
    act = [[int(a) for _ in xrange(len(d))]
           for a, d in zip(act.split(), dialogue)]
    emotion = [[int(e) for _ in xrange(len(d))]
               for e, d in zip(emotion.split(), dialogue)]
    # True when a speaker starts his/her turn, otherwise False.
    speaker_change = [[i == 0 for i in xrange(len(d))] for d in dialogue]

    dialogue = common.flatten(dialogue)
    act = common.flatten(act)
    emotion = common.flatten(emotion)
    speaker_change = common.flatten(speaker_change)

    # The length of the dialogue and its labels must be same.
    if len(set([len(dialogue), len(act), len(emotion)])) == 1:
      # The maximum length of a dialogue is context_max_len + 1 (response).
      dialogue_max_len = context_max_len + 1 if context_max_len else 0
      if not dialogue_max_len or len(dialogue) < dialogue_max_len:
        return [(dialogue, act, emotion, speaker_change, topic)]
      else: # Slice the dialogue into overlapping sub-dialogues.
        res = common.flatten(
            [[(dialogue[i:i+dlen], act[i:i+dlen], emotion[i:i+dlen],
               speaker_change[i:i+dlen], topic)
              for i in xrange(len(dialogue)+1-dlen)]
             for dlen in range(2, dialogue_max_len+1)])
        return res
    else:
      return None
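A toy version of the dialogue-slicing comprehension at the end of preprocess_dialogue, using plain strings instead of the (dialogue, act, emotion, ...) tuples.

dialogue = ['u1', 'u2', 'u3', 'u4']
dialogue_max_len = 3
windows = [dialogue[i:i + dlen]
           for dlen in range(2, dialogue_max_len + 1)
           for i in range(len(dialogue) + 1 - dlen)]
print(windows)
# [['u1', 'u2'], ['u2', 'u3'], ['u3', 'u4'], ['u1', 'u2', 'u3'], ['u2', 'u3', 'u4']]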
Example 20
def extract(input_texts):  # Deprecated
    # Codes for expression extraction (this is to be done after clustering?)
    ins_count = 0
    showed_list = []
    # Assumes input_texts is a list of spaCy docs; in the original, doc and i
    # were referenced without being defined in this deprecated helper.
    for i, doc in enumerate(input_texts):
        idx_expression = extract_expression(doc)
        if idx_expression and idx_expression not in showed_list:
            print "<L%d>\t" % i
            flattened_indice = list(set(common.flatten(idx_expression)))
            print 'Original sentence:\t',
            common.print_colored([t.text for t in doc], flattened_indice, 'red')
            print 'POS list         :\t',
            common.print_colored([t.pos_ for t in doc], flattened_indice, 'blue')
            print 'Expressions      :\t',
            print [(" ".join([doc[k].text for k in indices]), indices[0], indices[-1])
                   for indices in idx_expression]
            showed_list.append(idx_expression)
            ins_count += 1
    return ins_count
Example 21
 def get_features(self, lines, input_filepath=None):
     docs = create_spacy(lines, input_filepath)
     indices = []
     features = []
     feature_f = self.subtree2str
     for i, d in enumerate(docs):
          # Get features per NUM token in a line: [(token_idx0, subtrees0), ...]
         offset = 0
         feature = []
         for s in d.sents:
             feature.append([(offset + idx, [feature_f(st) for st in sts])
                             for idx, sts in self.trace(s)])
             offset += len(s)
         feature = common.flatten(feature)
         idx, feature = zip(*feature) if feature else ((-1, ), ([], ))
         indices += [(i, j) for j in idx]
         features += list(feature)
     assert len(indices) == len(features)
     return indices, features
Example 22
def get_ngram_matches(line, feature_scores):
  # feature_scores: default_dict[ngram] = score
  ngram_length = set([len(k) for k in feature_scores.keys()])
  min_n = min(ngram_length)
  max_n = max(ngram_length)
  res_spans = []
  if type(line) == str:
    line = line.split(' ')
  test_sent_ngrams = common.flatten(
      common.get_ngram(line, min_n, max_n, vocab_condition=VOCAB_CONDITION))
  possible_expr = list(set(feature_scores.keys()).intersection(test_sent_ngrams))
  possible_expr = sorted([(e, feature_scores[e]) for e in possible_expr], key=lambda x:-x[1])
  possible_expr = [e[0] for e in possible_expr]
  spans = []

  for expr in possible_expr:
    new_spans = common.get_ngram_match(line, expr)
    # Check that the newly acquired spans don't overlap with spans of higher priority.
    new_spans = [ns for ns in new_spans if common.no_overlaps(spans, ns)]
    spans.extend(new_spans)
  return spans
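Examples 8 and 22 both rely on common.no_overlaps, which is not shown in this listing. A plausible definition, treating spans as inclusive (start, end) index pairs, is sketched below; this is an assumption, not the original helper.

def no_overlaps(existing_spans, new_span):
  new_s, new_e = new_span
  for s, e in existing_spans:
    if new_s <= e and s <= new_e:  # the two ranges intersect
      return False
  return True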
Example 23
  def load_data(self):
    self.load = True
    sys.stderr.write('Loading dataset from %s ...\n' % (self.path))
    df = pd.read_csv(self.path, nrows=self.max_lines)

    sys.stderr.write('Preprocessing ...\n')
    contexts, responses, speaker_changes = self.preprocess(df)

    if not self.wbase and not self.cbase:
      raise ValueError('Either \'wbase\' or \'cbase\' must be True.')

    self.speaker_changes = [self.sc_vocab.sent2id(sc) for sc in speaker_changes]

    # Separate contexts and responses into words (or chars), and convert them into their IDs.
    self.original = common.dotDict({})
    self.symbolized = common.dotDict({})

    if self.wbase:
      self.original.w_contexts = [[self.w_vocab.tokenizer(u) for u in context] 
                                  for context in contexts]
      self.symbolized.w_contexts = [[self.w_vocab.sent2id(u) for u in context] 
                                    for context in self.original.w_contexts]
    else:
      self.original.w_contexts = [None for context in contexts] 
      self.symbolized.w_contexts = [None for context in contexts] 

    if self.cbase:
      self.original.c_contexts = [[self.c_vocab.tokenizer(u) for u in context] 
                                  for context in contexts]

      self.symbolized.c_contexts = [[self.c_vocab.sent2id(u) for u in context] 
                                    for context in self.original.c_contexts]
    else:
      self.original.c_contexts = [None for context in contexts]
      self.symbolized.c_contexts = [None for context in contexts]
    self.original.responses = [self.w_vocab.tokenizer(r) for r in responses]
    self.symbolized.responses = [self.w_vocab.sent2id(r) for r in responses]

    responses = self.symbolized.responses
    w_contexts = self.symbolized.w_contexts 
    self.texts = common.flatten(w_contexts) + list(responses)
Example 24
    def init_vocab(self, emb_configs, vocab_size=0):
        # Combine specified pre-trained embeddings.
        pretrained = [
            self.load_vocab(c['path'],
                            skip_first=c['skip_first'],
                            vocab_size=vocab_size) for c in emb_configs
        ]
        rev_vocab = common.flatten([e.keys() for e in pretrained])
        rev_vocab = OrderedSet(self.start_vocab + list(rev_vocab))

        if vocab_size:
            rev_vocab = OrderedSet([
                w for i, w in enumerate(rev_vocab)
                if i < vocab_size + len(self.start_vocab)
            ])
        vocab = collections.OrderedDict()
        for i, t in enumerate(rev_vocab):
            vocab[t] = i
        #embeddings = [common.flatten([emb[w] for emb in pretrained]) for w in vocab]
        embeddings = [np.array([emb[w] for w in vocab]) for emb in pretrained]
        embeddings = np.concatenate(embeddings, axis=-1)
        return vocab, rev_vocab, embeddings
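The embedding concatenation at the end of init_vocab, shown on toy data; the pretrained dicts and their dimensions are invented for the demo.

import numpy as np

vocab = ['<pad>', 'cat']
pretrained = [{'<pad>': [0.0, 0.0], 'cat': [1.0, 2.0]},  # 2-dim embeddings
              {'<pad>': [0.5], 'cat': [3.0]}]            # 1-dim embeddings
embeddings = [np.array([emb[w] for w in vocab]) for emb in pretrained]
embeddings = np.concatenate(embeddings, axis=-1)
print(embeddings.shape)  # (2, 3)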
Example 25
 def get_words(self, train_data_path):
   df = pd.read_csv(train_data_path)
   data = self.dataset_type.preprocess(df, context_max_len=0)
   dialogues, _, _, _ = list(zip(*data))
    words = common.flatten([utterance.split()
                            for utterance in common.flatten(dialogues)])
   return words
Example 26
def contain_currency_symbol_around_num(sentence, num_indices, window_width=4):
    words = common.flatten(
        [sentence[max(0, idx - window_width):idx] for idx in num_indices])
    return set(c_symbols).intersection(set(words))
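A worked example of the left-context window shared by Examples 9 and 26; the sentence and indices are invented, and c_symbols is assumed to contain '$'.

sentence = 'the ticket cost $ __NUM__ total'.split()
num_indices = [4]        # position of __NUM__
window_width = 4
words = [w for idx in num_indices
         for w in sentence[max(0, idx - window_width):idx]]
print(words)  # ['the', 'ticket', 'cost', '$'] -> contains the currency symbol '$'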
Example 27
def contain_currency_name_around_num(sentence, num_indices, window_width=4):
    # Whether words at the right side of NUM contain one of synonyms.
    # (e.g. '30 dollars')
    words = common.flatten(
        [sentence[idx + 1:idx + 1 + window_width] for idx in num_indices])
    return set(c_names).intersection(set(words))
Example 28
 def separate_concatenated_tokens(tokens): 
   return common.flatten([x.split('|') for x in tokens])
Example 29
 def create_vocab(self, vocab_path, texts, vocab_size=0):
   texts = common.flatten([self.tokenizer.word2chars(word) for word in texts])
   return WordVocabulary.create_vocab(self, vocab_path, texts, vocab_size=vocab_size)
Example 30
 def _get_weighted_frequency(feats):
   scores = collections.defaultdict(int)
   for k, v in common.flatten(feats):
     scores[k] += v
   return sorted([(k, v*len(k)) for k,v in scores.items()], key=lambda x: -x[1])
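A self-contained rerun of the weighting in _get_weighted_frequency: counts are aggregated per n-gram and then multiplied by the n-gram length. The input shape (a list of (ngram, count) pairs per line) is an assumption.

import collections

feats = [[(('$', '__NUM__'), 2), (('__NUM__',), 3)],
         [(('$', '__NUM__'), 1)]]
scores = collections.defaultdict(int)
for k, v in (pair for line in feats for pair in line):  # flatten one level
  scores[k] += v
weighted = sorted([(k, v * len(k)) for k, v in scores.items()], key=lambda x: -x[1])
print(weighted)  # [(('$', '__NUM__'), 6), (('__NUM__',), 3)]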