def generate_layer(self, desc):
    """
    Generate a tf.Layer object based on a description

    Parameters
    ----------
    desc : dict { 'type' : str, 'units' : int, 'activation' : tf.nn.activation, 'name' : str }

    Returns
    -------
    tf.Layer
    """
    if desc['type'] == 'dense':
        output = tf.layers.Dense(units=desc.get('units'),
                                 activation=desc.get('activation'),
                                 name=desc.get('name'))
        return output
        """
        output = self.dense(units=desc.get('units'),
                            activation=desc.get('activation'),
                            name=desc.get('name'))
        return output
        """
    elif desc['type'] == 'lstm':
        output = self.LSTM(units=desc.get('units'),
                           activation=desc.get('activation'),
                           name=desc.get('name'))
        return output

    else:
        err([], {'ex': "No Layer type selected!"})
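# A hedged usage sketch (not from the original source): the dict below only
# illustrates the shape of 'desc' this method expects; the unit count, the
# activation, and the layer name are hypothetical.
#
#   desc  = {'type': 'dense', 'units': 128, 'activation': tf.nn.relu, 'name': 'hidden_1'}
#   layer = self.generate_layer(desc)   # returns a tf.layers.Dense object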
def predict(self, X, binarizer=None):
    """
    Return the prediction for each input line.

    Parameters
    ----------
    X : array of str, Sentence objects, ints, floats, etc

    Returns
    -------
    pandas.Series
    """
    if X is None:
        return None

    if self.get('vocab'):
        X = self.embed(X)

    preds = self.model.predict(X)
    if self.get('binarize_predictions'):
        preds = self.binarize(preds)
    preds = pandasize(preds)

    if not isinstance(preds, pd.Series):
        err([], {'ex': "ERROR: preds not a pd.Series."})

    return preds
def replace_charByChar(text, default):
    """
    Replace strange characters in a string char-by-char.  Takes str, Returns str
    """
    verbose = False
    try:
        final = ""
        for t in text:
            if re.search(u'[ 0-9a-zA-ZñÑ\.,\'\?\!\"\:\;\&\$\%\@\|\_]', t):
                final += t                  # Keep the char
                if verbose:
                    arr = [final]
                    err([arr, [t]])
            elif re.search(u'[—\-]', t):
                final += '-'                # Replace with normalized dash
                if verbose:
                    arr = [final]
                    err([arr, [t]])
            else:                           # Replace the char
                final += default
                if verbose:
                    arr = [final]
                    err([arr, [t]])

        text = final
        if verbose:
            arr = [text]
            err([arr])

        return text

    except Exception as e:
        err([], {'exception': e, 'exit': True})
def char_span(self, start, end):
    """
    Get the span from start/end char indices

    Parameters
    ----------
    start : int
    end : int

    Returns
    -------
    spacy.Span
    """
    verbose = False
    span = self.spacy_doc.char_span(start, end)
    if span is None:
        start_token = self.get_token_at_char_index(start)
        end_token = self.get_token_at_char_index(end)
        if verbose:
            err([start_token.i, end_token.i, end_token.text])
        if start_token.i == end_token.i:
            span = self.spacy_doc[start_token.i:start_token.i + 1]
        else:
            span = self.spacy_doc[start_token.i:end_token.i + 1]   # Adding 1 here very important

    return span
def split_words(text):
    """
    Poor man's tokenization
    """
    verbose = False
    words = text.split()

    ready = []
    for word in words:
        if re.search(r'[a-zA-Z]-[a-zA-Z]', word):   # Handle hyphens
            parts = word.split('-')
            ready.append(parts[0])
            for part in parts[1:]:    # Skip parts[0], which was already appended
                ready.append('-')
                ready.append(part)
        else:
            ready.append(word)
    if verbose:
        err([ready])

    words = ready
    ready = []
    for word in words:
        if re.search(r"\w'\w+$", word):             # Handle apostrophes
            starting = re.sub(r"'(\w+)$", '', word)
            ending = re.sub(r"^.*'(\w+)$", r'\1', word)
            ready.extend([starting, "'" + ending])
        else:
            ready.append(word)
    if verbose:
        err([ready])

    words = ready
    return words
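# Hedged illustration (not from the original source) of what the two passes in
# split_words() produce, assuming the hyphen and apostrophe regexes behave as
# written above:
#
#   split_words("I don't know")        # -> ['I', 'don', "'t", 'know']
#   split_words("a well-known fact")   # -> ['a', 'well', '-', 'known', 'fact']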
def get_binary_losses(preds, labels, verbose=False):
    """
    For some set of preds and labels, assumed to be binary, get the overall L1 loss for each set of labels
    """
    # Some tensors to use
    zeros = torch.zeros_like(labels)
    ones = torch.ones_like(labels)
    mask = binarize(labels).detach()
    antimask = binarize(labels, options={'reverse': True}).detach()

    preds_zero = antimask * preds
    preds_one = mask * preds
    labels_zero = antimask * labels
    labels_one = mask * labels

    if verbose:
        print("PREDS: ", preds.cpu().data.numpy().tolist()[100:200])
        print("PREDS 0: ", preds_zero.cpu().data.numpy().tolist()[100:200])
        print("PREDS 1: ", preds_one.cpu().data.numpy().tolist()[100:200])
        print("LABELS: ", labels.cpu().data.numpy().tolist()[100:200])
        print("LABELS 0: ", labels_zero.cpu().data.numpy().tolist()[100:200])
        print("LABELS 1: ", labels_one.cpu().data.numpy().tolist()[100:200])

    zeroloss = L1_LOSS(preds_zero, labels_zero)
    oneloss = L1_LOSS(preds_one, labels_one)
    if verbose:
        err([zeroloss, oneloss])

    return zeroloss, oneloss
def squash_verbose(T):
    """
    Normalize length of vector to the range [0,1] without altering direction.
    """
    print('T:', T)
    if not T.sum().gt(0):
        return T

    sq = T.pow(2)
    if not sq.sum().gt(0):
        return T
    print('sq:', sq)

    sqsum = sq.sum(-1, keepdim=True)
    if not sqsum.sum().gt(0):
        err()
        exit()
    print('sqsum:', sqsum)

    denom = 1 + sqsum
    print('denom:', denom)
    scale = sqsum / denom
    print('scale:', scale)
    unitvec = T / torch.sqrt(sqsum)
    print('unitvec:', unitvec)
    out = scale * unitvec

    return out
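# For reference, a hedged restatement (not from the original source) of the
# math in squash_verbose(): with s = sum(T_i^2) along the last dimension,
#
#     out = (s / (1 + s)) * (T / sqrt(s))
#
# so the direction of T is preserved while its length is mapped into [0, 1).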
def reduce(self, layer, arr):
    """
    Use 'layer' to connect elements in the array 'arr' of Tensors, generating a binary tree (consumes two at a time).

    If there are 3 input tensors in 'arr', 'layer' gets created having an input shape twice that of each element
    in 'arr'.  In the graph, there is now an output_1 connected through 'layer' to arr[1], arr[2].  After the next
    iteration, there is now an output_2 connected through 'layer' to arr[0], output_1:

        arr[0]   arr[1]   arr[2]
          |         |        |
          |          --layer--
          |              |
           -----layer-----
                  |

    In that way, it proceeds from end to start.
    """
    if len(arr) == 1:       # If only one tensor, just send it back unchanged
        err(['Unchanged Tensor:', arr])
        return arr

    output = arr[-1]        # Last tensor in arr, for example: arr[2]
    i = len(arr)
    j = i - 1               # Index of the last element; decremented below before each use
    while j > 0:
        j -= 1
        stacked = tf.concat([arr[j], output], axis=1)
        output = layer.generate(stacked)

    return output
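# Hedged trace (not from the original source) of the loop in reduce() for
# len(arr) == 3:
#
#   output = arr[2]
#   output = layer.generate(tf.concat([arr[1], output], axis=1))   # output_1
#   output = layer.generate(tf.concat([arr[0], output], axis=1))   # output_2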
def get_sentences(doc):
    """
    Even after adding a custom func to the spacy pipeline, the sentence tokenization still gets messed up.
    Use this for a final split.
    """
    verbose = False
    starts = set([])
    sentences = []
    this_sentence = set([])
    """
    for offsets in find_sentence_offsets(doc):
        if verbose:
            err([offsets])
        start, end = offsets
        starts.add(start)
    """
    for i, token in enumerate(doc[:-1]):
        # if token.i in starts:
        if token.is_sent_start:
            if verbose:
                err(["START", token])
            if len(this_sentence) > 0:
                sentences.append(doc[min(this_sentence):max(this_sentence) + 1])   # +1: the slice end index is exclusive
            this_sentence = set([i])
        else:
            if verbose:
                err([token])
            this_sentence.add(i)

    if len(this_sentence) > 0:
        sentences.append(doc[min(this_sentence):max(this_sentence) + 1])

    return sentences
def sanity_check(spacy_doc):
    """
    To use for checking a parse
    """
    verbose = False
    for i, token in enumerate(spacy_doc):
        if token.is_sent_start:
            if verbose:
                err(["START", token])
        else:
            if verbose:
                err([token])
def parse(text):
    """
    Generate a detailed dependency parse of some text.
    """
    spacy_doc, ner, vocab = generate_spacy_data(text)   # Parse with spacy, get NER
    spacy_sentences = list(spacy_doc.sents)
    trees = []
    for i, sen in enumerate(spacy_sentences):
        err([sen.root])

    return spacy_doc
def naked_words(text, verbose=False):
    """
    Split text into words and strip off punctuation and capitalization
    """
    words = split_words(text)
    out = []
    if not isinstance(words, list):
        words = [words]
    for word in words:
        if len(word):
            out.append(simplify_for_distance(word))
    if verbose:
        err([text, words, out])

    return out
def combine_with_previous(previous, current):
    """
    Correct for some errors made by the spaCy sentence splitter

    Parameters
    ----------
    previous : spaCy Span
    current : spaCy Span

    Returns
    -------
    bool
    """
    verbose = False
    if verbose:
        err([previous.text, previous.end - previous.start, current.text, current.end - current.start])

    # This sentence too short and not capitalized, or previous has a paren
    if current.end - current.start < 3 and (current.text[0].islower() or re.search(r"\)", previous.text)):
        if verbose:
            err([[current.text]])
        return True

    # This sentence moderately short and has a close paren
    if current.end - current.start < 7 and re.search(r"\)", current.text):
        if verbose:
            err([[current.text]])
        return True

    # Previous sentence too short and is capitalized
    if previous.end - previous.start < 3 and previous.text[0].isupper():
        if verbose:
            err([[previous.text]])
        return True

    # Previous sentence had no ending punctuation
    if not (re.search(r"[\.?!]$", previous.text)
            or re.search(r"[\.?!]\S$", previous.text)
            or re.search(r"[\.?!]\S\S$", previous.text)
            or re.search(r"[\.?!]\s$", previous.text)
            or re.search(r"[\.?!]\s\s$", previous.text)):
        if verbose:
            err([[previous.text]])
        return True

    return False
def prefix_search(line, index='default'):
    """
    Run a phrase-prefix query for 'line' against the '_all' field of an Elasticsearch index
    """
    err([line])
    body = {
        "query": {
            "match": {
                "_all": {
                    "query": line,
                    "type": "phrase_prefix",
                    "max_expansions": 100
                }
            }
        }
    }
    res = es.search(index=index, body=body)

    return res['hits']['hits']
def hasnan(T):
    """
    Determine if a tensor has a NaN or Inf in it
    """
    s = T.data.sum()
    if s != s:             # NaN is never equal to itself
        return True
    if s == get_inf():
        return True
    if s == get_neginf():
        return True

    T = T.data.cpu()
    result = (T != T).numpy()
    if result.sum():
        err([s])
        return True

    return False
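# Hedged usage sketch (not from the original source), assuming T holds float data:
#
#   hasnan(torch.tensor([1.0, 2.0]))            # -> False
#   hasnan(torch.tensor([1.0, float('nan')]))   # -> True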
def text_to_char_offsets(self, text, start_char=0):
    """
    Search in the text of this Document for the first substring matching text

    Parameters
    ----------
    text : str
    start_char : index

    Returns
    -------
    pair of int
        start/end char offsets
    """
    verbose = False
    words = naked_words(text)              # Words to find
    dwords = self.get_text().split(' ')    # Document words
    offsets = None
    index = 0
    if verbose:
        err([text, words, dwords])

    for i, dw in enumerate(dwords):
        dw = simplify_for_distance(dw)
        if re.search(words[0], dw, flags=re.I):
            matched = self.matching_words_from_start(words, dwords[i:])
            if len(matched) == len(words):
                matched_phrase = ' '.join(matched)
                end_index = index + len(matched_phrase)
                offsets = (index, end_index)
                break
        if i == 0:
            index += len(dw)
        else:
            index += 1 + len(dw)

    if offsets is None:
        err([], {'ex': "No best span for [%s] in:\n%s" % (text, self.get_text())})

    return offsets   # pair of (int, int)
def word_search(file, index='default'):
    """
    Search for each line of file in the index

    Parameters
    ----------
    index : str
        name of index where things are to be stored

    file : str
        path to file where lines are to be read
    """
    verbose = False
    if file is None:
        err(['word_search requires file'])
        exit()

    seen = set()
    iterator = iter_file(file)
    while True:
        try:
            line = next(iterator).rstrip()
            name = None
            try:
                e = line.split('|')
                name = e[0]
            except:
                name = line
            name = normalize(name)

            for word in name.split(' '):
                if len(word) < 3:
                    continue
                if word in seen:
                    pass               # Only do each one once
                else:
                    seen.add(word)
                    search(word, {'prefix': True})

        except StopIteration:
            break
def loss(self, X, Y, verbose=False):
    """
    Loss function based on L1 but magnifying or diminishing the loss based on the relative occurrences of that class

    X : tensor [float, float]  (probability of each of two classes)
    Y : tensor int  (which class, 0 or 1)
    """
    X = X[:, 1]
    Yd = Y.double()                             # Naturally a one-mask
    zero_mask = torch.abs(self.one - Yd)        # 1 for every zero element in Y
    zero_weight = self.zero_bias * zero_mask    # Dilation applied to zero class
    one_weight = self.one_bias * Yd             # Dilation for class 1
    weight = zero_weight + one_weight

    L = torch.abs(Y.double() - X)               # Unweighted loss
    Lw = weight * L                             # Class-adjusted loss
    Lfinal = torch.mean(Lw)                     # + range_loss + sum_loss
    if verbose:
        err(["Lfinal:", Lfinal])

    return Lfinal
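# Hedged worked example (not from the original source), with hypothetical
# biases self.zero_bias = 0.5 and self.one_bias = 2.0:
#
#   Y = [1, 0],  X[:, 1] = [0.9, 0.2]
#   zero_weight = 0.5 * [0, 1] = [0.0, 0.5]
#   one_weight  = 2.0 * [1, 0] = [2.0, 0.0]    ->  weight = [2.0, 0.5]
#   L  = |Y - X[:, 1]| = [0.1, 0.2]
#   Lw = weight * L    = [0.2, 0.1]            ->  Lfinal = mean(Lw) = 0.15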
def iter_training_samples_by_thresh(self, aorb, thresh):
    """
    Yield training samples above or below a threshold
    """
    if aorb == 'above':
        while True:
            for i, row in self.x_test.iterrows():
                y = self.y_test.loc[i]
                if y > thresh:
                    yield row, y

    elif aorb == 'below':
        while True:
            for j, row in self.x_test.iterrows():
                y = self.y_test.loc[j]
                if y <= thresh:
                    yield row, y

    else:
        err([], {'ex': "Unexpected"})
def generate_powerlaw_distro(w):
    """
    For some width 'w', generate a set of points following the desired kind of power-law distribution

    Using:  y = 1/x
    """
    x = 2   # The starting point
    end = 10
    interval = (end - x) / w
    output = []
    for i in range(w):
        output.append(1 / x)
        x += interval
    if len(output) > 256:
        err()
        exit()
    output = torchvar(output)

    return output
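# Hedged illustration (not from the original source) of the values built before
# the torchvar() conversion, for w = 4:
#
#   interval = (10 - 2) / 4 = 2, so x steps through 2, 4, 6, 8
#   output   = [1/2, 1/4, 1/6, 1/8] ≈ [0.5, 0.25, 0.167, 0.125]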
def preprocess(self, vocab):
    """
    Collapse the parse tree to a more simplified format where sensible.  Identify the verb nodes and find their
    theta roles.

    Parameters
    ----------
    vocab : dict { lemma string -> vector }
    """
    verbose = False
    if verbose:
        err()
    # self.agglomerate_verbs_preps(vocab)
    # self.agglomerate_compound_adj(vocab)   # Might raise the branching factor
    self.agglomerate_entities()              # Treat each entity as a single thing with a single vector (important), even though it might raise branching factor
    self.delegate_to_negations()             # Can lower the branching factor
    self.agglomerate_modifiers()             # Can lower the branching factor
    # self.agglomerate_twins()               # Can raise branching factor
    self.agglomerate_verbauxes()
    self.delegate_to_conjunctions()          # Can lower the branching factor
    self.agglomerate_idioms()
    if verbose:
        err()
    # self.analyze_trees()                   # For debugging
    self.embed(vocab)
    if verbose:
        err()
def squash_old(T):
    """
    Normalize length of vector to the range [0,1] without altering direction.
    """
    if not T.sum().gt(0):
        return T

    sq = T.pow(2)
    if not sq.sum().gt(0):
        return T

    sqsum = sq.sum(-1, keepdim=True)
    if not sqsum.sum().gt(0):
        err()
        exit()

    denom = 1 + sqsum
    scale = sqsum / denom
    unitvec = T / torch.sqrt(sqsum)
    out = scale * unitvec

    return out
def mongo_count(db_name='default', collection_name='default', host='localhost', port=27017, user=None, password=None, authSource=None):
    """
    Return the number of documents in a collection
    """
    client = get_mongo_client(db_name=db_name, collection_name=collection_name, host=host, port=port,
                              user=user, password=password, authSource=authSource)
    db = client[db_name]
    coll = db[collection_name]

    return coll.count()

    # Alternative method on mothballs:
    n = 0
    iterator = coll.find()
    try:
        for item in iterator:
            n += 1
            if n % 1000 == 0:
                print("n =", n)
    except StopIteration:
        pass
    except Exception as e:
        err([], {'exception': e})

    return n
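# Hedged usage sketch (not from the original source); the database and
# collection names are hypothetical:
#
#   n = mongo_count(db_name='mydb', collection_name='people', host='localhost', port=27017)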
def matching_words_from_start(self, awords, bwords):
    """
    Recursively find a sequence of words in bwords that matches awords, looking for each a-word within each b-word
    """
    verbose = False
    bword = simplify_for_distance(bwords[0])
    if verbose:
        err([awords[0], bword])

    if len(awords) == 1:   # Base Case
        if re.search(awords[0], bword, flags=re.I):
            if verbose:
                err()
            return [bwords[0]]
        else:
            if verbose:
                err()
            return []

    if re.search(awords[0], bword, flags=re.I):
        if verbose:
            err()
        matched = [bwords[0]]
        return matched + self.matching_words_from_start(awords[1:], bwords[1:])   # Recursion
    else:
        if verbose:
            err()
        return []
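# Hedged trace (not from the original source) of the recursion in
# matching_words_from_start(), assuming simplify_for_distance() leaves these
# words searchable as-is:
#
#   self.matching_words_from_start(['new', 'york'], ['New', 'York', 'City'])
#   -> ['New'] + self.matching_words_from_start(['york'], ['York', 'City'])
#   -> ['New', 'York']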
def delegate_to_conjunctions(self):
    """
    For the purpose of tree simplification (lower branching factor), and logical faithfulness, take conjunction
    arguments and bring them in under the conjunction node.
    """
    verbose = False
    altered = True
    while altered:
        altered = False
        if verbose:
            err([altered])
        for tree in self.trees:
            for node in tree.get_conjunctions():
                if verbose:
                    err([tree, "calling:", node])
                a = node.delegate_to_conjunction()
                if a:
                    altered = a   # Only switch if going to True
                if verbose:
                    err([tree, altered])
def tokenize(text):
    """
    Tokenize and do a couple extra things
    """
    verbose = False
    if verbose:
        err([text])
    final = []
    for word in tokenize_spacy(text):
        if verbose:
            err([word])
        final.extend(split_words(word))
    if verbose:
        err([final])

    return final
def set_sentence_starts(doc):
    """
    Adjust the elements in a spaCy Doc to record the sentence starts as output by sentence_fixer()

    This function is designed to be a spaCy pipeline element
    """
    verbose = False
    starts = set([])
    for offsets in find_sentence_offsets(doc):
        if verbose:
            err([offsets])
        start, end = offsets
        starts.add(start)

    for token in doc[:-1]:
        if token.i in starts:
            token.is_sent_start = True
            if verbose:
                err(["START", token])
        else:
            token.is_sent_start = False
            if verbose:
                err([token])

    return doc
def read_data(self, inputs):
    """
    Default function for reading data into a Dataset.

    Will accept one of:
        - file
        - list of files
        - dir

    Parameters
    ----------
    inputs : str or list of str
        path to a file or directory, or a list of file paths

    Returns
    -------
    array of DataFrame (4 of them: x_train, x_test, y_train, y_test)
    """
    if isinstance(inputs, list):
        if len(inputs) == 1:
            inputs = inputs[0]
        elif len(inputs) > 1:
            return default_read_data_files(inputs)
        else:
            err([], {'ex': "ERROR: zero-length array of inputs"})

    if isinstance(inputs, str):
        if os.path.isfile(inputs):
            return default_read_data_file(inputs)
        elif os.path.isdir(inputs):
            return default_read_data_dir(inputs)
        else:
            err([], {'ex': "ERROR: inputs neither file nor dir."})
    else:
        err([], {'ex': 'Unrecognized input type: %s' % type(inputs)})
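# Hedged usage sketch (not from the original source); the paths are hypothetical:
#
#   x_train, x_test, y_train, y_test = self.read_data('data/train.csv')
#   x_train, x_test, y_train, y_test = self.read_data(['data/a.csv', 'data/b.csv'])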
def torchtensor(X, ttype=TORCH_DOUBLE, requires_grad=False):
    """
    Converts X into a PyTorch Tensor

    Parameters
    ----------
    X : int, float, or torch.Tensor
    """
    if isinstance(X, torch.Tensor):         # If X is already a torch tensor, this just changes its type
        T = X
        if ttype == torch.DoubleTensor:     # float 64
            T = T.double()
        elif ttype == torch.FloatTensor:    # float 32
            T = T.float()
        elif ttype == torch.HalfTensor:     # float 16
            T = T.half()
        elif ttype == torch.ByteTensor:     # uint 8
            T = T.byte()
        elif ttype == torch.CharTensor:     # int 8
            T = T.char()
        elif ttype == torch.ShortTensor:    # int 16
            T = T.short()
        elif ttype == torch.IntTensor:      # int 32
            T = T.int()
        elif ttype == torch.LongTensor:     # int 64
            T = T.long()

    else:
        if isinstance(X, int) or isinstance(X, float):
            X = [X]

        if isinstance(X, list):
            T = ttype(X)

        elif scipy.sparse.issparse(X):
            X = X.todense().tolist()
            T = torchtensor(X, ttype=ttype, requires_grad=requires_grad)
            T = torch.squeeze(T)
            return T

            # Not using this part for now (too much of a demand on the hardware)
            ### SPARSE ##################################
            X = coo_matrix(X)
            values = X.data
            indices = np.vstack((X.row, X.col))
            i = torch.LongTensor(indices)
            v = torch.DoubleTensor(values)
            shape = X.shape
            if ttype == torch.DoubleTensor:     # float 64
                T = torch.sparse.DoubleTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.FloatTensor:    # float 32
                T = torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.HalfTensor:     # float 16
                T = torch.sparse.HalfTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.ByteTensor:     # uint 8
                T = torch.sparse.ByteTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.CharTensor:     # int 8
                T = torch.sparse.CharTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.ShortTensor:    # int 16
                T = torch.sparse.ShortTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.IntTensor:      # int 32
                T = torch.sparse.IntTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.LongTensor:     # int 64
                T = torch.sparse.LongTensor(i, v, torch.Size(shape)).to_dense()
            T = torch.squeeze(T)
            ################################################

        else:
            err()
            print(zzz)   # 'zzz' is undefined; this halts with a NameError on unsupported input types

    try:
        T.requires_grad = requires_grad
    except:
        pass

    return T
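# Hedged usage sketch (not from the original source):
#
#   torchtensor([1, 2, 3], ttype=torch.FloatTensor)      # list -> float32 tensor
#   torchtensor(torch.ones(4), ttype=torch.LongTensor)   # existing tensor cast to int64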
    if torch.cuda.is_available():
        # torch.cuda.manual_seed_all(12345)
        INF = INF.cuda()
        negINF = negINF.cuda()
        L1_LOSS = L1_LOSS.cuda()
        NEG_ONE = NEG_ONE.cuda()
        TORCH_TWO = TORCH_TWO.cuda()
        TORCH_E = TORCH_E.cuda()
        LEAKY_RELU = LEAKY_RELU.cuda()
        TORCH_DILATION = TORCH_DILATION.cuda()

except Exception as e:
    TORCH_DOUBLE = None
    TORCH_LOSS = object
    raise err([], {'exception': e, 'level': 0})

##############################################################################################
# OBJECTS

class PyTorchModule(nn.Module, Object):
    """
    Basic nn.Module, with memory-efficient saving/loading
    """
    def __init__(self, options={}):
        """
        Instantiate the object
        """
        super(PyTorchModule, self).__init__()
        Object.__init__(self)