Code example #1
    def generate_layer(self, desc):
        """
        Generate a tf.Layer object based on a description

        Parameters
        ----------
        desc : dict
            { 'type' : str,
              'units' : int,
              'activation' : tf.nn.activation,
              'name' : str }

        Returns
        -------
        tf.Layer

        """
        if desc['type'] == 'dense':
            output = tf.layers.Dense(units=desc.get('units'), activation=desc.get('activation'), name=desc.get('name'))
            return output
            """
            output = self.dense(units=desc.get('units'), activation=desc.get('activation'), name=desc.get('name'))
            return output
            """

        elif desc['type'] == 'lstm':
            output = self.LSTM(units=desc.get('units'), activation=desc.get('activation'), name=desc.get('name'))
            return output
            
        else:
            err([], {'ex':"No Layer type selected!"})
Code example #2
    def predict(self, X, binarizer=None):
        """
        Return the prediction for each input line.

        Parameters
        ----------
        X : array of str, Sentence objects, ints, floats, etc

        Returns
        -------
        pandas.Series

        """
        if X is None:
            return None
        if self.get('vocab'):
            X = self.embed(X)

        preds = self.model.predict(X)

        if self.get('binarize_predictions'):
            preds = self.binarize(preds)

        preds = pandasize(preds)
        if not isinstance(preds, pd.Series):
            err([], {'ex': "ERROR: preds not a pd.Series."})

        return preds
Code example #3
def replace_charByChar(text, default):
    """
    Replace strange characters in a string, character by character.  Takes a str, returns a str.
    """
    verbose = False
    try:
        final = ""
        for t in text:
            if re.search(u'[ 0-9a-zA-ZñÑ\.,\'\?\!\"\:\;\&\$\%\@\|\_]', t):
                final += t  # keep the char
                if verbose:
                    arr = [final]
                    err([arr, [t]])

            elif re.search(u'[—\-]', t):
                final += '-'  # Replace with normalized dash
                if verbose:
                    arr = [final]
                    err([arr, [t]])

            else:  # Replace the char
                final += default
                if verbose:
                    arr = [final]
                    err([arr, [t]])

        text = final
        if verbose:
            arr = [text]
            err([arr])

        return text
    except Exception as e:
        err([], {'exception': e, 'exit': True})
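
For reference, the same whitelist idea can be written as two re.sub calls. This is a minimal standalone sketch, not the project's API: replace_strange and ALLOWED are illustrative names, and the character class simply mirrors the whitelist above (plus the normalized dash).

import re

ALLOWED = r"[ 0-9a-zA-ZñÑ.,'?!\":;&$%@|_\-]"        # whitelisted chars, incl. the normalized dash

def replace_strange(text, default):
    text = re.sub(r"[—\-]", "-", text)              # normalize dashes first
    return re.sub(r"(?!%s)." % ALLOWED, default, text, flags=re.S)  # replace anything not whitelisted

print(replace_strange("Résumé — draft", "_"))       # -> R_sum_ - draft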
Code example #4
File: document.py  Project: grahammorehead/gmutils
    def char_span(self, start, end):
        """
        Get the span from start/end char indices

        Parameters
        ----------
        start : int

        end : int

        Returns
        -------
        spacy.Span

        """
        verbose = False
        span = self.spacy_doc.char_span(start, end)
        if span is None:
            start_token = self.get_token_at_char_index(start)
            end_token = self.get_token_at_char_index(end)
            if verbose:
                err([start_token.i, end_token.i, end_token.text])
            if start_token.i == end_token.i:
                span = self.spacy_doc[start_token.i:start_token.i + 1]
            else:
                span = self.spacy_doc[start_token.i:end_token.i + 1]  # Adding 1 here is very important

        return span
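
For orientation, this is what the underlying spacy_doc.char_span() call does on its own; the fallback above covers the None case. A minimal sketch, assuming the en_core_web_sm model is installed:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin is in Germany.")
print(doc.char_span(0, 6))   # Berlin  (offsets line up with a token boundary)
print(doc.char_span(0, 3))   # None    (mid-token offsets, hence the fallback above)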
Code example #5
File: nlp.py  Project: grahammorehead/gmutils
def split_words(text):
    """
    Poor man's tokenization
    """
    verbose = False
    words = text.split()

    ready = []
    for word in words:
        if re.search(r'[a-zA-Z]-[a-zA-Z]', word):  # Handle hyphens
            parts = word.split('-')
            ready.append(parts[0])
            for part in parts[1:]:   # parts[0] was already appended; keep a '-' token between the rest
                ready.append('-')
                ready.append(part)
        else:
            ready.append(word)
    if verbose: err([ready])
    words = ready

    ready = []
    for word in words:
        if re.search(r"\w'\w+$", word):  # Handle apostrophes
            starting = re.sub(r"'(\w+)$", '', word)
            ending = re.sub(r"^.*'(\w+)$", r'\1', word)
            ready.extend([starting, "'" + ending])
        else:
            ready.append(word)
    if verbose: err([ready])
    words = ready

    return words
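
An illustrative call, assuming split_words() above is in scope along with its module's imports (this is not output from the project's tests):

print(split_words("that isn't Bob's dog"))
# -> ['that', 'isn', "'t", 'Bob', "'s", 'dog']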
Code example #6
def get_binary_losses(preds, labels, verbose=False):
    """
    For some set of preds and labels, assumed to be binary, get the overall L1 loss for each set of labels
    """
    # Some tensors to use
    zeros = torch.zeros_like(labels)
    ones = torch.ones_like(labels)
    mask = binarize(labels).detach()
    antimask = binarize(labels, options={'reverse': True}).detach()

    preds_zero = antimask * preds
    preds_one = mask * preds
    labels_zero = antimask * labels
    labels_one = mask * labels

    if verbose:
        print("PREDS:    ", preds.cpu().data.numpy().tolist()[100:200])
        print("PREDS 0:  ", preds_zero.cpu().data.numpy().tolist()[100:200])
        print("PREDS 1:  ", preds_one.cpu().data.numpy().tolist()[100:200])
        print("LABELS:   ", labels.cpu().data.numpy().tolist()[100:200])
        print("LABELS 0: ", labels_zero.cpu().data.numpy().tolist()[100:200])
        print("LABELS 1: ", labels_one.cpu().data.numpy().tolist()[100:200])

    zeroloss = L1_LOSS(preds_zero, labels_zero)
    oneloss = L1_LOSS(preds_one, labels_one)

    if verbose:
        err([zeroloss, oneloss])

    return zeroloss, oneloss
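
A standalone sketch of the same per-class L1 split in plain PyTorch; binarize() and L1_LOSS are project helpers, so the mask and loss below are stand-ins:

import torch

preds  = torch.tensor([0.9, 0.2, 0.7, 0.1])
labels = torch.tensor([1.0, 0.0, 1.0, 0.0])

mask     = (labels > 0.5).float()                    # 1 where the label is 1
antimask = 1.0 - mask                                # 1 where the label is 0
l1 = torch.nn.L1Loss()

zeroloss = l1(antimask * preds, antimask * labels)   # error on the 0-labelled slots
oneloss  = l1(mask * preds, mask * labels)           # error on the 1-labelled slots
print(zeroloss.item(), oneloss.item())               # ~0.075 ~0.1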
Code example #7
def squash_verbose(T):
    """
    Normalize length of vector to the range [0,1] without altering direction.
    """
    print('T:', T)
    if not T.sum().gt(0):
        return T

    sq = T.pow(2)
    if not sq.sum().gt(0):
        return T
    print('sq:', sq)

    sqsum = sq.sum(-1, keepdim=True)
    if not sqsum.sum().gt(0):
        err()
        exit()
    print('sqsum:', sqsum)

    denom = 1 + sqsum
    print('denom:', denom)
    scale = sqsum / denom
    print('scale:', scale)
    unitvec = T / torch.sqrt(sqsum)
    print('unitvec:', unitvec)
    out = scale * unitvec

    return out
Code example #8
    def reduce(self, layer, arr):
        """
        Use 'layer' to connect elements in the array 'arr' of Tensors, generating a binary tree (consumes two at a time).

        If there are 3 input tensors in 'arr', the layer is created with an input shape twice that of each element in arr.  In the graph, there is now an
        output_1 connected through 'layer' to arr[1] and arr[2].  After the next iteration, there is an output_2 connected through 'layer' to arr[0] and output_1.

        arr[0]   arr[1]    arr[2]
          |        |         |
          |        ---layer---
          |             |
          -----layer-----
                 |

        In this way, it proceeds from the end of the array to the start.
        """
        if len(arr) == 1:  # if only one tensor, just send it back unchanged
            err(['Unchanged Tensor:', arr])
            return arr

        output = arr[-1]  # Contains last tensor in arr, for example: arr[2]
        i = len(arr)
        j = i - 1  # Decremented to the penultimate index (e.g. 1) at the top of the loop
        while j > 0:
            j -= 1
            stacked = tf.concat([arr[j], output], axis=1)
            output = layer.generate(stacked)

        return output
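
The right-to-left pairing is easier to see with a stand-in combiner in plain Python; combine below is just a placeholder for layer.generate(tf.concat(...)):

arr = ['t0', 't1', 't2']
combine = lambda a, b: "L(%s,%s)" % (a, b)   # stand-in for layer.generate on a concat

output = arr[-1]
for j in range(len(arr) - 2, -1, -1):        # same order as the while-loop above
    output = combine(arr[j], output)
print(output)                                # -> L(t0,L(t1,t2))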
Code example #9
File: nlp.py  Project: grahammorehead/gmutils
def get_sentences(doc):
    """
    Even after adding a custom func to the spacy pipeline, the sentence tokenization still gets messed up.  Use this for a final split.
    """
    verbose = False
    starts = set([])
    sentences = []
    this_sentence = set([])
    """
    for offsets in find_sentence_offsets(doc):
        if verbose:  err([offsets])
        start, end = offsets
        starts.add(start)
    """
    for i, token in enumerate(doc[:-1]):
        # if token.i in starts:
        if token.is_sent_start:
            if verbose: err(["START", token])
            if len(this_sentence) > 0:
                sentences.append(doc[min(this_sentence):max(this_sentence)])
            this_sentence = set([i])
        else:
            if verbose: err([token])
            this_sentence.add(i)

    if len(this_sentence) > 0:
        sentences.append(doc[min(this_sentence):max(this_sentence)])

    return sentences
Code example #10
File: nlp.py  Project: grahammorehead/gmutils
def sanity_check(spacy_doc):
    """
    To use for checking a parse
    """
    verbose = False
    for i, token in enumerate(spacy_doc):
        if token.is_sent_start:
            if verbose: err(["START", token])
        else:
            if verbose: err([token])
Code example #11
File: nlp.py  Project: grahammorehead/gmutils
def parse(text):
    """
    Generate a detailed dependency parse of some text.

    """
    spacy_doc, ner, vocab = generate_spacy_data(text)  # Parse with spacy, get NER
    spacy_sentences = list(spacy_doc.sents)
    trees = []
    for i, sen in enumerate(spacy_sentences):
        err([sen.root])

    return spacy_doc
Code example #12
def naked_words(text, verbose=False):
    """
    Split text into words and strip off punctuation and capitalization
    """
    words = split_words(text)
    out   = []
    if not isinstance(words, list):
        words = [words]
    for word in words:
        if len(word):
            out.append( simplify_for_distance(word) )
    if verbose:  err([text, words, out])
    return out
Code example #13
File: nlp.py  Project: grahammorehead/gmutils
def combine_with_previous(previous, current):
    """
    Correct for some errors made by the spaCy sentence splitter

    Parameters
    ----------
    previous: spaCy Span

    current: spaCy Span

    Returns
    -------
    bool
    """
    verbose = False
    if verbose:
        err([
            previous.text, previous.end - previous.start, current.text,
            current.end - current.start
        ])

    # This sentence too short and not capitalized or previous is a paren
    if current.end - current.start < 3 and (current.text[0].islower() or
                                            re.search(r"\)", previous.text)):
        if verbose:
            err([[current.text]])
        return True

    # This sentence moderately short and has a close paren
    if current.end - current.start < 7 and re.search(r"\)", current.text):
        if verbose:
            err([[current.text]])
        return True

    # Previous sentence too short and is capitalized
    if previous.end - previous.start < 3 and previous.text[0].isupper():
        if verbose:
            err([[previous.text]])
        return True

    # Previous sentence had no ending punctuation
    if not ( re.search(r"[.?!]$", previous.text) \
                 or re.search(r"[.?!]\S$", previous.text) \
                 or re.search(r"[.?!]\S\S$", previous.text) \
                 or re.search(r"[.?!]\s$", previous.text) \
                 or re.search(r"[.?!]\s\s$", previous.text) ):
        if verbose:
            err([[previous.text]])
        return True

    return False
Code example #14
def prefix_search(line, index='default'):
    err([line])
    body = {
        "query": {
            "match": {
                "_all": {
                    "query": line,
                    "type": "phrase_prefix",
                    "max_expansions": 100
                }
            }
        }
    }
    res = es.search(index=index, body=body)
    return res['hits']['hits']
Code example #15
def hasnan(T):
    """
    Determine if a tensor has a NaN or Inf in it
    """
    s = T.data.sum()
    if s != s:
        return True
    if s == get_inf():
        return True
    if s == get_neginf():
        return True

    T = T.data.cpu()
    result = (T != T).numpy()
    if result.sum():
        err([s])
        return True
    return False
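
An equivalent check using the built-in torch predicates (a standalone sketch, not the project's helper; it avoids the get_inf()/get_neginf() globals used above):

import torch

def has_nan_or_inf(T):
    return bool(torch.isnan(T).any() or torch.isinf(T).any())

print(has_nan_or_inf(torch.tensor([1.0, 2.0])))           # False
print(has_nan_or_inf(torch.tensor([1.0, float('nan')])))  # True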
Code example #16
File: document.py  Project: grahammorehead/gmutils
    def text_to_char_offsets(self, text, start_char=0):
        """
        Search in the text of this Document for the first substring matching text

        Parameters
        ----------
        text : str

        start_char : int

        Returns
        -------
        pair of int
            start/end char offsets

        """
        verbose = False
        words = naked_words(text)  # Words to find
        dwords = self.get_text().split(' ')  # Document words
        offsets = None
        index = 0
        if verbose: err([text, words, dwords])

        for i, dw in enumerate(dwords):
            dw = simplify_for_distance(dw)
            if re.search(words[0], dw, flags=re.I):
                matched = self.matching_words_from_start(words, dwords[i:])
                if len(matched) == len(words):
                    matched_phrase = ' '.join(matched)
                    end_index = index + len(matched_phrase)
                    offsets = (index, end_index)
                    break

            if i == 0:
                index += len(dw)
            else:
                index += 1 + len(dw)

        if offsets is None:
            err([], {
                'ex': "No best span for [%s] in:\n%s" % (text, self.get_text())
            })

        return offsets  # pair of (int, int)
Code example #17
def word_search(file, index='default'):
    """
    Search for each line of file in the index

    Parameters
    ----------
    index : str
        name of index where things are to be stored

    file : str
        path to file where lines are to be read

    """
    verbose = False
    if file is None:
        err(['word_search requires file'])
        exit()

    seen = set()
    iterator = iter_file(file)

    while True:
        try:
            line = next(iterator).rstrip()
            name = None
            try:
                e = line.split('|')
                name = e[0]
            except:
                name = line

            name = normalize(name)

            for word in name.split(' '):
                if len(word) < 3:
                    continue
                if word in seen:
                    pass  # only do each one once
                else:
                    seen.add(word)
                    search(word, {'prefix': True})

        except StopIteration:
            break
Code example #18
    def loss(self, X, Y, verbose=False):
        """
        Loss function based on L1 but magnifying or diminishing the loss based on the relative occurrences of that class

        X : tensor [float, float]  (probability of each of two classes)
        Y : tensor int (which class, 0 or 1)
        """
        X = X[:, 1]
        Yd = Y.double()  # naturally a one-mask
        zero_mask = torch.abs(self.one - Yd)  # 1 for every zero element in Y
        zero_weight = self.zero_bias * zero_mask  # dilation applied to zero class
        one_weight = self.one_bias * Yd  # dilation for class 1
        weight = zero_weight + one_weight
        L = torch.abs(Y.double() - X)  # unweighted loss
        Lw = weight * L  # class-adjusted loss

        Lfinal = torch.mean(Lw)  # + range_loss  + sum_loss
        if verbose: err(["Lfinal:", Lfinal])

        return Lfinal
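
A standalone sketch of the class-weighted L1 idea with made-up weights; self.zero_bias, self.one_bias and self.one are attributes of the model, so plain numbers stand in here:

import torch

X = torch.tensor([[0.3, 0.7], [0.8, 0.2]]).double()   # P(class 0), P(class 1) per row
Y = torch.tensor([1, 0])                               # true class per row
zero_bias, one_bias = 1.0, 2.0                         # illustrative class weights

p1 = X[:, 1]
Yd = Y.double()
weight = zero_bias * (1.0 - Yd) + one_bias * Yd        # per-sample weight by class
Lfinal = torch.mean(weight * torch.abs(Yd - p1))
print(Lfinal.item())                                   # ~0.4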
Code example #19
    def iter_training_samples_by_thresh(self, aorb, thresh):
        """
        Yield training samples above or below a threshold
        """
        if aorb == 'above':
            while True:
                for i, row in self.x_test.iterrows():
                    y = self.y_test.loc[i]
                    if y > thresh:
                        yield row, y

        elif aorb == 'below':
            while True:
                for j, row in self.x_test.iterrows():
                    y = self.y_test.loc[j]
                    if y <= thresh:
                        yield row, y

        else:
            err([], {'ex': "Unexpected"})
Code example #20
def generate_powerlaw_distro(w):
    """
    For some width 'w', generate a set of points following the desired kind of power-law distribution

    Using:  y = 1/x
    """
    x = 2  # the starting point
    end = 10
    interval = (end - x) / w
    output = []

    for i in range(w):
        output.append(1 / x)
        x += interval

    if len(output) > 256:
        err()
        exit()
    output = torchvar(output)

    return output
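
A quick numeric check of the 1/x sampling without the torchvar() wrapper, using plain floats and w = 4:

w = 4
x, end = 2.0, 10.0
step = (end - x) / w
points = []
for _ in range(w):
    points.append(1.0 / x)
    x += step
print(points)    # -> [0.5, 0.25, 0.1666..., 0.125]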
Code example #21
File: document.py  Project: grahammorehead/gmutils
    def preprocess(self, vocab):
        """
        Collapse the parse tree to a simpler format where sensible.  Identify the verb nodes and find their theta roles.

        Parameters
        ----------
        vocab : dict { lemma string -> vector }

        """
        verbose = False
        if verbose: err()
        # self.agglomerate_verbs_preps(vocab)
        # self.agglomerate_compound_adj(vocab)    # Might raise the branching factor
        self.agglomerate_entities()             # Treat each entity as a single thing with a single vector (important), even though it might raise branching factor
        self.delegate_to_negations()  # Can lower the branching factor
        self.agglomerate_modifiers()  # Can lower the branching factor
        # self.agglomerate_twins()                # Can raise branching factor
        self.agglomerate_verbauxes()
        self.delegate_to_conjunctions()  # Can lower the branching factor
        self.agglomerate_idioms()
        if verbose: err()

        # self.analyze_trees()                    # For debugging
        self.embed(vocab)
        if verbose: err()
Code example #22
def squash_old(T):
    """
    Normalize length of vector to the range [0,1] without altering direction.
    """
    if not T.sum().gt(0):
        return T

    sq = T.pow(2)
    if not sq.sum().gt(0):
        return T

    sqsum = sq.sum(-1, keepdim=True)
    if not sqsum.sum().gt(0):
        err()
        exit()

    denom = 1 + sqsum
    scale = sqsum / denom
    unitvec = T / torch.sqrt(sqsum)
    out = scale * unitvec

    return out
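
A quick numeric check, assuming squash_old() above is in scope together with its module's torch import: for s = [3, 4] the output keeps the direction of s and has norm ||s||^2 / (1 + ||s||^2) = 25/26.

import torch

s = torch.tensor([[3.0, 4.0]])       # ||s|| = 5
v = squash_old(s)
print(v)                             # ~[[0.5769, 0.7692]]  (same direction as s)
print(v.norm(dim=-1))                # ~[0.9615] == 25/26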
Code example #23
File: mongo_utils.py  Project: grahammorehead/gmutils
def mongo_count(db_name='default',
                collection_name='default',
                host='localhost',
                port=27017,
                user=None,
                password=None,
                authSource=None):
    """
    Return the number of documents in a collection
    """
    client = get_mongo_client(db_name=db_name,
                              collection_name=collection_name,
                              host=host,
                              port=port,
                              user=user,
                              password=password,
                              authSource=authSource)

    db = client[db_name]
    coll = db[collection_name]

    return coll.count_documents({})   # count_documents() replaces the deprecated count()

    # Alternative method on mothballs:
    n = 0
    iterator = coll.find()
    try:
        for item in iterator:
            n += 1
            if n % 1000 == 0:
                print("n =", n)
    except StopIteration:
        pass
    except Exception as e:
        err([], {'exception': e})

    return n
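
For comparison, the same count with pymongo directly; this is a hedged sketch, the database and collection names are placeholders and a local mongod is assumed:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
n = client['mydb']['mycoll'].count_documents({})   # modern replacement for count()
print(n)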
Code example #24
File: document.py  Project: grahammorehead/gmutils
    def matching_words_from_start(self, awords, bwords):
        """
        Recursively find a sequence of words in bwords that matches awords looking for each a-word within each b-word
        """
        verbose = False
        bword = simplify_for_distance(bwords[0])
        if verbose: err([awords[0], bword])

        if len(awords) == 1:  # Base Case
            if re.search(awords[0], bword, flags=re.I):
                if verbose: err()
                return [bwords[0]]
            else:
                if verbose: err()
                return []

        if re.search(awords[0], bword, flags=re.I):
            if verbose: err()
            matched = [bwords[0]]
            return matched + self.matching_words_from_start(
                awords[1:], bwords[1:])  # Recursion
        else:
            if verbose: err()
            return []
Code example #25
File: document.py  Project: grahammorehead/gmutils
    def delegate_to_conjunctions(self):
        """
        For the purpose of tree simplification (lower branching factor), and logical faithfulness, take conjunction arguments and bring them in under
        the conjunction node.
        """
        verbose = False
        altered = True
        while altered:
            altered = False
            if verbose: err([altered])
            for tree in self.trees:
                for node in tree.get_conjunctions():
                    if verbose: err([tree, "calling:", node])
                    a = node.delegate_to_conjunction()
                    if a: altered = a  # only switch if going to True
                if verbose: err([tree, altered])
Code example #26
File: nlp.py  Project: grahammorehead/gmutils
def tokenize(text):
    """
    Tokenize with spaCy, then further split hyphenated and apostrophe forms via split_words()
    """
    verbose = False

    if verbose:
        err([text])
    final = []
    for word in tokenize_spacy(text):
        if verbose:
            err([word])
        final.extend(split_words(word))
    if verbose:
        err([final])

    return final
Code example #27
File: nlp.py  Project: grahammorehead/gmutils
def set_sentence_starts(doc):
    """
    Adjust the elements in a spaCy Doc to record the sentence starts as output by sentence_fixer()

    This function is designed to be a spaCy pipeline element
    """
    verbose = False
    starts = set([])

    for offsets in find_sentence_offsets(doc):
        if verbose: err([offsets])
        start, end = offsets
        starts.add(start)

    for token in doc[:-1]:
        if token.i in starts:
            token.is_sent_start = True
            if verbose: err(["START", token])
        else:
            token.is_sent_start = False
            if verbose: err([token])

    return doc
Code example #28
    def read_data(self, inputs):
        """
        Default function for reading data into a Dataset.  Will accept one of:
            - file
            - list of files
            - dir

        Parameters
        ----------
        inputs : str or list of str
            a file path, a list of file paths, or a directory path

        Returns
        -------
        array of DataFrame (4 of them: x_train, x_test, y_train, y_test)

        """
        if isinstance(inputs, list):
            if len(inputs) == 1:
                inputs = inputs[0]
            elif len(inputs) > 1:
                return default_read_data_files(inputs)
            else:
                err([], {'ex': "ERROR: zero-length array of inputs"})

        if isinstance(inputs, str):
            if os.path.isfile(inputs):
                return default_read_data_file(inputs)
            elif os.path.isdir(inputs):
                return default_read_data_dir(inputs)
            else:
                err([], {'ex': "ERROR: inputs neither file nor dir."})

        else:
            err([], {'ex': 'Unrecognized input type: %s' % type(inputs)})

        return x_train, x_test, y_train, y_test
Code example #29
def torchtensor(X, ttype=TORCH_DOUBLE, requires_grad=False):
    """
    Converts X into a PyTorch Tensor

    Parameters
    ----------
    X : int, float, or torch.Tensor

    """
    if isinstance(X, torch.Tensor):
        # If X is already a torch tensor, this just changes its type
        T = X
        if ttype == torch.DoubleTensor:  # float 64
            T = T.double()
        elif ttype == torch.FloatTensor:  # float 32
            T = T.float()
        elif ttype == torch.HalfTensor:  # float 16
            T = T.half()
        elif ttype == torch.ByteTensor:  # uint 8
            T = T.byte()
        elif ttype == torch.CharTensor:  # int 8
            T = T.char()
        elif ttype == torch.ShortTensor:  # int 16
            T = T.short()
        elif ttype == torch.IntTensor:  # int 32
            T = T.int()
        elif ttype == torch.LongTensor:  # int 64
            T = T.long()

    else:
        if isinstance(X, int) or isinstance(X, float):
            X = [X]
        if isinstance(X, list):
            T = ttype(X)
        elif scipy.sparse.issparse(X):

            X = X.todense().tolist()
            T = torchtensor(X, ttype=ttype, requires_grad=requires_grad)
            T = torch.squeeze(T)

            return T

            # not using this part for now (too much of a demand on the hardware)
            ###  SPARSE  ##################################
            X = coo_matrix(X)
            values = X.data
            indices = np.vstack((X.row, X.col))
            i = torch.LongTensor(indices)
            v = torch.DoubleTensor(values)
            shape = X.shape

            if ttype == torch.DoubleTensor:  # float 64
                T = torch.sparse.DoubleTensor(i, v,
                                              torch.Size(shape)).to_dense()
            elif ttype == torch.FloatTensor:  # float 32
                T = torch.sparse.FloatTensor(i, v,
                                             torch.Size(shape)).to_dense()
            elif ttype == torch.HalfTensor:  # float 16
                T = torch.sparse.HalfTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.ByteTensor:  # uint 8
                T = torch.sparse.ByteTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.CharTensor:  # int 8
                T = torch.sparse.CharTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.ShortTensor:  # int 16
                T = torch.sparse.ShortTensor(i, v,
                                             torch.Size(shape)).to_dense()
            elif ttype == torch.IntTensor:  # int 32
                T = torch.sparse.IntTensor(i, v, torch.Size(shape)).to_dense()
            elif ttype == torch.LongTensor:  # int 64
                T = torch.sparse.LongTensor(i, v, torch.Size(shape)).to_dense()

            T = torch.squeeze(T)

            ################################################
        else:
            err()
            print(zzz)

    try:
        T.requires_grad = requires_grad
    except:
        pass

    return T
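
An illustrative call, assuming torchtensor() and its module-level defaults (e.g. TORCH_DOUBLE) are in scope:

import torch

T = torchtensor([1, 2, 3], ttype=torch.FloatTensor)
print(T, T.dtype)    # tensor([1., 2., 3.]) torch.float32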
Code example #30
    if torch.cuda.is_available():
        # torch.cuda.manual_seed_all(12345)
        INF = INF.cuda()
        negINF = negINF.cuda()
        L1_LOSS = L1_LOSS.cuda()
        NEG_ONE = NEG_ONE.cuda()
        TORCH_TWO = TORCH_TWO.cuda()
        TORCH_E = TORCH_E.cuda()
        LEAKY_RELU = LEAKY_RELU.cuda()
        TORCH_DILATION = TORCH_DILATION.cuda()

except Exception as e:
    TORCH_DOUBLE = None
    TORCH_LOSS = object
    raise
    err([], {'exception': e, 'level': 0})

##############################################################################################
# OBJECTS


class PyTorchModule(nn.Module, Object):
    """
    Basic nn.Module, with memory-efficient saving/loading
    """
    def __init__(self, options={}):
        """
        Instantiate the object
        """
        super(PyTorchModule, self).__init__()
        Object.__init__(self)