Example #1
    def fit_cca(self, outfile=''):

        # fit a linear CCA mapping and replace the pretrained name embeddings with CCA-transformed embeddings

        self.load_embeddings()
        self.extract_pretrained_prototype_embeddings()

        items, vectors = zip(
            *[(k, v) for k, v in self.pretrained_prototype_embeddings.items()
              if k in self.exemplar_to_concept])
        concept_embs = Reach(vectors, items)

        train_vectors = []
        for x in items:
            train_vectors.append(self.train_embeddings[x])
        train_vectors = Reach.normalize(train_vectors)

        cca = CCA(n_components=self.train_embeddings.size, max_iter=10000)
        cca.fit(train_vectors, concept_embs.norm_vectors)

        # transform all name embeddings using the CCA mapping
        all_name_embeddings = deepcopy(self.pretrained_name_embeddings)
        items = [x for _, x in sorted(all_name_embeddings.indices.items())]
        projected_name_embeddings = cca.transform(
            all_name_embeddings.norm_vectors)
        new_name_embeddings = Reach(projected_name_embeddings, items)

        self.pretrained_name_embeddings = new_name_embeddings
        self.load_embeddings()

        if outfile:
            with open('{}_cca.p'.format(outfile), 'wb') as f:
                pickle.dump(cca, f)
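A minimal standalone sketch of the same pattern, fitting a CCA mapping between two toy spaces and wrapping the projection in a Reach object (all names and dimensions below are made up for illustration):

import numpy as np
from reach import Reach
from sklearn.cross_decomposition import CCA

names = ['headache', 'migraine', 'fever', 'nausea', 'vertigo']
name_vectors = np.random.rand(5, 8)      # stand-in pretrained name embeddings
concept_vectors = np.random.rand(5, 6)   # stand-in concept prototype embeddings

cca = CCA(n_components=2, max_iter=10000)
cca.fit(name_vectors, concept_vectors)

projected = cca.transform(name_vectors)  # names mapped into the shared CCA space
projected_space = Reach(projected, names)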
Example #2
    def synonym_retrieval(self, normalize=True, pretrained=False):

        rank_util = RankingUtils()

        # first encode
        train_names = sorted({name for name, concept in self.train_data})
        train_vectors = [
            self.pretrained_name_embeddings[x] for x in train_names
        ]
        train_embeddings = Reach(train_vectors, train_names)
        if not pretrained:
            train_embeddings = self.extract_online_dan_embeddings(
                provided_names=set(train_embeddings.items.keys()),
                normalize=normalize)
        validation_names = sorted(
            {name for name, concept in self.validation_data})
        validation_vectors = [
            self.pretrained_name_embeddings[x] for x in validation_names
        ]
        validation_embeddings = Reach(validation_vectors, validation_names)
        if not pretrained:
            validation_embeddings = self.extract_online_dan_embeddings(
                provided_names=set(validation_embeddings.items.keys()),
                normalize=normalize)

        # then rank training data for each validation_item
        complete_ranking = []
        for reference, concept in self.validation_data:

            # calculate distances
            reference_idx = validation_embeddings.items[reference]
            reference_vector = validation_embeddings.norm_vectors[
                reference_idx]
            scores = train_embeddings.norm_vectors.dot(reference_vector.T)

            # rank
            synonym_names = self.training_clusters[concept]
            synonym_idxs = [
                train_embeddings.items[synonym_name]
                for synonym_name in synonym_names
            ]
            ranking = np.argsort(-scores)
            ranks = [
                np.where(ranking == synonym_idx)[0][0]
                for synonym_idx in synonym_idxs
            ]
            ranks, synonyms = zip(*sorted(zip(ranks, synonym_names)))
            complete_ranking.append((reference, synonyms, ranks))

        ranking = [x[-1] for x in complete_ranking]
        print(rank_util.ranking_accuracy(ranking))
        print(rank_util.mrr(ranking))
        print(rank_util.mean_average_precision(ranking))

        return complete_ranking
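The core of the ranking above is a dot product over unit vectors followed by an argsort; a toy illustration (all numbers made up):

import numpy as np

train = np.array([[1.0, 0.0], [0.6, 0.8], [0.0, 1.0]])  # unit-norm training vectors
reference = np.array([0.8, 0.6])                        # unit-norm query vector

scores = train.dot(reference)        # cosine similarities: [0.8, 0.96, 0.6]
ranking = np.argsort(-scores)        # best match first: [1, 0, 2]
rank = np.where(ranking == 0)[0][0]  # rank of training item 0 -> 1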
Example #3
 def getConceptVectors_choi(self):
     '''
     Wraps a concept vector file from reach, so you do not have to declare this in each feature generation file
     '''
     if self.conceptVectors is None:
         self.conceptVectors = Reach(cfg.PATH_RESOURCES + 'concepts_choi.txt', header=False)
     return self.conceptVectors
Example #4
 def getWordVectors(self):
     '''
     Wraps a word vector file from reach, so you do not have to declare this in each feature generation file
     '''
     if self.wordVectors is None:
         self.wordVectors = Reach(cfg.PATH_RESOURCES+'psych_vectors.txt', header=True)
     return self.wordVectors
Example #5
    def extract_online_dan_embeddings(self,
                                      normalize=True,
                                      verbose=False,
                                      provided_names=(),
                                      preprocess=False):

        self.model.eval()

        if provided_names:
            input_items = provided_names
            if preprocess:
                input_items = [self.preprocess(name) for name in input_items]
            embeddings = self.vectorize.create_reach_object(input_items)
        else:
            embeddings = deepcopy(self.pretrained_name_embeddings)

        input_vectors = embeddings.norm_vectors if normalize else embeddings.vectors
        input_items = [x for _, x in sorted(embeddings.indices.items())]

        # batch the input items to limit memory use
        all_embeddings = []
        batch_size = 1000
        for i in tqdm(range(0, len(input_items), batch_size),
                      disable=not verbose):
            input_batch = input_vectors[i:i + batch_size]
            input_tensor = torch.FloatTensor(input_batch).to(self.device)
            online_batch = self.model(input_tensor).detach().cpu().numpy()
            all_embeddings.append(online_batch)
        all_embeddings = np.concatenate(all_embeddings)

        online_embeddings = Reach(all_embeddings, input_items)

        return online_embeddings
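The batching loop above is a generic pattern for memory-bounded inference; a self-contained sketch with a stand-in linear model (the model and sizes are assumptions, not the actual DAN):

import numpy as np
import torch

model = torch.nn.Linear(8, 4)               # stand-in for the trained model
vectors = np.random.rand(2500, 8).astype(np.float32)

batches = []
with torch.no_grad():
    for i in range(0, len(vectors), 1000):  # fixed-size batches bound memory
        batch = torch.from_numpy(vectors[i:i + 1000])
        batches.append(model(batch).cpu().numpy())
all_embeddings = np.concatenate(batches)    # shape (2500, 4)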
Example #6
    def __init__(self):
        self.client = zulip.Client(
            site="https://merkalysis.zulipchat.com/api/")
        self.subscribe_all()
        self.market = Reach()

        print("done init")
        self.subkeys = ["reach"]
Example #7
def baseline(text, keep_n=10000):
    """Create a one-hot encoded baseline vector space."""
    c = CountVectorizer(max_features=keep_n)
    c.fit(text)

    words = ["UNK"] + list(c.get_feature_names_out())
    vectors = np.eye(len(words))
    return Reach(vectors, words, unk_index=0)
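A possible call on a made-up two-sentence corpus; with one-hot vectors every distinct word is orthogonal to every other:

space = baseline(["the cat sat", "the dog ran"], keep_n=100)
print(space.vectors.shape)  # (vocabulary size + 1, vocabulary size + 1)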
Example #8
def compose(documents,
            embeddings,
            window,
            context_function,
            use_focus=True,
            norm=False):
    """
    Map phrases from sentences to vectors.

    Parameters
    ==========
    documents : list of lists
        A list of lists, where each sublist contains 2 lists of the same
        length, where the first list contains the tokens of a text, and
        the second list contains the BIO of the NP chunks for said text.
    embeddings : Reach
        A reach instance which contains the embeddings you want to use to
        vectorize.
    window : int
        The window size to use.
    context_function : function
        The function which is used to weigh the contexts. Must take a 2D
        matrix and return a 2D matrix of the same shape.
    use_focus : bool, optional, default True
        Whether to vectorize the focus word.
    norm : bool, optional, default False
        Whether to use the unit vectors to compose.

    Returns
    =======
    phrases : Reach
        A reach instance containing the phrases and their vectors.

    """
    bio_regex = re.compile(r"BI*")

    phrases, vectors = [], []

    for idx, (txt, bio) in enumerate(documents):

        txt = " ".join(txt).lower().split()
        bio = "".join([x.split("-")[0] for x in bio])
        for t in bio_regex.finditer(bio):
            b, e = t.span()
            phrase_string, vector = create_phrase_vector(
                txt, b, e, window, embeddings, np.mean, np.mean,
                context_function, use_focus, norm)

            # Phrase string needs to be augmented with index to make
            # the dictionary mapping not overwrite itself.
            phrase_string = "{}-{}".format(phrase_string, len(phrases))
            phrases.append(phrase_string)
            vectors.append(vector)

    return Reach(vectors, phrases)
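A hypothetical call matching the documented input layout, assuming `embeddings` is a loaded Reach space (`create_phrase_vector` is used internally and is not shown here); the identity function is a valid `context_function`, since it returns a matrix of the same shape it receives:

docs = [(["the", "big", "dog", "barked"],
         ["B-NP", "I-NP", "I-NP", "O"])]
phrase_space = compose(docs, embeddings, window=2,
                       context_function=lambda m: m)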
Example #9
def label_chunks(gold_bio, phrase_bio, embeddings):
    """
    Find a label for each phrase chunk based on the gold chunks.

    Each phrase chunk which does not correctly overlap with a gold chunk
    is pruned from the embedding space and added as a false positive.

    Parameters
    ==========
    gold_bio : list of string
        The token-level BIO string for the gold standard data. Must include
        classes on the B and I labels (e.g. B-Test, I-test).
    phrase_bio : list of string
        The token-level BIO string for the phrase data. Does not include
        any classes on the B and I labels. (e.g. B and I instead of B-test).
    embeddings : Reach
        The embedding space for the phrases.

    Returns
    =======
    pruned_embeddings : Reach
        The embedding space with any false positive phrases removed.
    words2label : dict
        Dictionary mapping from the name of each phrase to a label.
    chunk_labels : np.array
        An aligned list from phrases to labels.
    results : list of tuples
        An intermediate list of false positives and false negatives constructed
        during matching the gold and phrase chunks.

    """
    # Create a list of (start, end, label) tuples from BIO.
    phrase_chunks = bio_to_index(phrase_bio)
    gold_chunks = bio_to_index(gold_bio)
    phrase_labels, results = link_chunks_to_gold(phrase_chunks, gold_chunks)

    # False positives get assigned the label "o", so they need to be removed.
    allowed = [i for i, v in enumerate(phrase_labels) if v != "o"]

    if results:
        t, _ = zip(*results)
        print("Num false neg: {0}".format(Counter(t)))

    # We assume alignment between chunks and words.
    vectors = embeddings.norm_vectors[allowed]
    chunk_labels = np.array(phrase_labels)[allowed]
    words = [embeddings.indices[x] for x in allowed]
    words2label = {
        embeddings.indices[x]: chunk_labels[idx]
        for idx, x in enumerate(allowed)
    }

    pruned_embeddings = Reach(vectors, words)

    return pruned_embeddings, words2label, chunk_labels, results
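The pruning itself is plain integer indexing over the embedding matrix; a toy illustration (labels made up):

import numpy as np

labels = ["test", "o", "problem"]                        # "o" marks a false positive
allowed = [i for i, v in enumerate(labels) if v != "o"]  # [0, 2]
vectors = np.eye(3)[allowed]                             # keep rows 0 and 2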
Example #10
    def __init__(self, path, version='0'):
        g = DiGraph()
        gaged_reaches = []
        db = openFile(path, "r")
        table = db.getNode('/', 'networks/network' + str(version))
        reaches = {}
        #read data out of file
        for row in table:
            if str(row['ComID']) != '-1':
                reaches[row['ComID']] = Reach(self, row)
            else:
                reaches[row['ComID']] = '-1'
                g.add_edge(Reach(self, row), '-1')
            if row['MonitoredFlag'] == '1':
                gaged_reaches.append(row['ComID'])
        db.close()
        #make network
        for comid in reaches.keys():
            to_comID = reaches[comid]._ToComID
            if to_comID != '-1':
                g.add_edge(reaches[comid], reaches[to_comID])
            else:
                g.add_edge(reaches[comid], '-1')
        self._g_unbroken = g.copy()
        self._g_unbroken_reverse = self._g_unbroken.reverse()

        #break upstream of monitored reaches
        for i in gaged_reaches:
            if i != '-1':
                up = list(g.predecessors(reaches[i]))  # materialize before mutating g
                for j in up:
                    if j != '-1':
                        g.remove_edge(j, reaches[i])
                    else:
                        g.remove_edge(j, '-1')
        self._g = g
        self._g_rev = g.reverse()
        self._version = str(version)
        self._path = str(path)
        self._reaches = reaches
Example #11
    def create_reach_object(self, strings, normalize=False, outfile=''):

        vectors = []
        for s in tqdm(strings):
            token_embs = self.vectorize_string(s, norm=normalize)
            vector = np.average(np.array(token_embs), axis=0)
            vectors.append(vector)

        reach_object = Reach(vectors, list(strings))

        if outfile:
            reach_object.save_fast_format(outfile)

        return reach_object
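A possible usage, assuming `vec` is an instance of the surrounding class with its token embeddings loaded (`vec` and the strings are hypothetical):

space = vec.create_reach_object(["chronic pain", "back ache"], normalize=True)
print(space.vectors.shape)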
Example #12
def create_concepts(concepts,
                    embeddings,
                    include_np=True):
    """Create concepts by summing over descriptions in embedding spaces."""
    # Gold standard labels for concepts:
    sty = json.load(open("data/concept_label.json"))

    concept_names = []
    vectors = []

    concept_labels = []

    for name, descriptions in tqdm(list(concepts.items())):

        try:
            label = sty[name]
        except KeyError:
            continue

        if not include_np and label == "np":
            continue

        concept = []

        for idx, desc in enumerate(descriptions):

            try:
                desc = desc.lower().split()
                # desc = [x for x in desc if x not in STOP_WORDS]
                vec = embeddings.vectorize(desc, remove_oov=True)
                if not np.any(vec):
                    continue
                concept.append(np.mean(vec, axis=0))
            except ValueError:
                pass

        if not concept:
            continue

        concept_labels.append(label)
        name = "{0}_{1}".format(name, "_".join(descriptions[0].split()))
        concept_names.append(name)
        vectors.append(np.array(concept).mean(axis=0))

    r = Reach(np.array(vectors), concept_names)
    name2label = dict(zip(concept_names, concept_labels))

    return r, name2label
Example #13
    def create_reach_object(self, names, outfile=''):

        names = sorted(names)

        vectors = []
        for name in tqdm(names):
            token_embs = self.vectorize_string(name, norm=False)
            vector = np.average(np.array(token_embs), axis=0)
            vectors.append(vector)

        reach_object = Reach(vectors, names)

        if outfile:
            reach_object.save_fast_format(outfile)

        return reach_object
Example #14
    def extract_online_lstm_embeddings(self,
                                       prune=False,
                                       normalize=True,
                                       verbose=False,
                                       provided_names=(),
                                       preprocess=False):

        self.model.eval()

        if provided_names:
            # materialize as a list so the batching below can slice it
            input_items = list(provided_names)
            if preprocess:
                input_items = [self.preprocess(name) for name in input_items]
        else:
            embeddings = deepcopy(self.sampling.pretrained_name_embeddings)
            if prune:
                names_to_prune = set(
                    self.sampling.exemplar_to_concept.keys()).union(
                        self.sampling.validation_references.keys())
                embeddings.prune(names_to_prune)
            input_items = [x for _, x in sorted(embeddings.indices.items())]

        # batch the input items to limit memory use
        all_embeddings = []
        batch_size = 500 if self.hidden_size >= 9600 else 1000
        for i in tqdm(range(0, len(input_items), batch_size),
                      disable=not verbose):
            input_batch = input_items[i:i + batch_size]
            input_vectors = []
            for item in input_batch:
                vector = self.sampling.vectorize_string(item, norm=normalize)
                input_vectors.append(torch.FloatTensor(vector).to(self.device))

            # pass through LSTM network
            lstm_embeddings = self.forward_lstm(input_vectors)
            online_batch = lstm_embeddings.detach().cpu().numpy()

            # add batch
            all_embeddings.append(online_batch)

        # convert to embeddings
        all_embeddings = np.concatenate(all_embeddings)

        online_embeddings = Reach(all_embeddings, input_items)

        return online_embeddings
Example #15
    def TraverseReach(self, cur):
        # Get a new reach ID and create a Reach with it.
        reachID = len(self.l_Reach)
        reach = Reach(reachID)

        # Mark the first non-junction skeleton point as done with this reach ID.
        self.array_done[cur[0], cur[1]] = reachID
        # Find the next non-junction skeleton point(s).
        list_nextPoints = self.getNextPoints(cur)
        while len(list_nextPoints) == 1:  # follow the reach while it does not branch
            cur = list_nextPoints[0]
            reach.addToStream(cur)
            # Give this new point the same reach ID as its predecessor.
            self.array_done[cur[0], cur[1]] = reachID
            list_nextPoints = self.getNextPoints(cur)

        # Append the completed reach, which ends at `cur`, to l_Reach.
        self.l_Reach.append(reach)
        return cur, reach
Example #16
def create_concepts(concepts, embeddings, include_np=True, labels=None):
    """Create concepts by summing over descriptions in embedding spaces."""
    # Gold standard labels for concepts come from the optional `labels` dict.
    concept_names = []
    vectors = []

    for name, descriptions in tqdm(list(concepts.items())):

        label = None
        if labels is not None:
            try:
                label = labels[name]
            except KeyError:
                continue

        if not include_np and label == "np":
            continue

        concept = []

        for idx, desc in enumerate(descriptions):

            try:
                desc = desc.lower().split()
                # desc = [x for x in desc if x not in STOP_WORDS]
                vec = embeddings.vectorize(desc, remove_oov=True)
                if not np.any(vec):
                    continue
                concept.append(np.mean(vec, axis=0))
            except ValueError:
                pass

        if not concept:
            continue

        concept_names.append(name)
        vectors.append(np.array(concept).mean(axis=0))

    r = Reach(np.array(vectors), concept_names)

    return r
Example #17
    def create_cluster_prototypes(self,
                                  provided_embeddings=None,
                                  total=False,
                                  pretrained=True):

        if provided_embeddings is not None:
            embeddings = provided_embeddings
        else:
            if pretrained:
                embeddings = self.pretrained_name_embeddings
            else:
                embeddings = self.extract_online_dan_embeddings(prune=False)

        clusters = self.clusters if total else self.training_clusters

        print('Creating cluster prototypes...')
        cluster_prototypes = {}
        for label, strings in clusters.items():
            strings = set(strings).intersection(self.training_names)
            cluster_prototypes[label] = self.create_prototype(
                strings, embeddings)
        items, vectors = zip(*cluster_prototypes.items())
        self.cluster_prototypes = Reach(vectors, items)
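`create_prototype` is not shown in this example; a plausible implementation (an assumption, not the original helper) averages the member vectors:

import numpy as np

def create_prototype(strings, embeddings):
    # Assumed helper: the prototype is the mean of the members' vectors.
    return np.mean([embeddings[s] for s in strings], axis=0)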
Example #18
    def __init__(self, river, len_dang_arcs, fast):
        # Initialize the variables
        self.array_skeleton = []
        self.row = 0
        self.col = 0
        self.length_dangling_arcs = len_dang_arcs
        self.array_Junction = []
        self.array_done = []
        self.list_Junction = []
        self.l_Reach = []
        self.flag_fast = fast

        # Find the skeleton and extract the river boundary
        # By the end of this constructor function we have a skeleton of the river with no dangling arcs, with junction points and reaches identified

        # 1. Get the river image in the form of numpy array -------------------
        array_Image = river.getRiver()

        # 2. Find skeleton for the river --------------------------------------
        print("Finding the skeleton")
        self.array_skeleton = morphology.skeletonize(array_Image > 0)
        self.array_skeleton = npy.array(self.array_skeleton, dtype=npy.uint8)
        (self.row, self.col) = self.array_skeleton.shape
        # ---------------------------------------------------------------------
        # image after taking the skeleton of river
        plt.imshow(self.array_skeleton)
        plt.savefig('temp/10_Skeleton.png', format='png', dpi=1200)
        # ---------------------------------------------------------------------

        # 3. Remove dangling arcs ---------------------------------------------
        if not fast:
            print("Removing dangling arcs")
            self.RemoveDanglingArc()
            # -----------------------------------------------------------------
            # image after removing dangling arcs form the skeleton
            plt.imshow(self.array_skeleton)
            plt.savefig('temp/11_RemoveDanglingArc.png',
                        format='png',
                        dpi=1200)
            # -----------------------------------------------------------------

        # 4. Find the junction points -----------------------------------------
        print("Identifying all the junctions")
        self.array_done = npy.zeros((self.row, self.col), dtype=int)
        self.array_Junction = npy.zeros((self.row, self.col), dtype=int)
        self.MarkJunctions()
        # ---------------------------------------------------------------------
        # image after finding the junctions of the river
        plt.imshow(self.array_Junction)
        plt.savefig('temp/12_Junctions.png', format='png', dpi=1200)
        # ---------------------------------------------------------------------

        # 5. Identify all the reaches -----------------------------------------
        print("Identifying all the reaches")
        reach = Reach(0)  # create a new Reach type variable with ReachID = 0
        self.l_Reach.append(reach)  # append it to the list - l_Reach
        self.IdentifyReach()

        # self.MarkJunctionsAndNeighbourhood()
        # self.MarryReachJunction()
        return
Example #19
    def synonym_retrieval_zeroshot(self,
                                   zeroshot_pairs,
                                   isolated=False,
                                   verbose=False,
                                   outfile=''):

        assert self.train_vectors is not None, 'No train vectors are loaded yet!'
        assert self.test_vectors is not None, 'No test vectors are loaded yet!'

        # new setting: add ALL zero-shot items to the training data to make the ranking task harder
        train_items = [
            x for _, x in sorted(self.train_vectors.indices.items())
        ]
        train_vectors = self.train_vectors.vectors

        zeroshot_items = set()
        for concept, reference, synonyms in zeroshot_pairs:
            zeroshot_items.add(reference)
            zeroshot_items.update(synonyms)
        zeroshot_items = sorted(zeroshot_items)
        zeroshot_vectors = []
        for zeroshot_item in zeroshot_items:
            zeroshot_vectors.append(self.test_vectors[zeroshot_item])
        if isolated:
            fused_vectors = Reach(zeroshot_vectors, zeroshot_items)
        else:
            all_items = train_items + zeroshot_items
            zeroshot_vectors = np.array(zeroshot_vectors)
            all_vectors = np.concatenate((train_vectors, zeroshot_vectors),
                                         axis=0)
            fused_vectors = Reach(all_vectors, all_items)

        # now rank
        complete_ranking = []
        for instance in tqdm(zeroshot_pairs, disable=False):

            concept, reference, synonyms = instance

            synonym_idxs = [fused_vectors.items[syn] for syn in synonyms]

            reference_idx = fused_vectors.items[reference]

            # calculate distances
            reference_vector = fused_vectors.norm_vectors[reference_idx]
            scores = fused_vectors.norm_vectors.dot(reference_vector.T)

            # extract ranking
            mask = [
                1 if x == reference_idx else 0
                for x in range(len(fused_vectors.items))
            ]
            scores = np.ma.array(scores, mask=mask)
            ranking = np.argsort(-scores)
            # cast to plain int so json.dump can serialize the ranks
            ranks = [
                int(np.where(ranking == synonym_idx)[0][0])
                for synonym_idx in synonym_idxs
            ]
            assert ranks
            ranks, synonyms = zip(*sorted(zip(ranks, synonyms)))
            instance = (concept, reference, synonyms)
            complete_ranking.append((instance, ranks))

        if outfile:
            print('Saving...')
            with open(outfile, 'w') as f:
                json.dump(complete_ranking, f)

        if verbose:
            instances, rankings = zip(*complete_ranking)
            print(round(self.mean_average_precision(rankings), 2), '&',
                  round(self.ranking_accuracy(rankings), 2), '&',
                  round(self.mean_reciprocal_rank(rankings), 2), '&')

        return complete_ranking
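The masking step above keeps the reference itself out of its own ranking; a toy illustration (scores made up):

import numpy as np

scores = np.array([0.99, 0.40, 0.70])         # index 0 is the reference itself
masked = np.ma.array(scores, mask=[1, 0, 0])  # mask out the reference
ranking = np.argsort(-masked)                 # masked entries sort last: [2, 1, 0]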
Example #20
if __name__ == "__main__":

    import logging
    import time
    import json

    # Setup
    logging.basicConfig(level=logging.INFO)

    umls = "sample_data/umls_sample.json"
    msh = "sample_data/abstracts_example.json"
    path_to_embeddings = ""
    use_subset = False

    embeddings = Reach(path_to_embeddings, header=True, verbose=False)

    logging.info("loaded embeddings.")

    start = time.time()

    y = Yarn(embeddings)

    umls = json.load(open(umls))
    msh = json.load(open(msh))

    if use_subset:

        subset = [
            u'di', u'tat', u'erp', u'ori', u'crna', u'pep', u'de', u'hip',
            u'glycoside', u'sterilization', u'ra', u'don', u'ecg', u'cell',
Example #21
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)
    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    strings = []
    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            length = sentences.shape[1]

            # Skips very short examples.
            if length <= 2:
                continue
            strings.extend([
                "".join([idx2word[idx] for idx in x])
                for x in sentences.numpy()
            ])
            trainer.step(batch_map, train=False, compute_loss=False)

            batch_result = {}
            batch_result['inside'] = diora.inside_h[:, -1]
            batch_result['outside'] = diora.outside_h[:, -1]
            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    print(len(strings), vectors.shape)
    r = Reach(vectors, strings)

    for s in strings:
        print(s)
        print(r.most_similar(s))
Example #22
# Stores a list of row numbers and argument strings per verb.

import re
import numpy as np
from reach import Reach
import transformargs

#pathnames
rowspath = './cooccurrence/weighted_sm.rows'
embeddingspath = './tulkens-embeddings/160/sonar-160.txt'
logpath = './failedwords.txt'
exportpath = './verbtrainingindex2'

#import data
rowsfile = open(rowspath, 'r', encoding='utf-8')
r = Reach(embeddingspath, header=True)
#holmatrix = np.load(holmatrixpath)

#load output file
log = open(logpath, 'w', encoding='utf-8')

control = np.zeros(160)
failedcount = 0
rowcount = 590408
t = transformargs.Transformer()
verbarray = np.array(
    ['', np.array([np.array([0, ''], object)], object)], object
)  #will contain line indexes and corresponding argument strings for each verb
#dummy first row added to show structure
verbindex = 0
oldkey = ''
Example #23
import sklearn.preprocessing
import math
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import pandas as pd

#pathnames
indexpath = './verbtrainingindex_withweights.npy'
holmatrixpath = './cooccurrence/svd/newmatrix.npy'
embeddingspath = './tulkens-embeddings/160/sonar-160.txt'
outputpath = './verbmatrices/version3'

#import data
index = np.load(indexpath)
holmatrix = np.load(holmatrixpath)
arg_data = Reach(embeddingspath, header=True)

#parameters
n_dim = 160
s_dim = 200
alpha_value = 50
min_sample_size = 400
# Note: in testing, a sample size of N = 500 was deemed acceptable. Here we
# do not split into train and test data, so the minimum sample size can be
# 80% of the one used when testing.
variance_control = True
mean_std = 0.08

matrices = dict()

#loop through verbs