Example #1
def __init__(self, path, encoding='latin1'):
    self.dict = {
        hash_string(u"#re"): [],
        hash_string(u"#lvl2"): [],
        hash_string(u"#lvl3"): []
    }
    self.index = []
    self.array = []
    self.add_dict(path, encoding)
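All of the examples on this page use hash_string to turn a unicode string into a 64-bit integer key before storing or looking it up. A minimal sketch of that behaviour, assuming spaCy's spacy.strings.hash_string (which these snippets appear to import):

from spacy.strings import hash_string

key = hash_string(u"#re")
assert key == hash_string(u"#re")   # deterministic: same string, same key
assert isinstance(key, int)         # 64-bit unsigned integer, usable as a dict key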
Example #2
    def Eg(self, text, opt=None, label=None):
        eg = self._eg
        eg.reset()

        doc = self.nlp(text)

        features = []
        word_types = set()
        for i, token in enumerate(doc[:-1]):
            next_token = doc[i + 1]

            strings = (token.lower_, next_token.lower_)
            key = hash_string('%s_%s' % strings)
            feat_slot = 0
            feat_value = 1
            features.append((0, token.lower, 1))            # unigram feature keyed by the lowercased token
            features.append((feat_slot, key, feat_value))   # bigram feature keyed by the hashed pair

        eg.features = features
        if opt is not None:
            eg.is_valid = [(clas in opt) for clas in range(self.nr_class)]
        if label is not None:
            eg.costs = [clas != label for clas in range(self.nr_class)]
        return eg
Example #3
def bucketize(self):
    self.table = [set() for i in range(self.N)]
    for (i, words) in enumerate(
            tqdm(self.transformer.inverse_transform(self.vectors))):
        for w in words:
            h = hash_string(str(w))
            self.table[h % self.N].add(i)
Example #4
File: hn.py Project: icorecool/CHN
def get_word_vector(w):
    h = hash_string(w.lower())
    i = _vectors.key2row.get(h, 0)
    # i = i if len(_vectors.data) > i else 0:
    if len(_vectors.data) > i:
        return _vectors.data[i]
    return np.zeros(_vector_size)
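get_word_vector guards against out-of-vocabulary words in two steps: an unknown key falls back to row 0, and a row outside the table falls back to a zero vector. A minimal sketch of the same pattern with plain NumPy (the names below are hypothetical stand-ins for the module-level _vectors and _vector_size globals used above):

import numpy as np

_vector_size = 4
_table = np.arange(12, dtype='f').reshape(3, _vector_size)  # 3 rows of fake vectors
_key2row = {123: 1}                                          # known hash -> row index

def lookup(key):
    row = _key2row.get(key, 0)          # unknown key -> row 0
    if row < len(_table):
        return _table[row]
    return np.zeros(_vector_size)       # out-of-range row -> zero vector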
Example #5
def test_get_vector(strings, data):
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0])
    assert list(v[strings[0]]) != list(data[1])
    assert list(v[strings[1]]) != list(data[0])
Example #6
def test_get_vector(strings, data):
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0])
    assert list(v[strings[0]]) != list(data[1])
    assert list(v[strings[1]]) != list(data[0])
Example #7
def serialize_doc(doc):
    doc_byte_string = doc.to_bytes()
    # doc_user_data_string = "" if len(doc.user_data) == 0 else pickle.dumps(doc.user_data, pickle.HIGHEST_PROTOCOL)
    value = {
        _DOC_BYTE_STRING: str(doc_byte_string),
        _USER_DATA: doc.user_data,
        _HASH: str(hash_string(doc.string))
    }
    return pickle.dumps(value, pickle.HIGHEST_PROTOCOL).encode('base64')
Example #8
def main(patterns_loc, text_loc, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
    counts = PreshCounter()
    t1 = time.time()
    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
        counts.inc(hash_string(mwe.text), 1)
    t2 = time.time()
    print("10m tokens in %d s" % (t2 - t1))

    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))
Example #9
def main(patterns_loc, text_loc, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
    counts = PreshCounter()
    t1 = time.time()
    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
        counts.inc(hash_string(mwe.text), 1)
    t2 = time.time()
    print("10m tokens in %d s" % (t2 - t1))
    
    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))
Example #10
def get_matches(matcher, pattern_ids, doc):
    matches = []
    for label, start, end in matcher(doc):
        candidate = doc[start : end]
        if pattern_ids[hash_string(candidate.text)] == True:
            start = candidate[0].idx
            end = candidate[-1].idx + len(candidate[-1])
            matches.append((start, end, candidate.root.tag_, candidate.text))
    return matches
Example #11
def unserialize_doc(nlp, serialized_string):
    value = pickle.loads(serialized_string.decode('base64'))
    doc_byte_string = value[_DOC_BYTE_STRING]
    user_data = value[_USER_DATA]
    doc_hash = value[_HASH]
    doc = Doc(nlp.vocab).from_bytes(doc_byte_string)
    assert str(hash_string(
        doc.string)) == doc_hash, "the hash doesn't match the stored document hash"
    doc.user_data = user_data
    return doc
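Examples #7 and #11 form a serialize/unserialize pair: the pickled payload is wrapped in base64 and the document hash is re-checked on the way back in. Note that .encode('base64'), .decode('base64') and doc.string are Python 2 / older-spaCy idioms. A rough sketch of the same round-trip with the standard base64 module (a hedged equivalent with hypothetical names, not the original project's code):

import base64
import pickle

def to_blob(value):
    # pickle the payload dict, then base64-encode it for safe transport
    return base64.b64encode(pickle.dumps(value, pickle.HIGHEST_PROTOCOL))

def from_blob(blob):
    return pickle.loads(base64.b64decode(blob))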
Example #12
def test_set_vector(strings, data):
    orig = data.copy()
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(orig[0])
    assert list(v[strings[0]]) != list(orig[1])
    v[strings[0]] = data[1]
    assert list(v[strings[0]]) == list(orig[1])
    assert list(v[strings[0]]) != list(orig[0])
Example #13
def test_get_vector_resize(strings, data, resize_data):
    v = Vectors(data=data)
    v.resize(shape=resize_data.shape)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)

    assert list(v[strings[0]]) == list(resize_data[0])
    assert list(v[strings[0]]) != list(resize_data[1])
    assert list(v[strings[1]]) != list(resize_data[0])
    assert list(v[strings[1]]) == list(resize_data[1])
Example #14
def test_set_vector(strings, data):
    orig = data.copy()
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(orig[0])
    assert list(v[strings[0]]) != list(orig[1])
    v[strings[0]] = data[1]
    assert list(v[strings[0]]) == list(orig[1])
    assert list(v[strings[0]]) != list(orig[0])
Example #15
def get_entry(self, name):
    "Get dictionary entry. Returns the entries mapped to name's lemma. If the word doesn't exist, returns an empty list."
    name = name if name[0] == "#" else self.get_lemma(name)
    hash = hash_string(name)
    try:
        return self.array[list_index(self.index, hash)]
    except ValueError:
        try:
            return self.dict[hash]
        except KeyError:
            self.dict[hash] = []
            return self.dict[hash]
Example #16
def test_vectors_clear():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    assert v.is_full is True
    assert hash_string("A") in v
    v.clear()
    # no keys
    assert v.key2row == {}
    assert list(v) == []
    assert v.is_full is False
    assert "A" not in v
    with pytest.raises(KeyError):
        v["A"]
Example #17
    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word
 
        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n
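The threshold check in count_doc fires exactly once per key: it is only true for the document whose counts push corpus_count from below min_freq to at or above it, which is the moment the string needs to be remembered. A tiny worked example (the numbers are purely illustrative):

min_freq = 5
corpus_count, count = 6, 3     # corpus total is now 6; this document contributed 3
crossed = corpus_count >= min_freq and (corpus_count - count) < min_freq
assert crossed                 # 6 >= 5 and (6 - 3) < 5 -> remember the string now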
Example #18
    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word

        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count -
                                                  count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n
Example #19
    def count_doc(self, words):
        doc_counts = PreshCounter()
        doc_strings = {}

        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word

        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            corpus_count = self.counts[key]

            if corpus_count >= self.min_freq and (corpus_count -
                                                  count) < self.min_freq:
                self.strings[key] = doc_strings[key]

            n += count

        return n
Example #20
def test_get_vector_resize(strings, data):
    strings = [hash_string(s) for s in strings]

    # decrease vector dimension (truncate)
    v = Vectors(data=data)
    resized_dim = v.shape[1] - 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)

    assert list(v[strings[0]]) == list(data[0, :resized_dim])
    assert list(v[strings[1]]) == list(data[1, :resized_dim])

    # increase vector dimension (pad with zeros)
    v = Vectors(data=data)
    resized_dim = v.shape[1] + 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)

    assert list(v[strings[0]]) == list(data[0]) + [0]
    assert list(v[strings[1]]) == list(data[1]) + [0]
Example #21
def main():
    nlp = English(parser=False, tagger=False, entity=False)

    gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones']
    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
    pattern_ids = PreshMap()
    max_length = 0
    for pattern_str in gazetteer:
        pattern = nlp.tokenizer(pattern_str)
        bilou_tags = get_bilou(len(pattern))
        for word, tag in zip(pattern, bilou_tags):
            lexeme = nlp.vocab[word.orth]
            lexeme.set_flag(tag, True)
        pattern_ids[hash_string(pattern.text)] = True
        max_length = max(max_length, len(pattern))

    matcher = make_matcher(nlp.vocab, max_length)

    doc = nlp(example_text)
    matches = get_matches(matcher, pattern_ids, doc)
    merge_matches(doc, matches)
    for token in doc:
        print(token.text, token.ent_type_)
Example #22
def add_sentences(source):
    # Create sentences from scraped items

    for doc in scrapdb[source].find():
        for key, value in doc.items():
            if key not in field2sent:
                continue

            for text in list(value):
                try:
                    traindb['sentence'].insert({
                        '_id': 'S{}'.format(str(hash_string(text.lower()))),
                        'language': doc['language'],
                        'source': doc['isPartOf'],
                        'subject': 'cookery',
                        'text': text.lower()
                    })
                except pymongo.errors.DuplicateKeyError:
                    continue
Example #23
def contexts_by_entities(self, doc):
    """Returns a set of document ids that *might* be related to named entities in the pre-processed question"""
    ents = self.doc_entities(doc)
    buckets = [hash_string(word) % self.N for word in ents]
    return set([doc_id for slot in buckets for doc_id in self.table[slot]])
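This method is the query side of the inverted table built by bucketize in Example #3: every word hash is reduced modulo N, so a query word lands in the same bucket as every document that contained it (plus hash collisions, which is why the docstring says *might*). A minimal sketch of the idea, using the builtin hash() as a stand-in for hash_string and a hypothetical 8-bucket table:

N = 8
table = [set() for _ in range(N)]
table[hash("pasta") % N].add(0)        # indexing: document 0 contained "pasta"
candidates = table[hash("pasta") % N]  # querying: -> {0}, possibly plus collisions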
Example #24
def hash(self):
    string = self.value
    return hash_string(string)