Example #1
    def cascade(self, instance):
        """Cascade the index from this instance to others that depend on it.

        This causes index_instance() to be called on each instance that depends
        on the instance supplied.

        """
        for descriptor in self.cascades:
            cascade_inst = None
            # find the instance we're being told to cascade the reindex onto
            try:
                if callable(descriptor):
                    cascade_inst = descriptor(instance)
                elif isinstance(descriptor, str):
                    cascade_inst = getattr(instance, descriptor)
            except Exception:
                # the descriptor lookup failed; nothing to cascade to
                cascade_inst = None
            # if we found one, check if it's searchable, check if it
            # wants to accept the cascade, and if so, reindex it
            if cascade_inst:
                # If it's not an iterable already, make it into one
                if not hasattr(cascade_inst, '__iter__'):
                    cascade_insts = [cascade_inst]
                else:
                    cascade_insts = cascade_inst
                for cascade_inst in cascade_insts:
                    indexer = get_indexer(cascade_inst)
                    if indexer and indexer.reindex_on_cascade(instance, cascade_inst):
                        indexer.index_instance(cascade_inst, with_cascade=False)
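
The `cascades` attribute iterated above is declared on the indexer class and is not shown in this excerpt. As a minimal sketch (class and attribute names are hypothetical), each entry is either an attribute name or a callable, and may resolve to a single instance or an iterable of instances:

class CommentIndexer(SearchIndexer):  # hypothetical subclass
    cascades = [
        'thread',                                 # string: reindex comment.thread
        lambda comment: comment.mentioned_users,  # callable: may return an iterable
    ]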
Example #2
# get_vocab_size, train_rnn_classifier and print_evaluation are project
# helpers defined elsewhere; the CSV loaders and get_indexer come from utils
# (see Example #3).
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_indexer


def main():

    train_data = get_train_data_from_csv('data/train_15_dns.csv')
    #shuffle(train_data)
    train_data = train_data[0:50000]

    dev_data = get_dev_data_from_csv('data/dev_15_dns.csv')
    #shuffle(dev_data)
    dev_data = dev_data[0:10000]

    print('len of training data:', len(train_data))
    print('len of dev data:', len(dev_data))

    vocab_size = get_vocab_size(train_data)
    print('calculated vocab size:', vocab_size)

    indexer = get_indexer('indexer_15_dups.csv')

    model = train_rnn_classifier(train_data, vocab_size, indexer)

    print_evaluation(dev_data, model)
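
The excerpt defines main() but omits the entry point; the conventional guard would be:

if __name__ == '__main__':
    main()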
Example #3
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True

tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")

train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]

X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []
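
The excerpt ends after allocating the containers. A plausible continuation (hypothetical; it assumes each loaded example exposes .text and .label attributes, as in Example #5) tokenizes with tknr, maps tokens through word_indexer, and pads to a fixed length:

max_len = 50  # assumed maximum sequence length
for ex in train_data:
    tokens = tknr.tokenize(ex.text)
    X_train.append([word_indexer.add_and_get_index(t) for t in tokens])
    Y_train.append(ex.label)
X_train = pad_sequences(X_train, maxlen=max_len)
Y_train = to_categorical(Y_train)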
Example #4
def reindex_index(indexname, suffix):
    """Reindex a named index.
    """
    
    if not getattr(settings, 'ENABLE_SEARCHIFY', False):
        return
    
    models = _index_models.get(indexname, None)
    if models is None:
        raise KeyError("Index %r is not known" % indexname)
    try:

        # Get the index-wide settings.
        index_settings = {}
        def merge_dicts(path, a, b):
            for (k, v) in b.items():
                if k not in a:
                    a[k] = v
                    continue
                if isinstance(v, dict):
                    merge_dicts('%s.%s' % (path, k), a[k], v)
                    continue
                if a[k] == v:
                    continue
                raise ValueError("Conflicting values in index_settings (at %s)" % path[1:])
        for model in models:
            indexer = get_indexer(model)
            merge_dicts('.', index_settings, indexer.index_settings)

        created = False
        for model in models:
            print "Indexing %s to %s, using suffix %s" % (model, indexname, suffix)
            indexer = get_indexer(model)
            try:
                indexer.client.set_suffix(suffix)
                if not created:
                    #print "Creating index with settings %r" % index_settings
                    indexer.client.create_index(index_settings)
                    created = True
                indexer.apply_mapping()
                indexer.index_all(with_cascade=False)
            finally:
                indexer.client.set_suffix()
            indexer.client.flush()

        # Get the old value of the alias.
        try:
            old_index = client.get_alias(indexname)[0]
        except IndexError:
            old_index = None
        if old_index == indexname:
            # Old index wasn't an alias; we have to delete it and then set the
            # new alias for it.
            print "Warning: no alias in use, so must delete in-use index"
            old_index = None
            client.delete_index(indexname)
        print "Setting alias to make new index live"
        client.set_alias(indexname, indexname + suffix)
    except BaseException:
        # On any failure, delete the partially-built index, then re-raise.
        try:
            client.delete_index(indexname + suffix)
        except Exception:
            # Ignore any normal exceptions, so we report the original error.
            pass
        raise
    if old_index:
        print "Removing old index: %s" % old_index
        client.delete_index(old_index)
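
For reference, a hypothetical invocation (index name and suffix invented; client is assumed to be a module-level search client): the suffix lets the replacement index be built alongside the live one, and the final set_alias call switches searches over atomically.

reindex_index('documents', '_rebuild')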
Example #5
import sys
import numpy as np
from sklearn.metrics import accuracy_score
# Indexer and the CSV loaders come from utils (see Example #3);
# PerceptronClassifier is defined elsewhere in the project.
from utils import (Indexer, get_indexer, get_train_data_from_csv,
                   get_dev_data_from_csv, get_test_data_from_csv)

class FeatureExtractor:
    def __init__(self):
        self.indexer = Indexer()

    def get_indexer(self):
        return self.indexer

    def extract_features(self, ex):
        feature_vector = np.zeros(len(self.indexer))
        for word in ex.text:
            index = self.indexer.index_of(word)
            if index != -1:  # index_of returns -1 for unseen words
                feature_vector[index] += 1
        return feature_vector


filename = sys.argv[1]

indexer = get_indexer('data/indexer_' + filename)
train_set = get_train_data_from_csv('data/train_' + filename)
dev_set = get_dev_data_from_csv('data/dev_' + filename)
test_set = get_test_data_from_csv('data/test_' + filename)

p = PerceptronClassifier(indexer, FeatureExtractor())
p.train(train_set)

y_pred = []
y_true = []
for ex in dev_set:
    y_true.append(ex.label)
    y_pred.append(p.predict(ex))

print("Dev Set Results: ")
print("Accuracy: ", accuracy_score(y_true, y_pred))