Example #1
import string

import markovify

from text_model import TextModel

# `models_collection` and `todos_collection` are assumed to be Firestore
# collection references created elsewhere in the app, e.g.
# `db.collection("models")` and `db.collection("todos")` (see Example #2).


def update_model():
    model_ref = models_collection.document("model")
    if not model_ref.get().exists:
        initial_model = models_collection.document(
            "initial_model").get().to_dict()
        model_ref.set({"json_string": initial_model["json_string"]})

    todos = todos_collection.stream()
    to_add = []

    for t in todos:
        t_id = t.id
        text = t.to_dict()["text"]

        # Add some punctuation if not present to help separate different TODOs.
        if text and text[-1] not in string.punctuation:
            text += "."

        to_add.append(text)
        todos_collection.document(t_id).delete()

    if to_add:
        to_merge = TextModel("\t".join(to_add))

        serialized_model = model_ref.get().to_dict()["json_string"]
        old_model = TextModel.from_json(serialized_model)

        new_model = markovify.combine([old_model, to_merge])
        model_ref.set({"json_string": new_model.to_json()})
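
A possible variant, not part of the original example: the per-document deletes above could be folded into a single Firestore batched write (the `db` client is assumed to be created as in Example #2; a batch is capped at 500 operations).

batch = db.batch()
for t in todos_collection.stream():
    batch.delete(todos_collection.document(t.id))
batch.commit()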
Example #2
import firebase_admin
import markovify
from firebase_admin import credentials, firestore

from text_model import TextModel

creds = credentials.Certificate("./firestore-key.json")
firebase_admin.initialize_app(creds)
db = firestore.client()

with open("./issue_titles.txt") as f:
    text = f.read()

model = TextModel(text)
db.collection("models").document("initial_model").set({"json_string": model.to_json()})
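
A hedged companion sketch, not part of the original script: read the model back out of Firestore and sample from it. TextModel.from_json is the same deserialization hook used in Example #1, and make_sentence is standard markovify API.

import firebase_admin
from firebase_admin import credentials, firestore

from text_model import TextModel

creds = credentials.Certificate("./firestore-key.json")
firebase_admin.initialize_app(creds)
db = firestore.client()

# fetch the serialized chain and rebuild the model
doc = db.collection("models").document("initial_model").get()
model = TextModel.from_json(doc.to_dict()["json_string"])

# make_sentence() returns None when it cannot assemble a sentence that passes
# markovify's novelty checks, so guard before printing
sentence = model.make_sentence(tries=50)
print(sentence or "no sentence generated")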
Example #3
def update_model(todo_text=None):
    # Guard against the default: TextModel(None) would fail inside markovify.
    if not todo_text:
        return
    to_combine = TextModel(todo_text)
    session_state.model = markovify.combine([session_state.model, to_combine])
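
For context, a minimal standalone sketch of markovify.combine (standard markovify API; the corpora are invented for illustration): the optional second argument weights each model's transition counts, biasing generation toward the heavier model.

import markovify

from text_model import TextModel

base = TextModel("Fix the build. Ship the release. Fix the tests.")
extra = TextModel("Write the docs. Review the docs. Fix the docs.")

# weight the second model 2x relative to the first
merged = markovify.combine([base, extra], [1.0, 2.0])
print(merged.make_sentence(tries=50) or "corpus too small for a novel sentence")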
Example #4
def finished_tasks(col):
    col.write("### My Finished Tasks")
    render_tasks_and_buttons(
        column=col,
        tasks=session_state.my_finished_tasks,
        button_label="Remove",
        button_action=remove_finished_task,
    )


placeholder = st.empty()
if not session_state.model:
    placeholder.warning("Initializing. Please wait...")
    with open("./issue_titles.txt") as f:
        text = f.read()
    session_state.model = TextModel(text)

placeholder.empty()


st.write("# TODOs and Stuff")

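# Note: st.beta_form is the pre-1.0 Streamlit forms API; in current releases
# this is st.form, paired with an explicit st.form_submit_button.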
with st.beta_form(submit_label="Submit", key="submit_form"):
    input_placeholder = st.empty()
    todo_text = input_placeholder.text_input(
        "Add a TODO!", key=session_state.input_key
    ).strip()
    share_me = st.checkbox("Help improve our TODO suggestions?", value=True)

    if todo_text:
        session_state.input_key += 1
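        # Plausible continuation of the `if todo_text:` branch (the original
        # excerpt cuts off here; update_model is the helper from Example #3,
        # and make_sentence is standard markovify API):
        if share_me:
            update_model(todo_text)
        suggestion = session_state.model.make_sentence(tries=100)
        if suggestion:
            st.write(f"Suggested TODO: {suggestion}")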
Example #5
        slice_s1 = slice_s1.permute(3, 2, 1, 0)
        q_out = q.gather(2, slice_s1).squeeze(2)

        slice_s2 = S2.long().expand(1, config.l_q, q.size(0))
        slice_s2 = slice_s2.permute(2, 1, 0)
        q_out = q_out.gather(2, slice_s2).squeeze(2)

        logits = self.fc(q_out)
        return logits, self.sm(logits), v, r, heatmap

if __name__ == '__main__':
    # create object embedding model
    objects = 10
    obj_embed = 7
    object_model = LookupModel(objects, embed_dim=obj_embed)

    # text LSTM model
    attn_kernel = 3
    attn_in = obj_embed
    attn_out = 1 # no. of heatmap channels
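    # the LSTM output is reshaped into one attn_kernel x attn_kernel kernel
    # per (input, heatmap) channel pair: 3*3*7*1 = 63 values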
    lstm_out = (attn_kernel**2) * attn_in * attn_out
    vocab_size = 300
    instruction_model = TextModel(vocab_size, ninp=15,
                                  nhid=30, nlayers=1,
                                  out_dim=lstm_out)
    heatmap_model = AttentionHeatmap(instruction_model,
                                     attn_kernel,
                                     attn_in,
                                     attn_out)

Example #6
import logging
import os
import sys

# `config`, `TextDataLoader`, and `TextFeatureExtractor` are assumed to be
# project-local modules imported alongside TextModel in the full file.
from text_model import TextModel


# load configuration
default_config_folder = os.path.join(os.path.abspath(os.path.dirname(os.path.dirname(__file__))), 'configs')
default_config_file = os.path.join(default_config_folder, 'text_classification_inference.ini')
cfg = config.load_config(sys.argv[1] if len(sys.argv) > 1 else default_config_file)


logging.info('---------------------------------------------------')
logging.info('-------- Classification module (inference) --------')
logging.info('---------------------------------------------------')


# ---------------------------- Load model --------------------------- #
classification_model = TextModel(cfg)

logging.info('Loading trained classification model...')
name, model, feature, transformer, embeddings, label_encoder = classification_model.load()


# ---------------------------- Load data ---------------------------- #
data_loader = TextDataLoader(cfg, label_encoder)

logging.info('Loading data to be classified...')
inference_data, invalid_id = data_loader.load_classification_data('inference')


# ----------------------- Extract features -------------------------- #
feature_extractor = TextFeatureExtractor(cfg, embeddings=embeddings)
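# The excerpt stops here. A hypothetical continuation under the names loaded
# above (assumes the stored classifier follows the scikit-learn estimator API
# and that label_encoder is an sklearn LabelEncoder):
inference_features = feature_extractor.get_tfidf_features(inference_data, transformer)
predictions = label_encoder.inverse_transform(model.predict(inference_features))
logging.info('Classified %d records', len(predictions))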
Example #7

    test_tfidf_ngram = feature_extractor.get_tfidf_features(
        test_x, trans_tfidf_ngram)
    test_tfidf_char = feature_extractor.get_tfidf_features(
        test_x, trans_tfidf_char)

# word embeddings
if cfg.features.word_embeddings:
    logging.info('Computing word embeddings...')

    # compute transformers to embedding space
    trans_emb = feature_extractor.get_embedding_transformer()
    train_emb = feature_extractor.get_embedding_features(train_x, trans_emb)
    valid_emb = feature_extractor.get_embedding_features(valid_x, trans_emb)
    test_emb = feature_extractor.get_embedding_features(test_x, trans_emb)

# -------------------- Train classifiers ---------------------------- #
classification_model = TextModel(cfg, data_loader.df)
models = dict()

if cfg.model.train_nb:
    logging.info('Training Naive Bayes classifier...')

    # train Naive Bayes on Bag-of-Words features
    if cfg.features.bow:
        models["NB_BOW"] = [
            classification_model.train('NB', train_bow, train_y), 'bow'
        ]

    # train Naive Bayes on TF-IDF features of different levels (word, ngram, and char)
    if cfg.features.tfidf:
        models["NB_TF-IDF_WORD"] = [
            classification_model.train('NB', train_tfidf_word, train_y),
Example #8
    # `stdtime` is assumed to be an alias for time.time imported earlier in
    # the full file (e.g. `from time import time as stdtime`).
    def compress(self, text):
        assert self.word2code is not None
        begin = stdtime()
        out = []
        for c in text:
            out.append(self.word2code[c])

        out = ''.join(out)
        compress_rate = len(out) / len(text)
        time = stdtime() - begin
        print('Compress time: {:.2f}s, compression rate: {}'.format(
            time, compress_rate))
        return out

    def fit_compress(self, text):
        self.fit_text(text)
        return self.compress(text)


if __name__ == '__main__':
    from text_model import TextModel
    model = TextModel(['a', 'b', 'c', 'd'], [1, 5, 3, 7])
    print(model.entropy())
    c1 = Compressor()
    c2 = Compressor()
    c1.fit_textmodel(model)
    text = model.random_text(1000)
    out1 = c1.compress(text)
    print(c1.word2code)
    out2 = c2.fit_compress(text)
    print(c2.word2code)
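
As a sanity check on the printed numbers (assuming word2code maps each symbol to a binary prefix code, so the compression rate is an average code length in bits per symbol), the source-coding theorem bounds that rate from below by the model entropy:

import math

weights = [1, 5, 3, 7]  # the weights passed to TextModel above
total = sum(weights)
entropy = -sum(w / total * math.log2(w / total) for w in weights)
print(entropy)  # both compression rates should land at or above this value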
Example #9
# ----------------------- Reduce dimensionality ----------------------- #
if cfg.features.lsa:
    logging.info('Reducing dimensionality...')

    # Bag-of-Words
    if cfg.features.bow:
        data_bow_lsa = feature_extractor.lsa(data_bow)

    # TF-IDF
    if cfg.features.tfidf:
        data_tfidf_word_lsa = feature_extractor.lsa(data_tfidf_word)
        data_tfidf_ngram_lsa = feature_extractor.lsa(data_tfidf_ngram)
        data_tfidf_char_lsa = feature_extractor.lsa(data_tfidf_char)

# ----------------------- Compute topics (LDA) ------------------------ #
clustering_model = TextModel(cfg, data_loader.df)

if cfg.model.lda:
    logging.info('Latent Dirichlet Allocation...')

    # compute LDA on Bag-of-Words features
    if cfg.features.bow:
        _, data_bow_lda = clustering_model.lda(data_bow)

    # compute LDA on TF-IDF features (words and n-grams only)
    if cfg.features.tfidf:
        _, data_tfidf_word_lda = clustering_model.lda(data_tfidf_word)
        _, data_tfidf_ngram_lda = clustering_model.lda(data_tfidf_ngram)

# ------------------------- Compute clusters ------------------------- #
models = dict()