Example #1
def get_skipgram(tweets, nIn, kIn):
    # Tokenization and preprocessing (if not already done) must happen here: when the
    # analyzer receives a callable, the vectorizer performs no tokenization itself
    # (see the scikit-learn documentation).
    tweet_tokenized = []
    for t in tweets:
        tweet_tokenized.append(nlp.tokenize(t))
    skipper = functools.partial(skipgrams, n=nIn, k=kIn)
    vectorizer = TfidfVectorizer(
        analyzer=skipper,
        #stop_words=nlp.stopwords,  # We do better when we keep stopwords
        use_idf=True,
        smooth_idf=False,
        norm=None,  # No normalization is applied (None disables the l2 norm)
        decode_error='replace',
        max_features=10000,
        min_df=2,
        max_df=0.501)
    # for t in cleaned_tweets:
    #     tweetTokens = word_tokenize(t)
    #     skipgram_feature_matrix.append(list(skipper(tweetTokens)))

    # Fit the text into the vectorizer.
    logger.info("\tgenerating skip-gram vectors, n={}, k={}, {}".format(
        nIn, kIn, datetime.datetime.now()))
    tfidf = vectorizer.fit_transform(tweet_tokenized).toarray()
    logger.info("\t\t complete, dim={}, {}".format(tfidf.shape,
                                                   datetime.datetime.now()))
    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    return tfidf, vocab
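A minimal, self-contained sketch of the same idea, assuming nltk and scikit-learn are available: with a callable analyzer, TfidfVectorizer accepts pre-tokenized documents and builds its vocabulary from whatever items the callable yields (here, skip-gram tuples).

import functools
from nltk.util import skipgrams
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy pre-tokenized "tweets"; a real run would use nlp.tokenize as above.
docs = [["the", "cat", "sat", "on", "the", "mat"],
        ["the", "dog", "sat", "on", "the", "rug"]]

skipper = functools.partial(skipgrams, n=2, k=1)  # bigrams allowing 1 skipped token
vectorizer = TfidfVectorizer(analyzer=skipper, use_idf=True, norm=None)
tfidf = vectorizer.fit_transform(docs).toarray()
print(tfidf.shape)  # (2, number of distinct skip-gram features)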
def select_input_words(sent: str):
    orig_toks = nlp.tokenize(sent, 2)  # keep the original tokens
    norm_toks = nlp.tokenize(sent, 1)  # lemmatized tokens
    pos_tags = nlp.get_pos_tags(orig_toks)

    selected = set()
    for i in range(0, len(pos_tags)):
        word = orig_toks[i].lower()
        if word in nlp.stopwords or len(word) < 2:
            continue
        norm = norm_toks[i]

        tag = pos_tags[i]
        if tag in ["NN", "NNS", "NNP", "NNPS"]:
            selected.add(norm)

    return selected
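A roughly equivalent noun selection sketched with plain NLTK (a stand-in for the project's nlp helpers; requires the punkt and averaged_perceptron_tagger resources).

from nltk import word_tokenize, pos_tag

sent = "The striker scored two late goals in the derby"
tokens = word_tokenize(sent)
selected = {tok.lower() for tok, tag in pos_tag(tokens)
            if tag in ("NN", "NNS", "NNP", "NNPS") and len(tok) > 1}
print(selected)  # e.g. {'striker', 'goals', 'derby'}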
Example #3
def text_to_vector_gensim(text,
                          model,
                          text_length,
                          dim,
                          text_norm_option,
                          word_weigts: list = None):
    """
    Given a string, normalizes it, then splits it into words and finally converts
    it to a sequence of word vectors.
    """
    text = nlp.normalize(text)
    words = nlp.tokenize(text, text_norm_option)
    window = words[-text_length:]

    x = np.zeros((text_length, dim))

    # list of word indexes in the embedding model to be randomly chosen for OOV words
    random_candidates = []
    # words already found in the embedding model whose vectors have already been used
    words_matched = set()

    for i, word in enumerate(window):
        weight = get_word_weight(word_weigts, word)
        is_in_model = False
        if word in model.wv.vocab.keys():
            is_in_model = True
            vec = model.wv[word]
            vec = vec * weight
            x[i, :] = vec
            words_matched.add(word)

        if not is_in_model:
            if word in GLOBAL_embedding_randomized_vectors.keys():
                vec = GLOBAL_embedding_randomized_vectors[word]
            else:
                if len(GLOBAL_embedding_vocab_indexes) == 0:
                    GLOBAL_embedding_vocab_indexes.extend(
                        range(len(model.wv.vocab)))
                    random.Random(4).shuffle(GLOBAL_embedding_vocab_indexes)

                while True:
                    index = GLOBAL_embedding_vocab_indexes.pop()
                    word = model.wv.index2word[index]
                    if word not in words_matched:
                        words_matched.add(word)
                        break

                vec = model.wv[word]
                GLOBAL_embedding_randomized_vectors[word] = vec

            vec = vec * weight
            x[i, :] = vec
    return x
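The out-of-vocabulary handling above can be illustrated in isolation; the toy dictionary below stands in for a real embedding model.

import random
import numpy as np

# Standalone sketch of the OOV strategy above: every unseen word is mapped to the
# vector of a reproducibly shuffled, not-yet-used vocabulary entry.
toy_vectors = {i: np.random.rand(3) for i in range(5)}   # stand-in embedding model
unused_indexes = list(range(len(toy_vectors)))
random.Random(4).shuffle(unused_indexes)                  # same fixed seed as above
assigned = {}

def oov_vector(word):
    if word not in assigned:
        assigned[word] = toy_vectors[unused_indexes.pop()]
    return assigned[word]

print(oov_vector("zzz"))                       # a borrowed vector, stable across calls
print(oov_vector("zzz") is oov_vector("zzz"))  # True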
Example #4
def extract_words(line: str):
    line = str(line).replace("LETTERNUMBER", "").replace("NUMBER", "")
    norm_toks = nlp.tokenize(line, 1)

    words = []
    for nt in norm_toks:
        word = nt.lower()
        if word in nlp.stopwords or len(word) < 3:
            continue

        words.append(word)
    return words
def find_word_matches(dictionary, target_text, text_normalization_option):
    target_text = nlp.normalize_tweet(target_text)
    norm_toks = set(nlp.tokenize(target_text, text_normalization_option))

    scoresum = 0
    matchsum = 0
    matchmax = 0
    matchbool = 0
    for w, score in dictionary.items():
        score = float(score)
        if w in norm_toks:
            matchbool = 1
            matchsum += 1
            scoresum += score
            if matchmax < score:
                matchmax = score

    return scoresum, matchsum, matchmax, matchbool
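Hypothetical usage (the nlp module comes from the surrounding project): given a {word: score} lexicon, the function returns the score sum, the match count, the best matching score and a boolean match flag for one text.

# Hypothetical lexicon; any {word: score} dictionary works.
lexicon = {"great": 0.9, "awful": 0.8, "match": 0.3}
scoresum, matchsum, matchmax, matchbool = find_word_matches(
    lexicon, "What a great match today!", 1)
print(scoresum, matchsum, matchmax, matchbool)  # e.g. 1.2 2 0.9 1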
Example #6
def text_to_vector_fasttext(text,
                            ft_model,
                            text_length,
                            dim,
                            text_norm_option,
                            word_weigts: list = None):
    """
    Given a string, normalizes it, then splits it into words and finally converts
    it to a sequence of word vectors.
    """
    text = nlp.normalize(text)
    words = nlp.tokenize(text, text_norm_option)
    window = words[-text_length:]

    x = np.zeros((text_length, dim))

    for i, word in enumerate(window):
        vec = ft_model.get_word_vector(word).astype('float32')
        weight = get_word_weight(word_weigts, word)
        vec = vec * weight
        x[i, :] = vec

    return x
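Hypothetical usage with a placeholder model path (nlp and get_word_weight come from the surrounding project); any pretrained fastText .bin model can be queried word by word via get_word_vector.

import fasttext

ft = fasttext.load_model("cc.en.300.bin")  # placeholder path to a pretrained .bin model
matrix = text_to_vector_fasttext("What a great match today!", ft,
                                 text_length=20, dim=ft.get_dimension(),
                                 text_norm_option=0)
print(matrix.shape)  # (20, 300)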
Example #7
def fit_fasttext_holdout(df: DataFrame, split_at_row: int, class_col: int,
                         outfolder: str, task: str, text_norm_option: int,
                         text_input_info: dict, embedding_file: str):
    # X, y, embedding_file, nfold, outfolder: str, task: str):

    encoder = LabelBinarizer()
    y = df[:, class_col]
    print("\ttotal y rows=" + str(len(y)) + " with unique values=" +
          str(len(set(y))))
    print("\tencoding y labels..." + str(datetime.datetime.now()))

    if len(set(y)) > 2:
        y_int = encoder.fit_transform(y)
    else:
        y_int = np.array([[1, 0] if l.strip() == 'CG' else [0, 1] for l in y])

    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index
        # print(l+","+str(index))

    X = []
    text_length = 0
    index = 0
    for row in df:
        text = ""
        for b in range(len(text_input_info)):
            info = text_input_info[b]
            t = concate_text(row, info["text_col"])
            t = nlp.normalize(t)
            text_length += int(info["text_length"])
            text += t + " "
        words = nlp.tokenize(text, text_norm_option)
        text = " ".join(words).strip()
        X.append([text])
        index += 1
    X = numpy.asarray(X, dtype=str)

    # perform a holdout split (we can't use scikit-learn's cross-validation wrapper
    # because the Keras functional API is used elsewhere)

    X_train_ = X[0:split_at_row]
    y_train_ = y[0:split_at_row]
    X_test_ = X[split_at_row:]
    y_test_ = y[split_at_row:]

    # prepare fasttext data
    fasttext_train = outfolder + "/fasttext_train.tsv"
    with open(fasttext_train, mode='w') as outfile:
        csvwriter = csv.writer(outfile,
                               delimiter='\t',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        for i in range(len(X_train_)):
            label = y_train_[i]
            text = X_train_[i][0]
            csvwriter.writerow(["__label__" + label.replace(" ", "|"), text])

        # fasttext_test = outfolder + "/fasttext_test.tsv"
        # with open(fasttext_test, mode='w') as outfile:
        #     csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        #     for i in range(len(X_test_)):
        #         label = y_test_[i]
        #         text = X_test_[i][0]
        #         csvwriter.writerow(["__label__" + label, text])

        # -dim 300 -minn 4 -maxn 10 -wordNgrams 3 -neg 10 -loss ns -epoch 3000 -thread 30
    if embedding_file is not None and embedding_file.lower() != 'none':
        model = fasttext.train_supervised(input=fasttext_train,
                                          minn=4,
                                          maxn=10,
                                          wordNgrams=3,
                                          neg=10,
                                          loss='ns',
                                          epoch=3000,
                                          thread=30,
                                          dim=dmc.DNN_EMBEDDING_DIM,
                                          pretrainedVectors=embedding_file)
    else:
        model = fasttext.train_supervised(input=fasttext_train,
                                          minn=4,
                                          maxn=10,
                                          wordNgrams=3,
                                          neg=10,
                                          loss='ns',
                                          epoch=3000,
                                          thread=30,
                                          dim=dmc.DNN_EMBEDDING_DIM)
    # evaluate the model

    X_test_as_list = []
    for row in X_test_:
        X_test_as_list.append(row[0])
    predictions = model.predict(X_test_as_list)[0]

    predicted_labels = []
    for i in predictions:
        label = i[0]
        l = label[9:]            # strip the "__label__" prefix
        l = l.replace("|", " ")  # restore spaces that were encoded as "|"
        predicted_labels.append(y_label_lookup_inverse[l])

    util.save_scores(predicted_labels, y_int[split_at_row:, :].argmax(1),
                     "dnn", task, "_fasttext_", 3, outfolder)
Example #8
def fit_fasttext(df: DataFrame, nfold: int, class_col: int, outfolder: str,
                 task: str, text_norm_option: int, text_input_info: dict,
                 embedding_file: str):
    # X, y, embedding_file, nfold, outfolder: str, task: str):
    print("\t running fasttext using embedding file=" + str(embedding_file))
    encoder = LabelBinarizer()
    y = df[:, class_col]

    y_int = encoder.fit_transform(y)
    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index
        # print(l+","+str(index))

    X = []
    text_length = 0
    index = 0
    for row in df:
        text = ""
        for b in range(len(text_input_info)):
            info = text_input_info[b]
            text += concate_text(row, info["text_col"]) + " "
            text_length += int(info["text_length"])
        text = nlp.normalize(text)
        words = nlp.tokenize(text, text_norm_option)
        text = " ".join(words).strip()
        X.append([text])
        index += 1
    X = numpy.asarray(X, dtype=str)

    # perform n-fold validation (we can't use scikit-learn's cross-validation wrapper
    # because the Keras functional API is used elsewhere)
    kfold = StratifiedKFold(n_splits=nfold,
                            shuffle=True,
                            random_state=cl.RANDOM_STATE)
    splits = list(enumerate(kfold.split(X, y_int.argmax(1))))

    nfold_predictions = dict()
    for k in range(0, len(splits)):
        print("\tnfold=" + str(k))

        # Fit the model; each entry of splits is (fold_number, (train_index, test_index))
        X_train_index = splits[k][1][0]
        X_test_index = splits[k][1][1]

        X_train_ = X[X_train_index]
        y_train_ = y[X_train_index]
        X_test_ = X[X_test_index]
        y_test_ = y[X_test_index]

        # prepare fasttext data
        fasttext_train = outfolder + "/fasttext_train.tsv"
        with open(fasttext_train, mode='w') as outfile:
            csvwriter = csv.writer(outfile,
                                   delimiter='\t',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            for i in range(len(X_train_)):
                label = y_train_[i]
                text = X_train_[i][0]
                csvwriter.writerow(
                    ["__label__" + label.replace(" ", "|"), text])

        # fasttext_test = outfolder + "/fasttext_test.tsv"
        # with open(fasttext_test, mode='w') as outfile:
        #     csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        #     for i in range(len(X_test_)):
        #         label = y_test_[i]
        #         text = X_test_[i][0]
        #         csvwriter.writerow(["__label__" + label, text])

        # -dim 300 -minn 4 -maxn 10 -wordNgrams 3 -neg 10 -loss ns -epoch 3000 -thread 30
        if embedding_file is not None:
            model = fasttext.train_supervised(input=fasttext_train,
                                              minn=4,
                                              maxn=10,
                                              wordNgrams=3,
                                              neg=10,
                                              loss='ns',
                                              epoch=3000,
                                              thread=30,
                                              dim=dmc.DNN_EMBEDDING_DIM,
                                              pretrainedVectors=embedding_file)
        else:
            model = fasttext.train_supervised(input=fasttext_train,
                                              minn=4,
                                              maxn=10,
                                              wordNgrams=3,
                                              neg=10,
                                              loss='ns',
                                              epoch=3000,
                                              thread=30,
                                              dim=dmc.DNN_EMBEDDING_DIM)

        # evaluate the model
        X_test_as_list = []
        for row in X_test_:
            X_test_as_list.append(row[0])
        predictions = model.predict(X_test_as_list)[0]

        for i in range(len(X_test_index)):
            index = X_test_index[i]
            label = predictions[i][0]
            l = label[9:]            # strip the "__label__" prefix
            l = l.replace("|", " ")  # restore spaces that were encoded as "|"
            nfold_predictions[index] = y_label_lookup_inverse[l]

    indexes = sorted(list(nfold_predictions.keys()))
    predicted_labels = []
    for i in indexes:
        predicted_labels.append(nfold_predictions[i])

    util.save_scores(predicted_labels, y_int.argmax(1), "dnn", task,
                     "_fasttext_", 3, outfolder)
Example #9
def extract_dict(label_to_proftext: dict):
    # frequency based score
    label_vocab_to_totalfreq = dict()
    vocab_overall_frequency = dict()

    label_to_nouns = dict()
    label_to_verbs = dict()

    for label, texts in label_to_proftext.items():
        print(label + "," + str(len(texts)))
        vocab_score = dict()

        # identify verbs and nouns for this label
        nouns = set()
        verbs = set()
        for t in texts:
            #count+=1
            #print(count)
            orig_toks = nlp.tokenize(t, 2)
            stem_toks = nlp.tokenize(t, text_normalization_option)
            pos_tags = nlp.get_pos_tags(orig_toks)
            for i in range(0, len(pos_tags)):
                word = orig_toks[i].lower()
                if word in nlp.stopwords or len(word) < 2:
                    continue
                stem = stem_toks[i]
                if stem in vocab_score.keys():
                    vocab_score[stem] += 1
                else:
                    vocab_score[stem] = 1

                tag = pos_tags[i]
                if tag in ["NN", "NNS", "NNP", "NNPS"]:
                    nouns.add(stem_toks[i])
                elif tag in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
                    verbs.add(stem_toks[i])
            label_to_nouns[label] = nouns
            label_to_verbs[label] = verbs

        label_vocab_to_totalfreq[label] = vocab_score
        for e, frequency in vocab_score.items():
            if frequency == 0:
                continue
            if e in vocab_overall_frequency.keys():
                vocab_overall_frequency[e] += frequency
            else:
                vocab_overall_frequency[e] = frequency

    # calculate weighted score
    label_vocab_to_weightedscore = dict()
    for label, vocab_freq in label_vocab_to_totalfreq.items():
        vocab_score = dict()
        for e, frequency in vocab_freq.items():
            if e not in vocab_overall_frequency.keys():
                continue
            totalfreq = vocab_overall_frequency[e]
            s = frequency / totalfreq
            if s == 1.0:
                continue
            vocab_score[e] = s
        label_vocab_to_weightedscore[label] = vocab_score

    return label_vocab_to_totalfreq, label_vocab_to_weightedscore, label_to_nouns, label_to_verbs
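A toy illustration of the weighted score above: a term's score for a label is its frequency within that label divided by its frequency across all labels, and terms whose entire frequency comes from one label (score 1.0) are skipped.

label_freq = {"sport": {"goal": 8, "match": 5}, "politics": {"match": 3, "vote": 6}}
overall = {"goal": 8, "match": 8, "vote": 6}

weighted = {}
for label, freqs in label_freq.items():
    weighted[label] = {w: f / overall[w] for w, f in freqs.items()
                       if f / overall[w] != 1.0}
print(weighted)  # {'sport': {'match': 0.625}, 'politics': {'match': 0.375}}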