# CountVectorizer and TfidfTransformer come from scikit-learn;
# TextPreprocessor is assumed to be a project-specific class available here.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_tf_idf_model(citations=None):
    # Preprocess the full citation corpus when no preprocessed citations are given.
    if citations is None:
        citations = TextPreprocessor()
        citations.preprocess()

    # One document per citation: title followed by abstract.
    documents = [
        citation['title'] + ' \n' + citation['abstract']
        for citation in list(citations.values())
    ]

    # Count unigrams and bigrams, then reweight the counts with tf-idf.
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigrams = bigram_vectorizer.fit_transform(documents)

    tfidf = TfidfTransformer().fit_transform(bigrams)

    return citations, bigram_vectorizer, tfidf
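
# Minimal usage sketch (not part of the original example): with the objects
# returned by get_tf_idf_model(), pairwise cosine similarities between the
# citation documents can be computed straight from the tf-idf matrix.
from sklearn.metrics.pairwise import cosine_similarity

citations, vectorizer, tfidf = get_tf_idf_model()
similarities = cosine_similarity(tfidf)             # (n_docs, n_docs) similarity matrix
top_matches = similarities[0].argsort()[::-1][1:6]  # 5 documents closest to document 0
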
from collections import Counter


# Reconstructed signature: the call below passes only the token sequences,
# so max_words is given a permissive default here (None keeps every word).
def build_vocabulary(sequences, max_words=None):
    # Flatten the token sequences into a single list of words.
    words = []
    for token_sequence in sequences:
        words.extend(token_sequence)

    # Keep the max_words most frequent words, ordered by frequency.
    word_counts = dict(Counter(words).most_common(max_words))

    most_common_words = list(word_counts.keys())
    word_ids = list(range(len(most_common_words)))

    # Map each retained word to a unique integer id.
    vocabulary = dict(zip(most_common_words, word_ids))
    return vocabulary
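
# Quick illustrative check (not from the original example): the most frequent
# word receives the lowest id because Counter.most_common sorts by frequency.
toy_tokens = [['rete', 'lenta'], ['rete', 'down'], ['rete', 'lenta', 'oggi']]
print(build_vocabulary(toy_tokens))  # e.g. {'rete': 0, 'lenta': 1, 'down': 2, 'oggi': 3}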


import numpy as np

# The preprocessing classes below are assumed to be project-specific imports.
# Read the raw QIT tickets (Italian text), one per line.
sentences = np.genfromtxt('./tickets_QIT.txt', dtype=str, delimiter='\n')

# Chain the preprocessing decorators: clean the QIT e-mail bodies,
# then tokenize the Italian text.
prep = TextPreprocessor(sentences)
prep = QITEmailBodyCleaner(prep)
prep = Tokenizer(prep, language='italian')
tokens = prep.preprocess()
vocabulary = build_vocabulary(tokens)

# Reserve the next free id for out-of-vocabulary tokens.
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
prep = WordContextPairsGenerator(prep, window_length=2)

# Each pair is (target word id, context word id) produced with window_length=2.
word_context_pairs = prep.preprocess()
target_words = [tw for (tw, cw) in word_context_pairs]
context_words = [cw for (tw, cw) in word_context_pairs]

# Persist the encoded pairs for later training.
np.savetxt('target_words.txt', target_words, fmt='%d')
np.savetxt('context_words.txt', context_words, fmt='%d')
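
# Illustrative follow-up (not from the original example): the saved id pairs
# can be read back and decoded through the inverse vocabulary, e.g. to
# spot-check that target and context words really co-occur in the tickets.
id_to_word = {word_id: word for word, word_id in vocabulary.items()}
id_to_word[unknown_token_id] = '<UNK>'

targets = np.loadtxt('target_words.txt', dtype=int)
contexts = np.loadtxt('context_words.txt', dtype=int)
for t, c in list(zip(targets, contexts))[:5]:
    print(id_to_word[int(t)], '->', id_to_word[int(c)])
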
Example #3

import ast
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# 'prep', 'language' and 'max_length' are assumed to be defined earlier
# in the source file this example was taken from.
prep = Tokenizer(prep, language)

# Load vocabulary (a dict literal; ast.literal_eval stands in for eval here)
with open('vocabulary_wikipedia', 'r') as vocabulary_file:
    vocabulary = ast.literal_eval(vocabulary_file.read())

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)

# Add padding decorator
padding_token_id = max(vocabulary.values()) + 2
prep = Padder(prep, padding_token_id, max_length)

# Get final tokens
final_tokens = prep.preprocess()

# Load labels
labels = np.genfromtxt('../upsampled/y_QIT.txt', delimiter='\n',
                       dtype=str).reshape((-1, 1))

# Convert labels into one-hot dummies
# (on scikit-learn >= 1.2 the argument is named sparse_output instead of sparse)
enc = OneHotEncoder(sparse=False)
one_hot_labels = enc.fit_transform(labels)

# Split dataset into training and test data
x_train, x_test, y_train, y_test = train_test_split(final_tokens,
                                                    one_hot_labels,
                                                    test_size=0.3,
                                                    stratify=labels)
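
# One possible way to consume the split (the original training step is not
# shown, so this is only a sketch assuming TensorFlow/Keras is available):
# a small embedding-plus-pooling classifier over the padded token ids.
import tensorflow as tf

vocab_size = padding_token_id + 1        # token ids run from 0 to padding_token_id
num_classes = one_hot_labels.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(np.array(x_train), y_train,
          validation_data=(np.array(x_test), y_test), epochs=5)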