def test_tfidf_vectorizer():
    '''
    Stop words are words that are useless for classification: they usually have a high
    term frequency (TF) but a very low IDF, so they do not help separate the classes.
    To save space and computation we register them as stop words and tell the machine
    not to compute them.

    TfidfVectorizer
        stop_words     list of stop words
        token_pattern  filtering rule (a regular expression)
    After fit_transform:
        vocabulary_    the vocabulary (a dict)
        idf_           the learned IDF values
        stop_words_    terms that were effectively dropped
    :return:
    '''
    tfidf_vec = TfidfVectorizer()
    print(tfidf_vec)
    documents = [
        'this is the bayes document',
        'this is the second document',
        'and the third one',
        'is this the document'
    ]
    tfidf_matrix = tfidf_vec.fit_transform(documents)
    print(tfidf_vec.get_feature_names())
    print(tfidf_vec.get_stop_words())
    print(tfidf_vec.get_params())
    print(tfidf_vec.vocabulary_)
    print(tfidf_matrix.toarray())
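# get_stop_words() returns the *configured* stop-word list, while the fitted
# stop_words_ attribute holds terms pruned during fit by max_df / min_df /
# max_features -- the two are easy to confuse. A minimal sketch on a throwaway
# corpus showing the difference:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    'this is the bayes document',
    'this is the second document',
    'and the third one',
    'is this the document',
]

# 'english' switches on the built-in English stop-word list; max_df=0.5
# additionally prunes terms that occur in more than half of the documents.
vec = TfidfVectorizer(stop_words='english', max_df=0.5)
vec.fit(docs)

print(vec.get_stop_words())  # the configured list: the built-in English frozenset
print(vec.stop_words_)       # terms pruned while fitting (here, 'document' via max_df)
print(vec.vocabulary_)       # surviving terms mapped to column indices
print(vec.idf_)              # one IDF value per surviving term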
def calculate_wight(entities, text):
    weight = dict()
    # print(entities)
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 10))
    tfidf_mat = vectorizer.fit_transform([text])
    stop_word = vectorizer.get_stop_words()
    # entity_type = extract_ne_type(entities, text, stop_word)
    # print(entity_type)
    for word, w8 in zip(vectorizer.get_feature_names(),
                        tfidf_mat.toarray().tolist()[0]):
        weight[word] = w8
    spcl_c = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
        '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~'
    ]
    tpl_list = set()
    for word in entities:
        if word.lower() not in stop_word:
            ss = word.lower()
            for ch in spcl_c:
                ss = ss.replace(ch, ' ')
            ss = ' '.join([
                w.strip() for w in ss.split(' ')
                if len(w.strip()) > 1 and w.strip() not in stop_word
            ])
            try:
                tpl_list.add((word.lower(), weight[ss]))  # , entity_type[word]
            except KeyError:
                continue
    return tpl_list
def describeTfid(): tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 2)) in_text = [ "Tommy is a boy", "Mary is Tommy sister", "Maggie is both their friend" ] text_transformed = tfidf_vectorizer.fit_transform( in_text) # This is actually # tfidf_vectorizer.fit(in_text) # text_transformed = tfidf_vectorizer.transform(in_text) feature_names = tfidf_vectorizer.get_feature_names() print("In Text = ", in_text) print("Text Transformed(", type(text_transformed), " of shape ", text_transformed.shape, ") =\n", text_transformed) print("Vocab = ", tfidf_vectorizer.vocabulary_) print("Feature names = ", feature_names) print("Stop words \n", tfidf_vectorizer.get_stop_words())
def document_tfid_parser(documents):
    # So we want to parse one single document
    # It does not work with all of the documents
    # vectorizer = TfidfVectorizer()
    # X = vectorizer.fit_transform(documents)
    # print(vectorizer.get_feature_names())
    # print(vectorizer.get_params())
    # print(vectorizer.get_stop_words())
    # print(X)
    # print(X.shape)
    sumOfDocuments = []
    for document in documents:
        sumOfDocuments.append(str(document[0]).replace('_', ' '))
        # if len(document[0]) > 4:
        #     vectoriser = TfidfVectorizer()
        #     X = vectoriser.fit_transform(document)
        #     print(vectoriser.get_feature_names())
        #     print(vectoriser.get_params())
        #     print(vectoriser.get_stop_words())
        #     print(X.shape)
        #     print(X)
        # else:
        #     pass
    if len(sumOfDocuments) > 4:
        vectoriser = TfidfVectorizer(max_df=0.7)
        X = vectoriser.fit_transform(sumOfDocuments)
        print(vectoriser.get_feature_names())
        print(vectoriser.get_params())
        print(vectoriser.get_stop_words())
        print(X.shape)
        print(X)
    else:
        pass
def vectorizaCorpus(corpus, minDf):
    ''' Vectorize the given corpus, filtering out words that appear in fewer than minDf documents '''
    try:
        vectorizer = TfidfVectorizer(min_df=minDf, lowercase=True, stop_words='english')
        # Define our own extra list of stop words
        myStopwords = ['did', 'didn', 'does', 'doesn', 'don', 'just', 'isn',
                       'reddit', 'wasn', 'www', 'yeah', 'yes', 'like', 'able', 'thanks',
                       'know', 'think', 've', 'want', 'com', 'https', 'http',
                       'good', 'really', 'make', 'say', 'going', 'said', 'people', 'way',
                       'use']
        # Add the stop words we want to the already existing set
        vectorizer.stop_words = vectorizer.get_stop_words().union(myStopwords)
        # Compute the document-term matrix
        docTerms = vectorizer.fit_transform(corpus)
        # Invert the vocabulary, creating an index-to-term dictionary
        invVoc = {v: k for k, v in vectorizer.vocabulary_.items()}
        # The central terms are those whose accumulated tf-idf sum over all documents is largest
        sumaTfidf = docTerms.sum(axis=0).tolist()[0]  # column-wise sum of the document-term matrix
        return vectorizer, invVoc, sumaTfidf
    except Exception as e:
        print('\nA problem occurred: {0}'.format(e))
        sys.exit()
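# vectorizaCorpus() returns the inverted vocabulary and the per-column tf-idf
# sums but leaves the ranking of the "central" terms to the caller. A minimal
# sketch of that last step, assuming `corpus` is a list of strings:
import numpy as np

vectorizer, invVoc, sumaTfidf = vectorizaCorpus(corpus, minDf=2)

# Rank columns by accumulated tf-idf mass and map indices back to terms.
top_idx = np.argsort(sumaTfidf)[::-1][:20]
for i in top_idx:
    print('%-20s %.3f' % (invVoc[i], sumaTfidf[i]))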
def tf_idf_vectorizer_big(list_of_strings, choose_to_log_data=True, log_vectorised_words=False, logger=None): """ function should return tf-idf logistic regression score :param : list :type : string :return: sparse matrix :rtype: value """ search_and_replace_numerals_with_space = lambda x: re.sub( r'(\d[\d\.])+', '', x.lower()) vect_char = TfidfVectorizer( preprocessor=search_and_replace_numerals_with_space, stop_words='english', analyzer='char', ngram_range=(2, 6), min_df=20) vect_word = TfidfVectorizer( preprocessor=search_and_replace_numerals_with_space, stop_words='english', min_df=20) sparse_matrix_word = vect_word.fit_transform(list_of_strings) sparse_matrix_char = vect_char.fit_transform(list_of_strings) sparse_matrix_combined = sparse.hstack( [sparse_matrix_word, sparse_matrix_char]) if choose_to_log_data: logger.info("\nbig vector shape\n %s", sparse_matrix_combined.shape) if log_vectorised_words: logger.info("\nFeatures of vectorizer_character\n %s", vect_char.get_feature_names()) logger.info("\nRemoved Features of vectorizer_character \n %s", vect_char.get_stop_words()) logger.info("\nHyperparameters of vectorizer_character\n %s", vect_char.fit(list_of_strings)) logger.info("\nFeatures of vectorizer_word\n %s", vect_word.get_feature_names()) logger.info("\nRemoved Features of vectorizer_word \n %s", vect_word.get_stop_words()) logger.info("\nHyperparameters of vectorizer_word\n %s", vect_word.fit(list_of_strings)) return sparse_matrix_combined
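# The docstring above promises a logistic-regression score, but the function
# stops at the combined sparse matrix. A minimal sketch of the missing step;
# `texts` and the matching labels `y` are placeholders assumed to exist:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X = tf_idf_vectorizer_big(texts, choose_to_log_data=False).tocsr()  # hstack returns COO; CSR allows row indexing
clf = LogisticRegression(max_iter=1000)
print(cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean())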
def make_tfidf_matrix(doc):
    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    stops = list(tfidf.get_stop_words()) + [
        'education', 'students', 'school', 'learning', 'learn',
        'experience', 'teach', 'working'
    ]
    tfidf.set_params(stop_words=stops)
    tfidf_matrix = tfidf.fit_transform(doc)
    return tfidf_matrix
def create_stopword_list(extra_words):
    """ Creates stopword list (adds extra words to original English set) """
    from sklearn.feature_extraction.text import TfidfVectorizer
    original = list(TfidfVectorizer(stop_words='english').get_stop_words())
    if extra_words:
        return frozenset(original + extra_words)
    else:
        return frozenset(original)
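# Spinning up a throwaway vectorizer just to read the built-in list works, but
# scikit-learn also exposes the same frozenset directly. A sketch of an
# equivalent helper, assuming only the built-in English list is needed:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def create_stopword_list(extra_words=None):
    """ Return the built-in English stop words, optionally extended """
    if extra_words:
        return frozenset(ENGLISH_STOP_WORDS.union(extra_words))
    return ENGLISH_STOP_WORDS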
def get_stopword_set():
    vectorizer_for_stop = TfidfVectorizer(stop_words='english')
    stop_words = set()
    with open('stopwords_custom.in', 'r', encoding='UTF-8') as f:
        stop_words = stop_words | set(f.read().split())
    with open('SmartStoplist.txt', 'r', encoding='UTF-8') as f:
        stop_words = stop_words | set(f.read().split())
    stop_words = stop_words | set(
        stopwords.words('english')) | vectorizer_for_stop.get_stop_words()
    return stop_words
def pre_processing(self): # pre-processing function AST = None src = open(self.name, 'r') # loop to parse each source code for x in range(1): src = src.read() attributes = [] variables = [] # Source parsing try: AST = javalang.parse.parse(src) # This will return AST for path, node in AST: # Index, Element if 'ReferenceType' != node: AST.remove(node) print(node, "\n") # print(path,"\n") except: pass vectorizer = TfidfVectorizer( stop_words='english') # Create the vectorize/transform vectorizer.fit( [str(AST)] ) # Learns vocab " CompilationUnit, Imports, path, static, true, util, io " print( '---------------------------check 2----------------------------------' ) print(vectorizer.vocabulary_) print("STOPPPPING WORDS", vectorizer.get_stop_words()) vector = vectorizer.transform([str(AST) ]) # transform document to matrix print(vector) print( '---------------------check 3-------------------------------------------------------------' ) a = np.array(vector.toarray()) print(a) print( '---------------------check 4-------------------------------------------------------------' ) df = DataFrame(a) print(df) # print("Features") # print(vectorizer.get_feature_names()) df.to_csv('featuresExtraction.csv', mode='a', header=False, index=False)
class tdidf(object):
    """
    Step 1: use tf-idf to extract the features and labels
    Step 2: classify with naive Bayes
    """
    def __init__(self, datas):
        self.movie_reviews = load_files(datas)  # folder containing the data to analyse
        # doc_terms_train, doc_terms_test, y_train, y_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size=0.3)
        # Vector space model with BOOL-type features; note that test samples must use the transform interface
        self.count_vec = TfidfVectorizer(
            binary=False, decode_error='ignore', stop_words='english'
        )  # CountVectorizer supports term-frequency or BOOL weights (toggled via the binary parameter)
        self.count_vec.get_stop_words()  # use count_vec.get_stop_words() to inspect the stop words built into TfidfVectorizer
        self.a = self.count_vec.fit_transform(self.movie_reviews.data)
        self.count_vec.get_feature_names()  # x follows the tokenization produced by count_vec
        self.x = self.a.toarray()
        self.y = self.movie_reviews.target
        print(self.x)  # tf-idf matrix

    def train(self):
        # Split the data set: 80% for training, 20% for testing
        z = 10
        the_all = 0
        lists = []
        for i in range(0, z):
            x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                                self.y,
                                                                test_size=0.2,
                                                                random_state=i)
            # Call the MultinomialNB classifier
            clf = MultinomialNB().fit(x_train, y_train)  # clf: import the module, then fit it to the training samples
            doc_class_predicted = clf.predict(x_test)
            yy = round(float(np.mean(doc_class_predicted == y_test)), 2)
            lists.append(yy)
        return lists
def main():
    """
    Data pre-processing
    """
    le = LabelEncoder()
    x = [1, 2, 2, 6, 8, 12, 45, 23]
    le.fit(x)
    tsf = le.transform(x)
    print(tsf)
    # Convert non-numeric values to numeric ones
    y = ["名称", "产地", "编号", "属性", "功能", "规格"]
    le.fit(y)
    str_tsf = le.transform(y)
    print(np.unique(y))
    print(str_tsf)
    # Zero-mean standardization: subtract the mean from each feature value and divide by the standard deviation
    print(scale(x))
    # Feature standardization
    sts = StandardScaler()
    # scaler = sts.fit(x)
    # print(scaler.transform(x))
    # Normalization maps values with different ranges onto the same fixed range, usually [0, 1]
    x_t = [[1, -1, 2], [2, 0, 0], [0, 1, -1]]
    print(preprocessing.normalize(x_t, norm='l2'))
    # Text feature extraction and vectorization
    # sklearn.datasets can read pre-classified text straight from a directory.
    # The directory must follow the rule of one folder per label name; the data set
    # used here has two labels, "net" and "pos".
    movie_reviews = load_files('./data/endata')
    print(movie_reviews.data)
    print(movie_reviews.target)
    doc_train, doc_test, y_train, y_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.3)
    print(doc_train)
    # Term-frequency statistics
    # Note: stop_words must be the string 'english' to activate the built-in list;
    # passing ["english"] would treat the literal word "english" as the only stop word.
    count_vec = TfidfVectorizer(binary=False, stop_words='english')
    x_train = count_vec.fit_transform(doc_train)
    print(count_vec.get_feature_names())
    print(x_train.toarray())
    print(movie_reviews.target)
    # Inspect the stop words
    print(count_vec.get_stop_words())
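# stop_words='english' and stop_words=['english'] are both accepted silently
# but behave very differently, which makes the mix-up easy to miss. A quick
# sketch on a throwaway two-document corpus showing what get_stop_words()
# reports in each case:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["english text about movies", "more english text"]
builtin = TfidfVectorizer(stop_words='english').fit(docs)
custom = TfidfVectorizer(stop_words=["english"]).fit(docs)

print(len(builtin.get_stop_words()))  # size of the built-in English frozenset
print(custom.get_stop_words())        # frozenset({'english'}): only that literal token is removed
print(sorted(custom.vocabulary_))     # 'english' is gone, but 'about' and 'more' survive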
def main(fi):
    questions = [" ".join(json.loads(line)['question']) for line in fi]
    model = TfidfVectorizer(stop_words="english")
    model.fit(questions)
    word2idf = {word: model.idf_[idx] for word, idx in model.vocabulary_.items()}
    sys.stderr.write("Saving IDF...")
    with open("../../work/idf.json", 'w') as fo:
        fo.write(json.dumps(word2idf))
    sys.stderr.write("Done.\n")
    sys.stderr.write("Saving StopWords...")
    with open("../../work/stopwords.txt", 'w') as fo:
        fo.write(json.dumps(list(model.get_stop_words())))
    sys.stderr.write("Done.\n")
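# The snippet above only writes idf.json and stopwords.txt; how they are
# consumed is not shown. A hypothetical sketch of the reading side (the
# keyword_weights helper is made up for illustration), using the same paths:
import json

with open("../../work/idf.json") as f:
    word2idf = json.load(f)
with open("../../work/stopwords.txt") as f:
    stop = set(json.load(f))

def keyword_weights(tokens, default_idf=0.0):
    """ Return (token, idf) pairs for non-stop-word tokens, highest IDF first """
    pairs = [(t, word2idf.get(t, default_idf)) for t in tokens if t not in stop]
    return sorted(pairs, key=lambda p: p[1], reverse=True)

print(keyword_weights(["what", "is", "the", "capital", "of", "france"]))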
def make_stop_words():
    '''
    Take in list of user-created stop words and join with Tfidf 'english' stop words.

    INPUT:
        - None
    OUTPUT:
        - New master list of stop words including user and model inputs
    '''
    new_stop_words = ['ha', "\'s", 'tt', 'ireach', "n\'t", 'wo', 'pv', 'tm',
                      'anite', 'rabichev', 'russell', '603', 'hana', 'atmel',
                      'radwin', 'se', 'doxee', 'lantto', 'publ', 'fpc1025',
                      '855', 'il', '0344']
    # create temporary TfidfVectorizer object
    tfidf_temp = TfidfVectorizer(stop_words='english')
    # get Tfidf 'english' stop words from model
    stop_words = tfidf_temp.get_stop_words()
    # combine two lists of stop words
    result = list(stop_words) + new_stop_words
    return result
def test11(self):
    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?'
    ]
    vectorizer = TfidfVectorizer()
    # X = vectorizer.fit_transform(corpus)
    model = vectorizer.fit(corpus)
    X = model.transform(corpus)
    print(vectorizer.get_feature_names())
    print(vectorizer.get_stop_words())
    print(vectorizer.inverse_transform(X))
    print(X.shape)
    print(X)
    print(model.vocabulary_)
    print(model.idf_)
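# The commented-out line above hints that fit() followed by transform() on the
# same corpus is interchangeable with fit_transform(). A small sketch checking
# that numerically:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]
a = TfidfVectorizer().fit_transform(corpus)
b = TfidfVectorizer().fit(corpus).transform(corpus)
print(np.allclose(a.toarray(), b.toarray()))  # True: same vocabulary, IDFs and weights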
def get_stop_words(self):
    tfidf = TfidfVectorizer(stop_words='english')
    stop_words = list(tfidf.get_stop_words())
    stop_words += ['qb_y', 'qb_n', 'lngnod']
    stop_words_stemmed = [stemmer.stem(x) for x in stop_words]
    good_words = [
        'no', 'none', 'not', 'nothing', 'back', 'between', 'found', 'front',
        'find', 'without', 'above', 'almost', 'under', 'among', 'together',
        'serious', 'less', 'each', 'bottom', 'full', 'empty'
    ]
    good_words_stemmed = [stemmer.stem(x) for x in good_words]
    for word in good_words_stemmed:
        stop_words_stemmed.remove(word)
    return stop_words_stemmed
def read_data(dataset): # features if dataset in ["nyt", "dblp", "cora2"]: content = pd.read_csv("data/" + dataset + "/features.txt", sep="\t", header=None, quoting=3) vectorizer = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english", max_df=0.25, min_df=4, norm='l2', use_idf=True) features = vectorizer.fit_transform(content[1].values) vectorizerTF = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english", max_df=0.25, min_df=4, norm=None, use_idf=False) tf = vectorizerTF.fit_transform(content[1].values) tokenizer = RegexpTokenizer(r'\w+') raw = [tokenizer.tokenize(i.lower()) for i in content[1].values] en_stop = vectorizer.get_stop_words() for i in range(len(raw)): raw[i] = [word for word in raw[i] if not word in en_stop] voc = vectorizer.get_feature_names() else: print("Unknown dataset: %s" % dataset) return None # graph graph = nx.read_adjlist("data/" + dataset + "/graph.txt", nodetype=int) A = nx.to_scipy_sparse_matrix(graph, nodelist=range(features.shape[0]), format="csr") # labels groups = np.loadtxt("data/" + dataset + "/group.txt", delimiter="\t", dtype=int) groups = groups[:, 1] return features, groups, A, graph, voc, raw, tf
def math_stop():
    '''Add math specific words to the standard stop list'''
    tfidf = TfidfVectorizer(stop_words='english')
    Stop = set()
    Stop.update([word for word in tfidf.get_stop_words()])
    Stop.update(['theorem', 'denote', 'like', 'thank', 'lemma', 'proof', 'sum',
                 'difference', 'corollary', 'hand', 'product', 'multiple', 'let',
                 'group', 'prime', 'log', 'limit', 'cid', 'result', 'main',
                 'conjecture', 'case', 'suppose', 'function', 'assume', 'follows',
                 'given', 'define', 'note', 'defined', 'class', 'proposition',
                 'set', 'primes', 'numbers', 'form', 'integers', 'curves', 'real',
                 'using', 'following', 'obtain', 'prove', 'definition', 'large',
                 'small', 'action', 'bound', 'sufficiently', 'subject', 'non',
                 'mathematics'])
    return list(Stop)
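# A list like the one returned by math_stop() is passed straight to the
# stop_words parameter. A minimal usage sketch; the two abstracts are made up:
from sklearn.feature_extraction.text import TfidfVectorizer

abstracts = [
    "We prove a theorem on the distribution of primes in short intervals.",
    "A corollary gives a bound on the number of integers represented by the form.",
]
vec = TfidfVectorizer(stop_words=math_stop())
X = vec.fit_transform(abstracts)
print(sorted(vec.get_feature_names()))  # boilerplate such as 'prove', 'theorem', 'corollary', 'bound' is filtered out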
def get_data():
    print('Start loading data.')
    t_start = time()
    remove = ()
    categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    random_state=0, shuffle=True, remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   random_state=0, shuffle=True, remove=remove)
    t_end = time()
    print('Loading time:', t_end - t_start)
    print('Data type:', type(data_train))
    print('Number of documents in the training set:', len(data_train.data))
    print('Number of documents in the test set:', len(data_test.data))
    print('Category names:\n', categories)
    y_train = data_train.target
    y_test = data_test.target
    categories = data_train.target_names
    print('-------- first ten samples --------')
    for i in range(10):
        print('Document {} belongs to category {}'.format(i + 1, categories[y_train[i]]))
        print(data_train.data[i])
    print('\n\n\n')
    vectorizer = TfidfVectorizer(input='content', stop_words='english',
                                 max_df=0.5, sublinear_tf=True)
    x_train = vectorizer.fit_transform(data_train.data)
    x_test = vectorizer.transform(data_test.data)
    print(u'Training samples: %d, features: %d' % x_train.shape)
    print(u'Stop words:')
    pprint(vectorizer.get_stop_words())
    feature_names = np.asarray(vectorizer.get_feature_names())
    return x_train, y_train, x_test, y_test
def tf_idf_vectorizer_small(list_of_strings, choose_to_log_data=True, log_vectorised_words=False, logger=None): """ function should return tf-idf logistic regression score :param : list :type : string :return: sparse matrix :rtype: value """ vect_word = TfidfVectorizer(stop_words='english', min_df=20) sparse_matrix_word = vect_word.fit_transform(list_of_strings) if choose_to_log_data: logger.info("\nsmall vector shape %s", sparse_matrix_word.shape) if log_vectorised_words: logger.info("\nFeatures of vectorizer_word %s", vect_word.get_feature_names()) logger.info("\nRemoved Features of vectorizer_word %s", vect_word.get_stop_words()) logger.info("\nHyperparameters of vectorizer_word %s", vect_word.fit(list_of_strings)) return sparse_matrix_word
### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name=='sara': from_data.append(0) else: from_data.append(1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") ) pickle.dump( from_data, open("your_email_authors.pkl", "w") ) # print word_data[152] ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer tfidfv = TfidfVectorizer(stop_words="english", lowercase=True) v_word_data = tfidfv.fit_transform(word_data) print len(tfidfv.get_feature_names()) print tfidfv.get_feature_names() print tfidfv.get_feature_names()[34597] print tfidfv.get_stop_words()
def make_stop_words(new_words_list):
    tfidf_temp = TfidfVectorizer(stop_words='english')
    stop_words = tfidf_temp.get_stop_words()
    result = list(stop_words) + new_words_list
    return result
all_classes = np.array(['positive', 'negative']) for l in range(len(pos_trainer)): pos_train = pos_train + ['positive'] for l in range(len(neg_trainer)): neg_train = neg_train + ['negative'] y_train = pos_train + neg_train X_train = tfidf.fit_transform(pos_trainer + neg_trainer) #n_tf = tfidf.fit_transform(neg_trainer) clf = MultinomialNB().fit(X_train, y_train) #clf = MultinomialNB().partial_fit(n_tf, neg_train) #clf.partial_fit(n_tf, neg_train) fn = tfidf.get_feature_names() stop = tfidf.get_stop_words() if statement == 'red': comment_list = redditor.redditor() for comment in range(len(comment_list)): statement = comment_list[comment] X_test = tfidf.transform(np.array([statement])) predicted = clf.predict(X_test) prob = clf.predict_proba(X_test) print "Tweet: " + str(l) print statement print predicted, prob X_test = tfidf.transform(np.array([statement])) predicted = clf.predict(X_test)
logging.info("#" * 10) logging.info("Training TfidfVectorizer") logging.info("#" * 10) start = time.time() tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=STOP_WORDS, decode_error='replace', tokenizer=LemmaTokenizer()) queries_index_transformed = tfidf.fit_transform(queries_index) logging.info("%s fit in %s s" % (len(queries_index), time.time() - start)) start = time.time() # logging.info("Selects feature pairs..") # queries_items_fields_vecs = [queries_index_transformed[feats_ids] for feats_ids in queries_items_fields] # print(tfidf.__dict__) tfidf_stopwords = tfidf.get_stop_words() logging.info("Done in %s s" % (time.time() - start)) start = time.time() logging.info("Transforming %s paragraphs.." % len(paragraphs)) para_features = tfidf.transform(paragraphs) logging.info("%s fit_transform in %s s" % (len(paragraphs), time.time() - start)) start = time.time() res_dists = [] proc_list = [] add_label = [] max_memory = 10 * 1024 * 1024 * 1024 # 10GB
print(newsgroups_subset.data[text_number])

# A vectorizer function is implemented in the Scikit-Learn package. You can directly filter out stopwords.
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(newsgroups_subset.data).toarray()
y = newsgroups_subset.target
print(type(X))
print(X.shape)

# The list of stopwords can be displayed by the following attribute:
print(vect.get_stop_words())

# In a next step the dataset is split into 90% training data and 10% test data.
n_samples = X.shape[0]
X_train = X[:int(.9 * n_samples)]
y_train = y[:int(.9 * n_samples)]
X_test = X[int(.9 * n_samples):]
y_test = y[int(.9 * n_samples):]
print(X_train.shape)

# Feature Selection
# Features are now selected according to their Chi2 value.
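# The excerpt stops right before the chi-square step it announces. A minimal
# sketch of that selection, reusing the X_train / y_train split above; k=1000
# is an arbitrary choice:
from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(chi2, k=1000)
X_train_sel = selector.fit_transform(X_train, y_train)
X_test_sel = selector.transform(X_test)

# Map the selected column indices back to the corresponding terms
selected_terms = [vect.get_feature_names()[i] for i in selector.get_support(indices=True)]
print(X_train_sel.shape, selected_terms[:10])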
from_sara.close() from_chris.close() pickle.dump(word_data, open("your_word_data.pkl", "w")) pickle.dump(from_data, open("your_email_authors.pkl", "w")) ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer transformer = TfidfVectorizer(stop_words="english") theFit = transformer.fit_transform(word_data) print('theFit', theFit) afterTransform = theFit.toarray() print(afterTransform) stopWords = transformer.get_stop_words() # print('stopWords', len(stopWords)) featureNames = transformer.get_feature_names() # print(featureNames[34597]) print('featureNames', len(featureNames)) # shove matrix gained from TfidfVectorizer into # k-means clustering # just wanted to experiment from sklearn.cluster import MiniBatchKMeans clf = MiniBatchKMeans(n_clusters=4, n_init=10, max_iter=300) clf.fit(afterTransform) pred = clf.predict(afterTransform) print('PRED', pred)
all_docs.append(read_merge_data(ff)) print all_docs print len(all_docs) tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, stop_words='english') try: results = tfidf_vec.fit_transform(all_docs) print results.get_shape() result_as_array = results.toarray() feature_names = tfidf_vec.get_feature_names() total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10) print total_highest_scores print tfidf_vec.get_stop_words() except ValueError: pass # documents = read_data(directory) # first_five = documents[0:100] # tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, stop_words='english') # try: # results = tfidf_vec.fit_transform(first_five) # print results.get_shape() # result_as_array = results.toarray() # feature_names = tfidf_vec.get_feature_names() # total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10) # print total_highest_scores
print(u'Number of documents in the test set:', len(data_test.data))
print(u'Names of the %d categories used by the training and test sets:' % len(categories))
categories = data_train.target_names
pprint(categories)

y_train = data_train.target
y_test = data_test.target
print(u' -- first ten documents -- ')
for i in np.arange(10):
    print(u'Document %d (category - %s):' % (i + 1, categories[y_train[i]]))
    print(data_train.data[i])
    print('\n\n')
vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)
x_train = vectorizer.fit_transform(data_train.data)  # x_train is sparse: scipy.sparse.csr.csr_matrix
x_test = vectorizer.transform(data_test.data)
print(u'Training samples: %d, features: %d' % x_train.shape)
print(u'Stop words:')
pprint(vectorizer.get_stop_words())
feature_names = np.asarray(vectorizer.get_feature_names())

print(u'\n\n===================\nComparison of classifiers:\n')
clfs = (MultinomialNB(),   # 0.87(0.017), 0.002, 90.39%
        BernoulliNB(),     # 1.592(0.032), 0.010, 88.54%
        )
result = []
for clf in clfs:
    a = test_clf(clf)
    result.append(a)
    print('\n')
result = np.array(result)
time_train, time_test, err, names = result.T
time_train = time_train.astype(float)
time_test = time_test.astype(float)
# str.replace returns a new string, so keep the result
new_str = new_str.replace('germani', '')

### append the text to word_data
word_data.append(new_str)

### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
if from_person == 'Sara':
    from_data.append(0)
elif from_person == 'Chris':
    from_data.append(1)

email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
data = vectorizer.fit_transform(word_data)
vectorizer = vectorizer.fit(word_data)
stop_words = vectorizer.get_stop_words()
print len(vectorizer.get_feature_names())
#print word_data[152]
# 385 with 200 samples
# 38825 with all samples

### in Part 4, do TfIdf vectorization here
class TfIdf: def __init__(self, corpus=None): self.tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, max_df=0.9, stop_words='english') self.tfidf_matrix = None self.corpus = corpus if corpus != None: self.calcTFidf(corpus) #Todo: get highest ranked words of each document, do tfidf on srt #Todo: tfidf on the query itself(to limit search on words) #idf gives the inverse of number of times word appears in other documents, get words with high idf #words with just high tf #phrase queries with tfidf to match highest score.... def testQuery(self, query): response = self.calcTFidfQuery(query) topChoices = self.calcCosineSim(response) self.printTopChoices(topChoices) def calcCosineSim(self, queryTfidfVec, top_n=10): if self.tfidf_matrix is None: print 'Error! tfidfMatrix has not yet been initialized.' else: cosine_similarities = linear_kernel(queryTfidfVec, self.tfidf_matrix).flatten() related_docs_indices = [ i for i in cosine_similarities.argsort()[::-1] ] return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n] def calcTFidfQuery(self, query): response = self.tfidf.transform([query]) feature_names = self.tfidf.get_feature_names() print "Query" #print response #response- sk learn matrix: (doc_num, feature_name): tfidf score #print feature_names for col in response.nonzero()[1]: print feature_names[col], ', ', response[0, col] return response def calcTFidf(self, corpus): self.tfidf_matrix = self.tfidf.fit_transform(corpus) feature_names = self.tfidf.get_feature_names() dense = self.tfidf_matrix.todense() stopWordsSet = self.tfidf.get_stop_words() #the stopWordsSet #TODO: figure out this part, what is this doiong? currDoc = dense[1].tolist()[0] #filter out to get doc (i+1) phrase_scores = [ pair for pair in zip(range(0, len(currDoc)), currDoc) if pair[1] > 0 ] #pair of featurename to feature score sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) print phrase_scores print "Sorted scores" print sorted_phrase_scores for scoreTuple in sorted_phrase_scores: phraseToScore = str(feature_names[scoreTuple[0]]) + ', ' + str( scoreTuple[1]) print phraseToScore #for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]: # print('{0: <20} {1}'.format(phrase, score)) def printTopChoices(self, topChoices): print "TFIDF:" print topChoices for choice in topChoices: print self.corpus[choice[0]]
count -= 1
if count == 0:
    pass

print "Finished reading."

def tokenize(text):
    return text.split(' ')

vectorizer = TfidfVectorizer(tokenizer=tokenize, min_df=5, max_df=0.03, stop_words='english')
m = vectorizer.fit_transform(corpus)
print "Finished tf-idf.", m.shape[0], m.shape[1]
print vectorizer.get_stop_words()
phrases = vectorizer.get_feature_names()

with open(output_file, 'w') as output:
    for i in xrange(m.shape[0]):
        d = m.getrow(i)
        s = zip(d.indices, d.data)
        sorted_s = sorted(s, key=lambda v: v[1], reverse=True)
        indices = [element[0] for element in sorted_s]
        for i in range(min(top_K, len(indices))):
            output.write(phrases[indices[i]])
            output.write(' ')
        output.write('\n')
y_test = data_test.target  # target classes encoded as [0, 1, 2, 3]

print("-- first ten documents --")
for i in np.arange(10):
    print("Document %d (category - %s):" % (i + 1, categories[y_train[i]]))
    print(data_train.data[i])
print("\n\n\n")

vectorizer = TfidfVectorizer(input='content', stop_words='english',
                             max_df=0.5, sublinear_tf=True)  # set the model parameters
x_train = vectorizer.fit_transform(data_train.data)
x_test = vectorizer.transform(data_test.data)
print("Training samples: %d, features: %d" % x_train.shape)
print("Stop words:\n")
pprint(vectorizer.get_stop_words())
feature_names = np.asarray(vectorizer.get_feature_names())

print("\n\n================\nComparison of classifiers:\n")
clfs = (
    MultinomialNB(),
    BernoulliNB()
    # KNeighborsClassifier()
    # RidgeClassifier()
    # RandomForestClassifier(n_estimators=200)
    # SVC()
)
result = []
for clf in clfs:
    a = test_clf(clf)
    result.append(a)
    print('\n')
class TfidfRecommender: """Term Frequency - Inverse Document Frequency (TF-IDF) Recommender This class provides content-based recommendations using TF-IDF vectorization in combination with cosine similarity. """ def __init__(self, id_col, tokenization_method="scibert"): """Initialize model parameters Args: id_col (str): Name of column containing item IDs. tokenization_method (str): ['none','nltk','bert','scibert'] option for tokenization method. """ self.id_col = id_col if tokenization_method.lower() not in ["none", "nltk", "bert", "scibert"]: raise ValueError( 'Tokenization method must be one of ["none" | "nltk" | "bert" | "scibert"]' ) self.tokenization_method = tokenization_method.lower() # Initialize other variables used in this class self.tf = TfidfVectorizer() self.tfidf_matrix = dict() self.tokens = dict() self.stop_words = frozenset() self.recommendations = dict() self.top_k_recommendations = pd.DataFrame() def __clean_text(self, text, for_BERT=False, verbose=False): """ Clean text by removing HTML tags, symbols, and punctuation. Args: text (str): Text to clean. for_BERT (boolean): True or False for if this text is being cleaned for a BERT word tokenization method. verbose (boolean): True or False for whether to print. Returns: clean (str): Cleaned version of text. """ try: # Normalize unicode text_norm = unicodedata.normalize("NFC", text) # Remove HTML tags clean = re.sub("<.*?>", "", text_norm) # Remove new line and tabs clean = clean.replace("\n", " ") clean = clean.replace("\t", " ") clean = clean.replace("\r", " ") clean = clean.replace("Â\xa0", "") # non-breaking space # Remove all punctuation and special characters clean = re.sub("([^\s\w]|_)+", "", clean) # If you want to keep some punctuation, see below commented out example # clean = re.sub('([^\s\w\-\_\(\)]|_)+','', clean) # Skip further processing if the text will be used in BERT tokenization if for_BERT is False: # Lower case clean = clean.lower() except: if verbose is True: print("Cannot clean non-existent text") clean = "" return clean def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"): """ Clean the text within the columns of interest and return a dataframe with cleaned and combined text. Args: df (pd.DataFrame): Dataframe containing the text content to clean. cols_to_clean (list of str): List of columns to clean by name (e.g., ['abstract','full_text']). new_col_name (str): Name of the new column that will contain the cleaned text. Returns: df (pd.DataFrame): Dataframe with cleaned text in the new column. """ # Collapse the table such that all descriptive text is just in a single column df = df.replace(np.nan, "", regex=True) df[new_col_name] = df[cols_to_clean].apply(lambda cols: " ".join(cols), axis=1) # Check if for BERT tokenization if self.tokenization_method in ["bert", "scibert"]: for_BERT = True else: for_BERT = False # Clean the text in the dataframe df[new_col_name] = df[new_col_name].map( lambda x: self.__clean_text(x, for_BERT) ) return df def tokenize_text( self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0 ): """ Tokenize the input text. For more details on the TfidfVectorizer, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html Args: df_clean (pd.DataFrame): Dataframe with cleaned text in the new column. text_col (str): Name of column containing the cleaned text. ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted. 
min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. Returns: tf (TfidfVectorizer): Scikit-learn TfidfVectorizer object defined in .tokenize_text(). vectors_tokenized (pd.Series): Each row contains tokens for respective documents separated by spaces. """ vectors = df_clean[text_col] # If a HuggingFace BERT word tokenization method if self.tokenization_method in ["bert", "scibert"]: # Set vectorizer tf = TfidfVectorizer( analyzer="word", ngram_range=ngram_range, min_df=min_df, stop_words="english", ) # Get appropriate transformer name if self.tokenization_method == "bert": bert_method = "bert-base-cased" elif self.tokenization_method == "scibert": bert_method = "allenai/scibert_scivocab_cased" # Load pre-trained model tokenizer (vocabulary) tokenizer = BertTokenizer.from_pretrained(bert_method) # Loop through each item vectors_tokenized = vectors.copy() for i in range(0, len(vectors)): vectors_tokenized[i] = " ".join(tokenizer.tokenize(vectors[i])) elif self.tokenization_method == "nltk": # NLTK Stemming token_dict = {} stemmer = PorterStemmer() def stem_tokens(tokens, stemmer): stemmed = [] for item in tokens: stemmed.append(stemmer.stem(item)) return stemmed def tokenize(text): tokens = nltk.word_tokenize(text) stems = stem_tokens(tokens, stemmer) return stems # When defining a custome tokenizer with TfidfVectorizer, the tokenization is applied in the fit function tf = TfidfVectorizer( tokenizer=tokenize, analyzer="word", ngram_range=ngram_range, min_df=min_df, stop_words="english", ) vectors_tokenized = vectors elif self.tokenization_method == "none": # No tokenization applied tf = TfidfVectorizer( analyzer="word", ngram_range=ngram_range, min_df=min_df, stop_words="english", ) vectors_tokenized = vectors # Save to class variable self.tf = tf return tf, vectors_tokenized def fit(self, tf, vectors_tokenized): """ Fit TF-IDF vectorizer to the cleaned and tokenized text. Args: tf (TfidfVectorizer): Scikit-learn TfidfVectorizer object defined in .tokenize_text(). vectors_tokenized (pd.Series): Each row contains tokens for respective documents separated by spaces. """ self.tfidf_matrix = tf.fit_transform(vectors_tokenized) def get_tokens(self): """ Return the tokens generated by the TF-IDF vectorizer. Returns: self.tokens (dict): Dictionary of tokens generated by the TF-IDF vectorizer. """ try: self.tokens = self.tf.vocabulary_ except: self.tokens = "Run .tokenize_text() and .fit_tfidf() first" return self.tokens def get_stop_words(self): """ Return the stop words excluded in the TF-IDF vectorizer. Returns: self.stop_words (frozenset): Frozenset of stop words used by the TF-IDF vectorizer (can be converted to list). """ try: self.stop_words = self.tf.get_stop_words() except: self.stop_words = "Run .tokenize_text() and .fit_tfidf() first" return self.stop_words def __create_full_recommendation_dictionary(self, df_clean): """ Create the full recommendation dictionary containing all recommendations for all items. Args: df_clean (pd.DataFrame): Dataframe with cleaned text. 
""" # Similarity measure cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix) results = {} for idx, row in df_clean.iterrows(): similar_indices = cosine_sim[idx].argsort()[: -(len(df_clean) + 1) : -1] similar_items = [ (cosine_sim[idx][i], df_clean[self.id_col][i]) for i in similar_indices ] results[row[self.id_col]] = similar_items[1:] # Save to class self.recommendations = results def __organize_results_as_tabular(self, df_clean, k): """ Restructures results dictionary into a table containing only the top k recommendations per item. Args: df_clean (pd.DataFrame): Dataframe with cleaned text. k (int): Number of recommendations to return. """ # Initialize new dataframe to hold recommendation output item_id = list() rec_rank = list() rec_score = list() rec_item_id = list() # For each item for idx in range(0, len(self.recommendations)): # Information about the item we are basing recommendations off of rec_based_on = list(self.recommendations.keys())[idx] tmp_item_id = str( df_clean.loc[df_clean[self.id_col] == rec_based_on][self.id_col].values[ 0 ] ) # Get all scores and IDs for items recommended for this current item rec_array = self.recommendations[rec_based_on] tmp_rec_score = list(map(lambda x: x[0], rec_array)) tmp_rec_id = list(map(lambda x: x[1], rec_array)) # Append multiple values at a time to list item_id.extend([tmp_item_id] * k) rec_rank.extend(list(range(1, k + 1))) rec_score.extend(tmp_rec_score[:k]) rec_item_id.extend(tmp_rec_id[:k]) # Save the output output_dict = { self.id_col: item_id, "rec_rank": rec_rank, "rec_score": rec_score, "rec_" + self.id_col: rec_item_id, } # Convert to dataframe self.top_k_recommendations = pd.DataFrame(output_dict) def recommend_top_k_items(self, df_clean, k=5): """ Recommend k number of items similar to the item of interest. Args: df_clean (pd.DataFrame): Dataframe with cleaned text. k (int): Number of recommendations to return. Returns: self.top_k_recommendations (pd.DataFrame): Dataframe containing id of top k recommendations for all items. """ if k > len(df_clean) - 1: raise ValueError( "Cannot get more recommendations than there are items. Set k lower." ) self.__create_full_recommendation_dictionary(df_clean) self.__organize_results_as_tabular(df_clean, k) return self.top_k_recommendations def __get_single_item_info(self, metadata, rec_id): """ Get full information for a single recommended item. Args: metadata (pd.DataFrame): Dataframe containing item info. rec_id (str): Identifier for recommended item. Results: rec_info (pd.Series): Single row from dataframe containing recommended item info. """ # Return row rec_info = metadata.iloc[int(np.where(metadata[self.id_col] == rec_id)[0])] return rec_info def __make_clickable(self, address): """ Make URL clickable. Args: address (str): URL address to make clickable. """ return '<a href="{0}">{0}</a>'.format(address) def get_top_k_recommendations( self, metadata, query_id, cols_to_keep=[], verbose=True ): """ Return the top k recommendations with useful metadata for each recommendation. Args: metadata (pd.DataFrame): Dataframe holding metadata for all public domain papers. query_id (str): ID of item of interest. cols_to_keep (list of str): List of columns from the metadata dataframe to include (e.g., ['title','authors','journal','publish_time','url']). By default, all columns are kept. verbose (boolean): Set to True if you want to print the table. 
Results: df (pd.Styler): Stylized dataframe holding recommendations and associated metadata just for the item of interest (can access as normal dataframe by using df.data). """ # Create subset of dataframe with just recommendations for the item of interest df = self.top_k_recommendations.loc[ self.top_k_recommendations[self.id_col] == query_id ].reset_index() # Remove id_col of query item df.drop([self.id_col], axis=1, inplace=True) # Add metadata for each recommended item (rec_<id_col>) metadata_cols = metadata.columns.values df[metadata_cols] = df.apply( lambda row: self.__get_single_item_info( metadata, row["rec_" + self.id_col] ), axis=1, ) # Remove id col added from metadata (already present from self.top_k_recommendations) df.drop([self.id_col], axis=1, inplace=True) # Rename columns such that rec_ is no longer appended, for simplicity df = df.rename(columns={"rec_rank": "rank", "rec_score": "similarity_score"}) # Only keep columns of interest if len(cols_to_keep) > 0: # Insert our recommendation scoring/ranking columns cols_to_keep.insert(0, "similarity_score") cols_to_keep.insert(0, "rank") df = df[cols_to_keep] # Make URLs clickable if they exist if "url" in list(map(lambda x: x.lower(), metadata_cols)): format_ = {"url": self.__make_clickable} df = df.head().style.format(format_) if verbose == True: df return df
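# For orientation, a sketch of how the pieces of TfidfRecommender above fit
# together end to end; the dataframe and its columns ('doc_id', 'title',
# 'abstract') are made up for illustration:
import pandas as pd

df = pd.DataFrame({
    "doc_id": ["d1", "d2", "d3"],
    "title": ["Vaccines and immunity", "Viral genome sequencing", "Immune response to vaccines"],
    "abstract": ["Study of antibody response.", "Sequencing pipelines for viruses.", "T-cell behaviour after vaccination."],
})

rec = TfidfRecommender(id_col="doc_id", tokenization_method="none")
df_clean = rec.clean_dataframe(df, ["title", "abstract"])               # cleaned text lands in 'cleaned_text'
tf, tokens = rec.tokenize_text(df_clean, ngram_range=(1, 2), min_df=1)
rec.fit(tf, tokens)

print(rec.get_stop_words())                      # English stop words used by the underlying TfidfVectorizer
print(rec.recommend_top_k_items(df_clean, k=2))  # top-2 similar items per document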