Code Example #1
def main():
    content = load_stories()
    # TF-IDF Tester
    tfidf_handler = TFIDF(use_idf=True, ngram=(1, 3))
    tfidf_handler.analyse_corpus(document=content[0])
    df = tfidf_handler.extract_keywords(doc_idx=0)
    print(df.sort_values(by='tfidf', ascending=False))
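The TFIDF helper used in this example is project-specific and not shown here. As a rough orientation only, an approximate equivalent of the same keyword extraction built on scikit-learn's TfidfVectorizer could look like the sketch below; the function name and the DataFrame layout are assumptions, not the original API.

# Hedged sketch: approximate stand-in for the TFIDF helper above, built on
# scikit-learn; the DataFrame layout is an assumption.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords(documents, doc_idx=0):
    vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 3))
    weights = vectorizer.fit_transform(documents)  # rows = documents, columns = terms
    return pd.DataFrame({
        "term": vectorizer.get_feature_names_out(),
        "tfidf": weights[doc_idx].toarray().ravel(),
    }).sort_values(by="tfidf", ascending=False)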
Code Example #2
class FeatureSelector:
    def __init__(self):
        self.table = TFIDF()

    def run(self):
        """
        Generate the features using Top N algorithm
        """
        for dir_name in os.listdir("../data/groups/"):
            if dir_name == '.DS_Store':
                continue

            for file_name in os.listdir("../data/groups/%s" % dir_name):
                if file_name == '.DS_Store':
                    continue

                document_name = "%s/%s" % (dir_name, file_name)
                with open("../data/groups/%s" % document_name, 'r') as f:
                    self.table.add_document(document_name, f.read().lower())

        new_data_set = self.table.top_n_words(3)
        for document_name, words in new_data_set.items():

            directory_name, file_name = document_name.split('/')
            path_name = "../data/features/%s" % directory_name

            if not os.path.exists(path_name):
                os.makedirs(path_name)

            with open("%s/%s" % (path_name, file_name), 'w') as f:
                for word in words:
                    f.write(word)
                    f.write("\n")
Code Example #3
class FeatureSelector:
    def __init__(self):
        self.table = TFIDF()

    def run(self, index_file):
        """
        Generate the features using Top N algorithm
        """
        with open(index_file) as f:
            lines = f.readlines()
            for line in lines:
                name = line.rstrip("\n")
                with open("../data/scoped/%s" % name, 'r') as d:
                    document = Document(d.read())
                    self.table.add_document(name, document.content_lower)

        new_data_set = self.table.top_n_words(10)
        for document_name, words in new_data_set.items():

            with open("../data/scoped/%s" % document_name, 'r') as d:
                document = Document(d.read())

            path_name = "../data/features/%s" % document_name

            with open(path_name, 'w') as f:
                for word in words:
                    for _ in range(document.count(word)):
                        f.write(word)
                        f.write("\n")
Code Example #4
File: commands.py Project: itmap/data_analysis
def analysis_data(segment, idf, tfidf, collections):
    """
    输入mongo中的collection名

    对输入的数据集做分词、计算TF、计算IDF、计算TF*IDF操作
    """
    all_count = sum(counts[c] for c in collections)
    logger.info('calculate {}.'.format(collections))
    tfidf_obj = TFIDF(
        collection_names=collections,
        all_count=all_count,
    )
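    # Run only the stages selected by the boolean flags.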
    if segment:
        tfidf_obj.generate_segment_requests_using_pool()
        # tfidf_obj.generate_segment_requests()
    if idf:
        tfidf_obj.generate_idf_requests()
    if tfidf:
        tfidf_obj.generate_tf_idf_requests()
Code Example #5
File: lsa.py Project: zachwooddoughty/summarize
def __init__(self):
    self.TF = TFIDF()
    self.articles_dir = "articles/"
    self.summaries_dir = "summaries/"
    self.keywords_dir = "keywords/"
Code Example #6
File: lsa.py Project: zachwooddoughty/summarize
class LSA:
    def __init__(self):
        self.TF = TFIDF()
        self.articles_dir = "articles/"
        self.summaries_dir = "summaries/"
        self.keywords_dir = "keywords/"

    def keywords(self, filename, num_topics=5, keywords_per_topic=3):
        text = ""
        with open(filename) as f:
            for line in f:
                text += line

        words = tokenize(text, "word", return_spans=False)
        sentences = tokenize(text, "sentence", return_spans=False)

        wc = {}
        clean_sentences = []
        for sent in sentences:
            clean_sent = {}
            for word in tokenize(sent, "word", return_spans=False):
                word = self.TF.clean(word)
                clean_sent[word] = 1
                wc[word] = wc.get(word, 0) + 1
            clean_sentences.append(clean_sent)

        # Build a word-by-sentence weight matrix (rows = words, columns = sentences).
        vocab = list(wc)  # freeze the word order so matrix rows and vocab indices line up
        matrix = []
        for word in vocab:
            row = []
            for sent in clean_sentences:
                if word in sent:
                    row.append(self.TF.weight(word, wc[word]))
                else:
                    row.append(0)
            matrix.append(row)

        matrix = numpy.matrix(matrix)
        U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)

        D = s * Vh

        keywords = []

        for topic in range(num_topics):
            try:
                words = sorted(enumerate([u for u in U[:, topic]]),
                               key=lambda x: x[1])
            except IndexError:
                print "Problem indexing numpy array for", filename, "on topic", topic
                continue
            added = 0
            word_index = 0
            while added < keywords_per_topic and word_index < len(words):
                # print("Looking at", words[word_index], vocab[words[word_index][0]])
                if vocab[words[word_index][0]] not in keywords:
                    keywords.append(vocab[words[word_index][0]])
                    added += 1
                word_index += 1

        return ", ".join(keywords)

    def summarize(self, filename):
        text = ""
        with open(filename) as f:
            for line in f:
                text += line

        words = tokenize(text, "word", return_spans=False)
        sentences = tokenize(text, "sentence", return_spans=False)

        wc = {}
        clean_sentences = []
        for sent in sentences:
            clean_sent = {}
            for word in tokenize(sent, "word", return_spans=False):
                word = self.TF.clean(word)
                clean_sent[word] = 1
                wc[word] = wc.get(word, 0) + 1
            clean_sentences.append(clean_sent)

        matrix = []
        for word in wc.keys():
            #print "adding", word
            row = []
            for sent in clean_sentences:
                if word in sent:
                    row.append(self.TF.weight(word, wc[word]))
                else:
                    row.append(0)
            matrix.append(row)

        matrix = numpy.matrix(matrix)
        #print "matrix", matrix
        U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)

        #    print "U", U
        #    print "s", s
        #    print "Vh", Vh
        #
        D = s * Vh
        #print "D", D

        num_sentences = 5
        summary_sentence_indices = []

        #for topic in range(3):
        #    print "Topic", topic
        #    sent_weights = D[topic,:]
        #    #top_words = sorted(enumerate([u for u in U[:,topic]]), key = lambda x: x[1], reverse=True)[:5]
        #    bottom_words = sorted(enumerate([u for u in U[:,topic]]), key = lambda x: x[1])[:5]
        #    #print "TOP:", ", ".join(wc.keys()[x[0]] for x in top_words)
        #    print "BOTTOM WORDS:", ", ".join(wc.keys()[x[0]] for x in bottom_words)
        #    top_sents = sorted(enumerate([s for s in sent_weights]), key = lambda x: x[1]) [:3]
        #    print "TOP SENTS:", "\n".join([sentences[s[0]] for s in top_sents])

        topic = 0
        while len(summary_sentence_indices) < num_sentences:

            sent_weights = D[topic, :]
            top_sents = sorted(enumerate([s for s in sent_weights]),
                               key=lambda x: x[1])
            for sent in top_sents:
                if sent[0] > 0 and sent[0] not in summary_sentence_indices:
                    summary_sentence_indices.append(sent[0])
                    break

            topic += 1

        summary = ""
        summary_sentence_indices.sort()
        for i in summary_sentence_indices:
            summary += sentences[i] + "\n"
        return summary
Code Example #7
def calculate_tfidf(x_df: pd.DataFrame) -> pd.DataFrame:
    # Calculate the 'tf-idf' matrix of the 'text' column
    # Return the 'tf-idf' Dataframe
    tfidf_obj = TFIDF(x_df["text"])
    return tfidf_obj()
Code Example #8
def __init__(self):
    self.table = TFIDF()
Code Example #9
def calculate_tfidf(x_df):
    # Calculate the 'tf-idf' matrix of the 'text' column
    # Return the 'tf-idf' Dataframe
    tfidf_obj = TFIDF(x_df['text'])
    return tfidf_obj()
Code Example #10
def calculate_tfidf(df_text):
    # Calculate the 'tf-idf' matrix of the 'text' column
    # Return the 'tf-idf' Dataframe
    tfidf_obj = TFIDF(df_text["text"])
    return tfidf_obj()
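Examples #7, #9, and #10 assume a TFIDF object that is constructed from the 'text' column and returns the tf-idf DataFrame when called. A possible sketch of such a callable wrapper, built on scikit-learn and inferred only from the calls above, is:

# Hedged sketch of the callable TFIDF wrapper assumed by calculate_tfidf.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class TFIDF:
    def __init__(self, texts):
        self.texts = list(texts)

    def __call__(self) -> pd.DataFrame:
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(self.texts)
        return pd.DataFrame(matrix.toarray(),
                            columns=vectorizer.get_feature_names_out())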
Code Example #11
from tf_idf import TFIDF, TFIDF_stop_word

print(
    "++++++++++++++++++++++++++++++++TEST Part I ++++++++++++++++++++++++++++++++"
)

# Testing
# class properties
corpus = TFIDF(lowercase=True, v_max=15, n_grams=2)
# corpus.fit(['a', 'b', 'c', 'd'])
# assert corpus.v_max == 4
# print(corpus.vocabulary)

# methods
s = [
    "This is a test: What do you want from me?", "nothing to clean here",
    ":?:mickey mouse!@.,#"
]
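# Judging by the assertion below, remove_bad_chars strips ':', '?', '.' and ','
# while leaving '!', '@' and '#' in place.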
assert corpus.remove_bad_chars(s) == [
    "This is a test What do you want from me", "nothing to clean here",
    "mickey mouse!@#"
]

print("TEST 1:")
test_corpus = [
    'The hotel and the stay were great', 'This was a great stay',
    'Great stay in a great destination', 'Great destination'
]
corpus.fit(test_corpus)
print(f"vocabulary: {corpus.vocabulary}")