Example #1
import re
import string
from typing import List, Set

from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel
from gensim.similarities import MatrixSimilarity
from nltk.stem.porter import PorterStemmer


class IR:
    def __init__(self,
                 documents: List,
                 stop_words: Set,
                 stemming: bool = True):
        self.documents = documents
        self.stop_words = stop_words
        self.stemming = stemming
        self.dictionary = None
        self.corpus = None
        self.tfidf_model = None
        self.tfidf_corpus = None
        self.tfidf_similarity = None
        self.lsi_model = None
        self.lsi_corpus = None
        self.lsi_similarity = None
        self.build_dictionary()
        self.build_models()

    """ 1. Data loading and preprocessing """

    def process_documents(self, documents: List) -> List:
        # Tokenize documents
        result = self.tokenize(documents)
        # Lowercase all words
        result = list(map(lambda x: self.lowercase(x), result))
        # Remove stop words
        result = self.filter_stopwords(result)
        # Remove punctuation
        result = self.remove_text_punctuation(result)
        # Stem words
        if self.stemming:
            result = self.port_stem(result)
        # Remove empty words from all documents
        return self.filter_empty_words(result)

    """ 2. Dictionary building """

    def build_dictionary(self):
        documents = self.process_documents(self.documents)
        self.dictionary = Dictionary(documents)
        self.corpus = list(
            map(lambda doc: self.dictionary.doc2bow(doc), documents))

    """ 3. Retrieval Models """

    def build_models(self):
        # Create tfidf model
        self.tfidf_model = TfidfModel(self.corpus)

        # Map each bag-of-words vector to (term id, tf-idf weight) pairs
        self.tfidf_corpus = list(
            map(lambda c: self.tfidf_model[c], self.corpus))

        # Dense similarity index over the tf-idf corpus
        self.tfidf_similarity = MatrixSimilarity(self.tfidf_corpus)

        # Reduce the tf-idf space to 100 latent topics
        self.lsi_model = LsiModel(self.tfidf_corpus,
                                  id2word=self.dictionary,
                                  num_topics=100)

        # Project the tf-idf corpus into the LSI topic space
        self.lsi_corpus = list(
            map(lambda c: self.lsi_model[c], self.tfidf_corpus))

        # Dense similarity index over the LSI topic space
        self.lsi_similarity = MatrixSimilarity(self.lsi_corpus)

    def filter_stopwords(self, paragraphs: List) -> List:
        return list(
            map(lambda p: list(filter(lambda x: x not in self.stop_words, p)),
                paragraphs))

    """ 4. Querying  """

    def process_query(self, query: str) -> List:
        tokenized = self.tokenize([query])
        lowered = list(map(lambda x: self.lowercase(x), tokenized))
        stop_word_filtered = self.filter_stopwords(lowered)
        punctuation_filtered = self.remove_text_punctuation(stop_word_filtered)
        if self.stemming:
            return self.port_stem(punctuation_filtered)
        return punctuation_filtered

    def tfidf_query(self, query: str, number_of_results: int = 3) -> None:
        # Process query
        processed_query = self.process_query(query)
        query_corpus = self.dictionary.doc2bow(processed_query[0])
        query_tfidf = self.tfidf_model[query_corpus]
        similarity = enumerate(self.tfidf_similarity[query_tfidf])
        # Query most relevant paragraphs using TFIDF model
        query_result = sorted(similarity,
                              key=lambda kv: -kv[1])[:number_of_results]
        # Print search result
        for result in query_result:
            number, _ = result
            print("Paragraph:", number)
            print(self.documents[number], "\n")

    def lsi_query(self, query: str, number_of_results: int = 3) -> None:
        # Process query
        processed_query = self.process_query(query)
        query_corpus = self.dictionary.doc2bow(processed_query[0])
        query_tfidf = self.tfidf_model[query_corpus]
        query_lsi = self.lsi_model[query_tfidf]

        # Fetch most relevant topics
        relevant_topics = sorted(
            query_lsi, key=lambda kv: -abs(kv[1]))[:number_of_results]

        # Print
        for result in relevant_topics:
            number, _ = result
            print("Topic:", number)
            print(self.lsi_model.show_topic(number))
        print()

        # Find most relevant paragraphs using LSI similarity
        doc2similarity = enumerate(self.lsi_similarity[query_lsi])
        query_result = sorted(doc2similarity,
                              key=lambda kv: -kv[1])[:number_of_results]
        # Print query result
        for result in query_result:
            number, _ = result
            print("Paragraph:", number)
            print(self.documents[number], "\n")

    """ All methods below is helpers to preproccess both documents and queries. """

    @staticmethod
    def filter_empty_words(paragraphs: List) -> List:
        return list(
            map(lambda p: list(filter(lambda w: w != "", p)), paragraphs))

    @staticmethod
    def tokenize(documents: List) -> List:
        return list(map(lambda x: x.split(), documents))

    @staticmethod
    def lowercase(words: List) -> List:
        return list(map(lambda s: s.lower(), words))

    @staticmethod
    def port_stem(documents: List) -> List:
        stemmer = PorterStemmer()
        return list(
            map(lambda p: list(map(lambda w: stemmer.stem(w), p)), documents))

    @staticmethod
    def remove_text_punctuation(documents: List) -> List:
        # Escape the punctuation characters so each is matched literally
        pattern = "[" + re.escape(string.punctuation) + "\n\r\t]"
        return list(
            map(lambda p: list(map(lambda w: re.sub(pattern, "", w), p)),
                documents))
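

# A minimal usage sketch (not part of the original source): the sample
# paragraphs are made-up placeholders and NLTK's English stop-word list is
# an assumed choice; it needs nltk.download("stopwords") run once.
from nltk.corpus import stopwords

sample_documents = [
    "Information retrieval ranks documents against a user query.",
    "Latent semantic indexing projects documents onto a topic space.",
]
ir = IR(documents=sample_documents,
        stop_words=set(stopwords.words("english")))
ir.tfidf_query("ranking documents", number_of_results=2)
ir.lsi_query("topic space", number_of_results=2)
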
lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))
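
# The saved models can be reloaded in a later session; a sketch assuming the
# same DATA_PATH (load() is gensim's standard counterpart to save()):
# lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi100'))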


# In[16]:

lsi2.show_topics()  # list of (topic id, formatted topic string) pairs


# In[23]:

# for topic in lsi.show_topics():
#     print(topic)

lsi.show_topic(0, 100)  # top 100 (term, weight) pairs for topic 0


# ## Hold onto your hat
# This will take a lot of RAM!  
# (and CPU)  

# In[31]:

tweetids = pd.Series(range(len(bows)), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# `dict()` maps topic id -> weight, so columns stay aligned even if the LSI
# model skips or reorders topics for some tweets
df = pd.DataFrame([pd.Series(dict(lsi[bows[i]]), name='tweet') for i in tweetids],
                  columns=topicids,
                  index=tweetids)
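

# In[ ]:

# A lighter-memory alternative (a sketch, not from the original notebook):
# gensim's corpus2dense fills a single NumPy array instead of building one
# pandas Series per tweet; topics the model skips simply stay zero.
from gensim.matutils import corpus2dense

dense = corpus2dense((lsi[bow] for bow in bows),
                     num_terms=lsi.num_topics,
                     num_docs=len(bows)).T  # rows: tweets, columns: topics
df_dense = pd.DataFrame(dense, index=tweetids, columns=topicids)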