Exemple #1
0
    def concorde(self, args: List[str]):
        """Print a concordance table for the phrase made from ``args``.

        The items of ``args`` are joined with single spaces and the resulting
        phrase is handed to ``concorde`` on every corpus in ``self.corpuses``,
        together with the configured context length. Each hit is expected to
        unpack as ``(document, (left, mid, right))``.

        Prints a table numbered from 1 with the truncated document title and
        the three text parts, or "Nothing was found" when no corpus reports a
        hit. With empty ``args`` the wrong-arguments handler is called for
        "concorde" and nothing else happens.
        """
        if not args:
            self.__wrong_arguments("concorde")
            return

        phrase = ' '.join(args)

        # Gather the hits of all corpuses into one flat list.
        hits: List[Tuple[Document, Tuple[str, str, str]]] = []
        for corpus in self.corpuses.values():
            hits.extend(
                corpus.concorde(phrase, self.__concorde_context_length))

        if hits:
            rows = [
                [Helpers.truncate_str(document.get_title()),
                 before, match, after]
                for document, (before, match, after) in hits
            ]
            headers = [
                'Document title', 'Left context', "Text", "Right context"
            ]
            table = DataFrame(rows, columns=headers)

            # Shift the default 0-based index so rows are numbered from 1.
            table.index += 1
            print(table)
        else:
            print("Nothing was found")
Exemple #2
0
    def preview(self, limit: int = 10) -> str:
        """Return a printable preview of this corpus.

        ``limit`` is forwarded to the document repository's ``preview``. When
        it yields documents, their dates and truncated titles are rendered as
        a table numbered from 1; otherwise the body reads "Nothing was found".

        Returns:
            A string made of a ``Corpus "<name>":`` header line, the body,
            and a closing line of 60 dots.
        """
        documents = self.__document_repository.preview(limit)

        if documents:
            rows = []
            for document in documents:
                rows.append([
                    document.get_date(),
                    Helpers.truncate_str(document.get_title()),
                ])
            table = DataFrame(rows, columns=['Date', 'Title'])

            # Shift the default 0-based index so rows are numbered from 1.
            table.index += 1
            body = str(table)
        else:
            body = "Nothing was found"

        footer = "." * 60
        return f'Corpus "{self.name}":\n{body}\n{footer}'
Exemple #3
0
    def search(self, args: List[str]):
        """Print the documents that match the phrase made from ``args``.

        The items of ``args`` are joined with single spaces and the resulting
        phrase is passed to ``search`` on every corpus in ``self.corpuses``.
        Duplicate documents are listed once, in the order they were first
        returned.

        Prints a table numbered from 1 with each document's date and
        truncated title, or "Nothing was found" when no corpus reports a
        match. With empty ``args`` the wrong-arguments handler is called for
        "search" and nothing else happens.
        """
        if not args:
            self.__wrong_arguments("search")
            return

        text = ' '.join(args)

        found = []
        for corpus in self.corpuses.values():
            found.extend(corpus.search(text))

        if not found:
            print("Nothing was found")
            return

        # dict.fromkeys removes duplicates by hash/equality exactly like
        # set() would, but keeps first-seen order, so the printed listing no
        # longer depends on how the documents happen to hash.
        unique_documents = dict.fromkeys(found)

        df = DataFrame(
            [[doc.get_date(), Helpers.truncate_str(doc.get_title())]
             for doc in unique_documents],
            columns=['Date', 'Document title'])

        # Shift the default 0-based index so rows are numbered from 1.
        df.index += 1

        print(df)
Exemple #4
0
    def get_word_statistics(self):
        """Build a word-frequency table over all documents.

        Every distinct word returned by the documents' ``get_words`` becomes
        one row holding:

        * ``'Word'`` - the word itself;
        * one ``'Doc <title>'`` column per document with the value of
          ``get_word_frequency(word)`` for that document (the title goes
          through ``Helpers.truncate_str`` with 16 as second argument);
        * ``'Word frequency'`` - the sum of the per-document values;
        * ``'Documents with word'`` - how many documents have a value > 0.

        Returns:
            A DataFrame sorted in descending order by 'Word frequency' and
            then 'Documents with word', with a fresh 0-based index.
        """
        # Materialise once: the documents are traversed again for every
        # word, which a one-shot iterable would not survive.
        all_documents = list(self.get_documents())

        # Distinct words in first-seen order. A set would deduplicate too,
        # but its iteration order for strings varies between interpreter
        # runs, which made the row order fed to the sort unpredictable.
        vocabulary = dict.fromkeys(
            word for doc in all_documents for word in doc.get_words())

        freq = []
        for word in vocabulary:
            per_document = [
                doc.get_word_frequency(word) for doc in all_documents
            ]
            total_frequency = sum(per_document)
            documents_with_word = sum(1 for count in per_document if count > 0)
            freq.append(
                [word, *per_document, total_frequency, documents_with_word])

        statistic_headers = ['Word frequency', 'Documents with word']
        document_headers = [
            "Doc " + Helpers.truncate_str(doc.get_title(), 16)
            for doc in all_documents
        ]
        table_columns = ['Word'] + document_headers + statistic_headers

        df = DataFrame(freq, columns=table_columns)
        df = df.sort_values(statistic_headers,
                            ascending=[False, False]).reset_index(drop=True)
        return df