def concorde(self, args: List[str]):
    """Print every occurrence of a phrase together with its context.

    The words in ``args`` are joined with single spaces into one query.
    The query and the configured context length are handed to the
    ``concorde`` method of each corpus in ``self.corpuses``, and all hits
    are collected into one table with the columns "Document title",
    "Left context", "Text" and "Right context". Rows are numbered from 1.

    Prints "Nothing was found" when no corpus reports a hit.

    Args:
        args: Words of the query. When empty, the wrong-arguments handler
            is called with "concorde" and no lookup is performed.
    """
    if not args:
        self.__wrong_arguments("concorde")
        return

    query = ' '.join(args)

    # Each hit is (document, (left context, matched text, right context)).
    hits: List[Tuple[Document, Tuple[str, str, str]]] = []
    for corpus in self.corpuses.values():
        hits.extend(corpus.concorde(query, self.__concorde_context_length))

    if not hits:
        print("Nothing was found")
        return

    rows = [
        [Helpers.truncate_str(document.get_title()), before, matched, after]
        for document, (before, matched, after) in hits
    ]
    table = DataFrame(
        rows,
        columns=['Document title', 'Left context', "Text", "Right context"])
    table.index += 1  # show 1-based row numbers instead of 0-based
    print(table)
def preview(self, limit: int = 10) -> str:
    """Return a printable preview of the corpus.

    The document repository's ``preview`` is called with ``limit``. The
    documents it returns are rendered as a table with the columns "Date"
    and "Title" (titles truncated), numbered from 1. When nothing is
    returned, the table is replaced by the text "Nothing was found".

    Args:
        limit: Value forwarded to the repository's ``preview`` call.

    Returns:
        A header line naming the corpus, the table (or the fallback text),
        and a closing line of 60 dots, separated by newlines.
    """
    documents = self.__document_repository.preview(limit)

    if documents:
        rows = [
            [document.get_date(), Helpers.truncate_str(document.get_title())]
            for document in documents
        ]
        frame = DataFrame(rows, columns=['Date', 'Title'])
        frame.index += 1  # show 1-based row numbers instead of 0-based
        body = str(frame)
    else:
        body = "Nothing was found"

    separator = "." * 60
    return f'Corpus "{self.name}":\n{body}\n{separator}'
def search(self, args: List[str]):
    """Print the documents that match a query in any corpus.

    The words in ``args`` are joined with single spaces into one query,
    which is passed to the ``search`` method of each corpus in
    ``self.corpuses``. Duplicate documents are listed once. The result is
    printed as a table with the columns "Date" and "Document title"
    (titles truncated), numbered from 1.

    Prints "Nothing was found" when no corpus reports a match.

    Args:
        args: Words of the query. When empty, the wrong-arguments handler
            is called with "search" and no lookup is performed.
    """
    if not args:
        self.__wrong_arguments("search")
        return

    query = ' '.join(args)

    found = []
    for corpus in self.corpuses.values():
        found.extend(corpus.search(query))

    if not found:
        print("Nothing was found")
        return

    # dict.fromkeys removes duplicates by hash/equality exactly like set(),
    # but keeps first-seen order, so the printed rows and their numbers are
    # stable instead of following hash-dependent set iteration order.
    unique_documents = list(dict.fromkeys(found))

    rows = [
        [document.get_date(), Helpers.truncate_str(document.get_title())]
        for document in unique_documents
    ]
    table = DataFrame(rows, columns=['Date', 'Document title'])
    table.index += 1  # show 1-based row numbers instead of 0-based
    print(table)
def get_word_statistics(self):
    """Build a word-frequency table over all documents of the corpus.

    For every distinct word returned by ``get_words`` of any document, the
    table holds one row with:

    * ``Word`` - the word itself;
    * one ``Doc <title>`` column per document (title truncated to 16) with
      that document's ``get_word_frequency`` for the word;
    * ``Word frequency`` - the sum of the positive per-document frequencies;
    * ``Documents with word`` - the number of documents whose frequency for
      the word is greater than zero.

    Returns:
        A DataFrame sorted by ``Word frequency`` and then
        ``Documents with word``, both descending, with a fresh 0-based
        index. Rows that tie on both statistics are ordered by ``Word``
        ascending so the result is the same on every run.
    """
    documents = self.get_documents()
    vocabulary = {word for document in documents for word in document.get_words()}

    rows = []
    for word in vocabulary:
        per_document = [document.get_word_frequency(word) for document in documents]
        # Only documents that actually contain the word contribute.
        present = [count for count in per_document if count > 0]
        rows.append([word] + per_document + [sum(present), len(present)])

    statistic_headers = ['Word frequency', 'Documents with word']
    document_headers = [
        "Doc " + Helpers.truncate_str(document.get_title(), 16)
        for document in documents
    ]
    table = DataFrame(rows, columns=['Word'] + document_headers + statistic_headers)

    # The vocabulary is a set, so row order before sorting depends on string
    # hashing. Using 'Word' as the final key keeps ties in a stable order.
    table = table.sort_values(
        statistic_headers + ['Word'],
        ascending=[False, False, True]).reset_index(drop=True)
    return table