def fuzzy_find(self, keywords, threshold=50, max_return=10, exclude=None,
               by_index=True):
    """Find the records that best match a string of keywords.

    For every record a search string is assembled from its authors,
    title and keywords (authors/keywords only when the dataset has
    them). These strings are scored against `keywords` with
    `get_fuzzy_scores` and the records are returned best match first.

    Arguments
    ---------
    keywords: str
        A string of keywords together, can be a combination.
    threshold: float
        Stop collecting once the score drops below this value. The
        threshold is only applied after at least one record has been
        collected, so the best non-excluded record is returned even when
        its score is below the threshold.
    max_return: int
        Maximum number of records to return.
    exclude: list, np.ndarray
        Indices (or record ids when `by_index` is False) that should be
        left out of the result, e.g. papers that were already labeled.
    by_index: bool
        If True, use internal (positional) indexing for both `exclude`
        and the return value. If False, use the values of
        `self.df.index` instead.

    Returns
    -------
    list:
        Indices (or record ids) of the best matches, best first.
    """
    n_records = len(self)
    all_titles = self.title
    all_authors = self.authors
    all_keywords = self.keywords

    # One searchable string per record: "<authors> <title> <keywords>".
    match_str = np.full(n_records, "x", dtype=object)
    for i in range(n_records):
        match_list = []
        if all_authors is not None:
            match_list.append(format_to_str(all_authors[i]))
        match_list.append(all_titles[i])
        if all_keywords is not None:
            match_list.append(format_to_str(all_keywords[i]))
        match_str[i] = " ".join(match_list)

    new_ranking = get_fuzzy_scores(keywords, match_str)
    # Negate so that argsort yields the highest score first.
    sorted_idx = np.argsort(-new_ranking)

    # Build the exclusion set once instead of scanning a list/array for
    # every candidate record.
    if exclude is None:
        excluded = set()
    elif isinstance(exclude, np.ndarray):
        excluded = set(exclude.ravel().tolist())
    else:
        excluded = set(exclude)

    # Only touch the data frame when record ids are actually requested.
    record_ids = None if by_index else self.df.index.values

    best_idx = []
    for idx in sorted_idx:
        key = idx if by_index else record_ids[idx]
        if key in excluded:
            continue
        if len(best_idx) >= max_return:
            break
        if len(best_idx) > 0 and new_ranking[idx] < threshold:
            break
        best_idx.append(idx)

    # The `np.int` alias was removed from NumPy; the builtin `int` is the
    # documented replacement and also keeps an empty result usable as an
    # index array.
    fuzz_idx = np.array(best_idx, dtype=int)
    if not by_index:
        fuzz_idx = record_ids[fuzz_idx]
    return fuzz_idx.tolist()
def match_string(self):
    """Build one searchable string per record.

    Each entry is the space-separated concatenation of the record's
    formatted authors, its title and its formatted keywords, in that
    order. Authors and keywords are skipped when the corresponding
    attribute of the dataset is None.

    Returns
    -------
    np.ndarray:
        Object array of length `len(self)` holding the combined strings.
    """
    titles = self.title
    authors = self.authors
    keywords = self.keywords

    def _combine(pos):
        # Assemble the pieces for a single record, authors first.
        pieces = []
        if authors is not None:
            pieces.append(format_to_str(authors[pos]))
        pieces.append(titles[pos])
        if keywords is not None:
            pieces.append(format_to_str(keywords[pos]))
        return " ".join(pieces)

    # "x" is only a placeholder; every slot is overwritten below.
    combined = np.full(len(self), "x", dtype=object)
    for pos in range(len(self)):
        combined[pos] = _combine(pos)
    return combined
def format(self, use_cli_colors=True):
    """Render this record as a multi-line block for the CLI.

    Arguments
    ---------
    use_cli_colors: bool
        When True the title is wrapped in ANSI escape codes
        ("\\033[95m" ... "\\033[0m") so supporting terminals show it
        in color.

    Returns
    -------
    str:
        The title, authors and abstract (each only when present and,
        for authors/abstract, non-empty) framed by two dashed rules.
    """
    # Start from empty sections and fill in whatever the record has.
    title_part = ""
    authors_part = ""
    abstract_part = ""

    if self.title is not None:
        title_part = self.title
        if use_cli_colors:
            title_part = "\033[95m" + title_part + "\033[0m"
        title_part += "\n"

    if self.authors is not None and len(self.authors) > 0:
        authors_part = format_to_str(self.authors) + "\n"

    if self.abstract is not None and len(self.abstract) > 0:
        # A leading newline separates the abstract from the header lines.
        abstract_part = "\n" + self.abstract + "\n"

    return "".join((
        "\n\n----------------------------------",
        "\n",
        title_part,
        authors_part,
        abstract_part,
        "----------------------------------\n\n",
    ))
def preview_record(record, w_title=80, w_authors=40, automatic_width=False):
    """Return a single line preview string for a record.

    The line consists of a title column and an authors column, each
    left-aligned and padded with spaces to its width, separated by one
    space. Text longer than its column is cut and ends in "..". When the
    record has no title, its abstract is shown in the title column.

    Arguments
    ---------
    record: PaperRecord
        The paperRecord to preview; its `title`, `abstract` and
        `authors` attributes are read.
    w_title: int
        Width to be allocated for the title of the paper.
    w_authors: int
        Width to be allocated for the authors of the paper.
    automatic_width: bool
        If true, compute w_title, w_authors from the console width
        (two thirds for the title, the remainder for the authors). The
        width is taken from `shutil.get_terminal_size`, which honours the
        COLUMNS environment variable and falls back to 80 columns when
        no terminal is attached.

    Returns
    -------
    str:
        A string that previews a paper record.
    """
    def _fit(text, width):
        # Shorten text to the column width, marking the cut with "..".
        if len(text) <= width:
            return text
        # Guard against a negative slice end for widths below 2, which
        # would keep almost the whole string instead of shortening it.
        return text[:max(width - 2, 0)] + ".."

    if automatic_width:
        # os.get_terminal_size() raises OSError when stdout is not a
        # terminal (pipes, CI, captured output); the shutil variant falls
        # back to a default size instead.
        import shutil

        term_width = shutil.get_terminal_size().columns
        # 7 columns are held back, presumably for a prefix printed by the
        # caller -- TODO confirm. Clamp so that a very narrow terminal
        # cannot produce a negative (invalid) format width.
        width_available = max(term_width - 7, 0)
        w_title = round((2 / 3) * width_available)
        w_authors = width_available - w_title

    title_str = ""
    author_str = ""

    # Fall back to the abstract when the record has no title.
    heading = record.title
    if heading is None:
        heading = record.abstract
    if heading is not None:
        title_str = _fit(heading, w_title)

    if record.authors is not None:
        author_str = _fit(format_to_str(record.authors), w_authors)

    return f"{title_str: <{w_title}} {author_str: <{w_authors}}"