def read_keywords(self):
    """Load the keywords table into ``self.keywords``.

    Reads the file at ``self.keywords_path`` with ``pandas.read_json``.
    When that path does not exist, a warning is logged and
    ``self.keywords`` is set to an empty DataFrame instead.

    Nothing is returned; the result is only stored on the instance.
    The table is presumably titles (index) by keywords (columns), as the
    lookup methods use it that way -- confirm against the stored file.
    """
    path = self.keywords_path

    if os.path.exists(path):
        self.keywords = pd.read_json(path)
        return

    # Missing file: fall back to an empty table so later lookups find nothing.
    logger.warning('Keywords path does not exist.')
    self.keywords = pd.DataFrame()
def get_titles_by_keyword(self, keyword):
    """Return the titles that are flagged with ``keyword``.

    ``keyword`` is looked up as a column of ``self.keywords``; the result
    is the list of index labels whose cell in that column equals 1.

    output:
        A list of index labels (titles). An empty list, after logging a
        warning, when ``keyword`` is not a column of ``self.keywords``.
    """
    if keyword in self.keywords:
        column = self.keywords[keyword]
        flagged = column[column == 1]
        return list(flagged.index)

    # Unknown keyword: report it and hand back an empty result.
    logger.warning(f'Keyword {keyword} not found.')
    return []
def read_descriptions(self):
    """Load the descriptions table into ``self.descriptions``.

    Reads the file at ``self.descriptions_path`` with ``pandas.read_excel``
    and then uses the column named ``'Unnamed: 0'`` as the index, removing
    it from the columns. When the path does not exist, a warning is logged
    and ``self.descriptions`` is set to an empty DataFrame instead.

    Nothing is returned; the result is only stored on the instance.
    """
    path = self.descriptions_path

    if os.path.exists(path):
        self.descriptions = pd.read_excel(path)
        # 'Unnamed: 0' is presumably the header-less first column of the
        # sheet (the saved index) -- confirm against the writer of the file.
        self.descriptions = self.descriptions.set_index('Unnamed: 0', drop=True)
        return

    # Missing file: fall back to an empty table.
    logger.warning('descriptions path does not exist.')
    self.descriptions = pd.DataFrame()
def get_keywords_by_title(self, title):
    """Return the keywords that are flagged for ``title``.

    ``title`` is looked up as an index label of ``self.keywords``; the
    result is the list of column labels whose cell in that row equals 1.

    outputs:
        A list of column labels (keywords). An empty list, after logging a
        warning, when ``title`` is not in the index of ``self.keywords``.
    """
    # Return empty list if [title] not found
    if title not in self.keywords.index:
        logger.warning(f'Title {title} not found.')
        return []

    # Select the row directly by label instead of transposing the whole
    # table just to read a single row.
    row = self.keywords.loc[title]
    return list(row[row == 1].index)
def papers_get_by_title(self, title, fields=None):
    """Get a paper and its contents by ``title``.

    Delegates to ``self.papers_server.get_by_title(title, fields=fields)``
    and converts failures into a logged message plus a ``None`` result.

    Args:
        title: Title passed through to the papers server.
        fields: Fields to request. When omitted (``None``), a fresh
            ``['keywords', 'descriptions']`` list is used for each call, so
            a callee that mutates the list cannot affect later calls.

    outputs:
        Whatever ``papers_server.get_by_title`` returns (presumably a
        JSON-style dict of the requested fields -- confirm against the
        server). ``None`` if the server raises: an ``AssertionError`` is
        logged as a warning, any other ``Exception`` as an error.
    """
    if fields is None:
        fields = ['keywords', 'descriptions']

    try:
        return self.papers_server.get_by_title(title, fields=fields)
    except AssertionError:
        # The server presumably asserts that the title exists -- confirm.
        logger.warning(
            f'WORKER papers_get_by_title cannot get not existing title: {title}.'
        )
        return None
    except Exception as e:
        logger.error(f'WORKER papers_get_by_title failed: {e}')
        return None