def run(self, data):
    """Look up an e-mail for every author row of a tab-separated table.

    ``data`` is the path of a tab-separated file readable by
    ``pandas.read_csv``; the first two columns of each row are used as the
    author name and the author's organisation.  Each processed row is
    appended, together with whatever ``self.iter_find_author_email``
    returns, to ``results/<basename of data>_email_found.txt`` (relative to
    the current working directory).

    Rows whose ``[author, organisation]`` pair is already among the pairs
    loaded from an existing results file are skipped, so an interrupted run
    can be restarted with the same input.

    Args:
        data: Path to the tab-separated input table.
    """
    output_path = "results/" + re.sub(".*/", "", data) + "_email_found.txt"
    table = pd.read_csv(data, sep="\t")
    # Appended columns: e-mail, author info, author organisation,
    # article title, lookup method.
    header = list(table.columns) + ["邮箱", "作者信息", "作者机构", "文章标题", "查找方式"]
    records = table.values

    try:
        print("results stored in {}".format(output_path))
        # Presumably yields the (author, organisation) pairs already written
        # to the results file -- confirm against get_file_column.
        previous = self.__net.get_file_column(output_path)
        processed = [list(entry) for entry in previous]
    except Exception as err:
        # Any failure to load earlier results is treated as "start fresh".
        print(err)
        print("did not found results table, create a new one")
        processed = []

    progress = ProgressBar(records.shape[0])
    with open(output_path, mode='a', encoding='utf-8') as out:
        # The header line is only written when nothing was loaded above.
        if not processed:
            out.write('\t'.join(header) + '\n')

        for record in records:
            cells = [str(value) for value in record]
            author = cells[0].strip()
            organ = cells[1].strip()

            if self.array_in([author, organ], processed):
                # Already handled in an earlier run (or earlier in this one).
                progress.move()
                continue

            # The lookup result is concatenated to the row, so it is
            # expected to be a list of strings.
            found = self.iter_find_author_email(author, organ)
            out.write('\t'.join(cells + found) + '\n')
            processed.append([author, organ])
            # Flush after every row so finished work survives a crash.
            out.flush()
            progress.move()
def run(self, journal_list_file):
    """Collect article information for every journal named in a list file.

    Each non-blank line of ``journal_list_file`` is taken as a journal name.
    For every journal not yet recorded in ``data/done_journals.txt``, the
    paper ids from ``self.all_journal_ids`` are iterated; for each id not
    already known, ``self.find_article_info`` is called and its result is
    appended as one tab-separated line to
    ``results/<stem of journal_list_file>_article_information.txt``.
    Both paths are resolved relative to the directory of this source file.
    The journal name is then appended to ``data/done_journals.txt`` so a
    later run skips it.

    Args:
        journal_list_file: Path to a text file with one journal name per
            line.
    """
    base_path = os.path.dirname(os.path.abspath(__file__)) + '/'
    list_stem = os.path.splitext(os.path.basename(journal_list_file))[0]
    results_file_name = (
        base_path + "results/" + list_stem + "_article_information.txt"
    )
    done_journal_list_file_name = base_path + "data/done_journals.txt"

    # NOTE(review): the input list is still read with the platform default
    # encoding because its encoding is chosen by whoever supplies the file.
    with open(journal_list_file) as jl:
        journal_list = [line.strip() for line in jl if line.strip()]

    # Columns: article id, journal name, year, issue, paper title, authors,
    # institutions, keywords, author bio, then name / e-mail / phone of the
    # corresponding author and of the first author.
    colnames = [
        '文章ID', '期刊名', '年份', '期', '论文题目', '作者', '机构', '关键词',
        '作者简介', '通讯作者姓名', '通讯作者邮箱', '通讯作者电话',
        '第一作者姓名', '第一作者邮箱', '第一作者电话',
    ]

    try:
        print("results stored in {}".format(results_file_name))
        # Presumably returns the paper ids already present in the results
        # file -- confirm against get_file_column.
        done = self.__net.get_file_column(results_file_name, number=1)
        done = done.tolist()
    except Exception:
        # The helper is opaque from here, so any failure to load earlier
        # results is treated as "start from scratch".
        print("did not found results table, create a new one")
        done = []

    try:
        # Read with the same encoding the file is written with below;
        # relying on the locale default could mis-decode the journal names
        # and silently discard the resume list.
        with open(done_journal_list_file_name, encoding='utf-8') as djl:
            done_journal_list = [line.strip() for line in djl if line.strip()]
    except (OSError, UnicodeError):
        # Missing or unreadable progress file: nothing is considered done.
        done_journal_list = []

    # Opening in append mode creates the files but not their directories.
    os.makedirs(os.path.dirname(results_file_name), exist_ok=True)
    os.makedirs(os.path.dirname(done_journal_list_file_name), exist_ok=True)

    bar = ProgressBar(len(journal_list))
    with open(results_file_name, mode='a', encoding='utf-8') as results, \
            open(done_journal_list_file_name, mode='a',
                 encoding='utf-8') as done_journal_file:
        # The header line is only written when no earlier ids were loaded.
        if not done:
            results.write('\t'.join(colnames) + '\n')

        for journal in journal_list:
            if journal not in done_journal_list:
                for paper_id in self.all_journal_ids(journal):
                    if paper_id in done:
                        continue
                    found_info = self.find_article_info(paper_id)
                    results.write('\t'.join(found_info) + '\n')
                    done.append(paper_id)
                # Record the journal only after all of its papers were
                # written, so an interrupted journal is retried next run.
                done_journal_file.write(journal + '\n')
                done_journal_list.append(journal)
            # The bar advances once per journal, processed or skipped.
            bar.move()