Example #1
0
    def run(self, data):
        """Find an e-mail for every author row of a TSV file, resumably.

        ``data`` is the path of a tab-separated file. For each row, the
        first column is used as the author name and the second as the
        organisation. Rows are appended to
        ``results/<file name of data>_email_found.txt`` as the original
        columns followed by whatever ``self.iter_find_author_email``
        returns (it must be a list of strings, since it is concatenated to
        the row and tab-joined).

        Author/organisation pairs that ``self.array_in`` reports as already
        present in the existing results file are skipped, so an interrupted
        run can be restarted with the same input.

        Args:
            data: Path to the tab-separated input file.

        Returns:
            None. Output is written to the results file.
        """
        # Local import: this block is the only visible user of ``os``.
        import os

        results_file_name = "results/" + re.sub(".*/", "",
                                                data) + "_email_found.txt"
        table = pd.read_csv(data, sep="\t")
        colnames = list(table.columns) + ["邮箱", "作者信息", "作者机构", "文章标题", "查找方式"]
        rows = table.values

        try:
            print("results stored in {}".format(results_file_name))
            # NOTE(review): presumably returns the rows already written to
            # the results file -- confirm against get_file_column.
            done = self.__net.get_file_column(results_file_name)
            done = [list(i) for i in done]
        except Exception as e:
            # Best effort: any failure to read earlier results means we
            # start with an empty "done" list.
            print(e)
            print("did not found results table, create a new one")
            done = []

        # The output directory may not exist on a first run.
        os.makedirs(os.path.dirname(results_file_name), exist_ok=True)
        # Decide on the header from the file itself rather than from
        # ``done``: a file that already holds only the header (or could not
        # be parsed) yields an empty ``done`` and would otherwise get a
        # second header line appended.
        needs_header = (not os.path.exists(results_file_name)
                        or os.path.getsize(results_file_name) == 0)

        bar = ProgressBar(rows.shape[0])
        with open(results_file_name, mode='a', encoding='utf-8') as results:
            if needs_header:
                results.write('\t'.join(colnames) + '\n')
            for row in rows:
                row = [str(i) for i in row]
                author, organ = row[0].strip(), row[1].strip()
                if not self.array_in([author, organ], done):
                    email = self.iter_find_author_email(author, organ)
                    results.write('\t'.join(row + email) + '\n')
                    done.append([author, organ])
                    # Flush per row so progress survives an interruption.
                    results.flush()
                # The progress bar advances for processed and skipped rows.
                bar.move()
Example #2
0
    def run(self, journal_list_file):
        """Collect article information for every journal listed in a file.

        ``journal_list_file`` is read as one journal name per line (blank
        lines are ignored). For each journal not yet recorded in
        ``data/done_journals.txt``, every paper id yielded by
        ``self.all_journal_ids`` that is not already known is passed to
        ``self.find_article_info`` and the returned fields are appended,
        tab-separated, to
        ``results/<journal_list_file stem>_article_information.txt``.
        Both paths are relative to the directory containing this module.

        The run is resumable at two levels: finished journals are recorded
        in ``done_journals.txt``, and paper ids already present in the
        results file are skipped.

        Args:
            journal_list_file: Path to the text file of journal names.

        Returns:
            None. Output is written to the results and checkpoint files.
        """
        base_path = os.path.dirname(os.path.abspath(__file__)) + '/'
        results_file_name = base_path + "results/" + \
            os.path.splitext(os.path.basename(journal_list_file))[
                0] + "_article_information.txt"
        done_journal_list_file_name = base_path + "data/done_journals.txt"

        # The input list is opened with the platform default encoding, as
        # before, because the encoding of user-supplied files is unknown.
        with open(journal_list_file) as jl:
            journal_list = [jn.strip() for jn in jl if jn.strip()]

        colnames = [
            '文章ID', '期刊名', '年份', '期', '论文题目', '作者', '机构', '关键词', '作者简介',
            '通讯作者姓名', '通讯作者邮箱', '通讯作者电话', '第一作者姓名', '第一作者邮箱', '第一作者电话'
        ]

        try:
            print("results stored in {}".format(results_file_name))
            # NOTE(review): presumably column 1 holds the paper ids already
            # written -- confirm against get_file_column. A set keeps the
            # per-paper membership test O(1).
            done = set(
                self.__net.get_file_column(results_file_name,
                                           number=1).tolist())
        except Exception:
            # Best effort: any failure to read earlier results means no
            # paper is treated as done.
            print("did not found results table, create a new one")
            done = set()

        try:
            # This file is written below as UTF-8, so it must be read back
            # as UTF-8; with the platform default a non-UTF-8 locale would
            # fail or mis-decode and every journal would be re-scraped.
            with open(done_journal_list_file_name, encoding='utf-8') as djl:
                done_journals = {jn.strip() for jn in djl if jn.strip()}
        except (OSError, UnicodeError):
            # Missing or unreadable checkpoint: start from scratch.
            done_journals = set()

        # The output directories may not exist on a first run.
        os.makedirs(os.path.dirname(results_file_name), exist_ok=True)
        os.makedirs(os.path.dirname(done_journal_list_file_name),
                    exist_ok=True)
        # Decide on the header from the file itself rather than from
        # ``done``: a header-only (or unparsable) results file yields an
        # empty ``done`` and would otherwise get a second header appended.
        needs_header = (not os.path.exists(results_file_name)
                        or os.path.getsize(results_file_name) == 0)

        bar = ProgressBar(len(journal_list))
        with open(results_file_name, mode='a',
                  encoding='utf-8') as results, open(
                      done_journal_list_file_name, mode='a',
                      encoding='utf-8') as done_journal_file:
            if needs_header:
                results.write('\t'.join(colnames) + '\n')
            for journal in journal_list:
                if journal not in done_journals:
                    for paper_id in self.all_journal_ids(journal):
                        if paper_id not in done:
                            found_info = self.find_article_info(paper_id)
                            results.write('\t'.join(found_info) + '\n')
                            done.add(paper_id)
                    # Write out the journal's rows before marking it done,
                    # so the checkpoint is never ahead of the results file.
                    results.flush()
                    done_journal_file.write(journal + '\n')
                    done_journal_file.flush()
                    done_journals.add(journal)
                # The progress bar advances for processed and skipped
                # journals.
                bar.move()