def get_libgen_articles(libgen_folder):
    """Yield one article dict per file found in ``<libgen_folder>/txt``.

    Titles and journals are looked up in ``<libgen_folder>/pmc_metadata.tsv``,
    which is read as three tab-separated columns per line: PMC id, title,
    journal. The PMC id of a text file is the part of its file name before
    the first ``.``.

    Args:
        libgen_folder: Folder containing ``pmc_metadata.tsv`` and a ``txt``
            sub-folder with the article text files.

    Yields:
        dict with the keys ``'text'`` (file content collapsed to a single
        whitespace-normalised line), ``'title'``, ``'journal'`` and ``'pmc'``
        (the result of ``remove_pmc_from_pmcid`` on the file's PMC id).

    Raises:
        KeyError: If a text file has no matching row in the metadata file.
        ValueError: If a non-blank metadata line does not have exactly
            three tab-separated fields.
    """
    metadata = {}
    with open(os.path.join(libgen_folder, 'pmc_metadata.tsv')) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing empty line) carries no record;
                # unpacking it would fail with ValueError.
                continue
            pmc_id, title, journal = line.split('\t')
            metadata[pmc_id] = (title, journal)

    txt_folder = os.path.join(libgen_folder, 'txt')
    for filename in os.listdir(txt_folder):
        pmc_id = filename.split('.')[0]
        pmc_numbers = remove_pmc_from_pmcid(pmc_id)
        title, journal = metadata[pmc_id]

        with open(os.path.join(txt_folder, filename)) as f:
            # Drop newlines, hyphens and spaces from both ends of every line
            # before the lines are glued back together.
            file_lines = [line.strip('\n- ') for line in f]

        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        text = re.sub(r'\s+', ' ', ' '.join(file_lines))

        yield {'text': text,
               'title': title,
               'journal': journal,
               'pmc': pmc_numbers}
def get_articles(self):
    """Yield an ``Article`` for every record read from ``self.medline_file``.

    The abstract (``'AB'``), title (``'TI'``), journal (``'JT'``) and PMC id
    (``'PMC'``) entries of each record are joined into strings; an entry that
    is absent from the record becomes the empty string. A non-empty PMC id is
    passed through ``remove_pmc_from_pmcid`` before the article is built.

    Yields:
        ``Article(title, text, journal, pmc)`` for each record.
    """
    for medline_record in get_medline_records(self.medline_file):
        text = ''.join(medline_record['AB']) if 'AB' in medline_record else ''
        title = ''.join(medline_record['TI']) if 'TI' in medline_record else ''
        journal = ''.join(medline_record['JT']) if 'JT' in medline_record else ''

        # Reset for every record: without a default, a record lacking 'PMC'
        # would either raise UnboundLocalError (first record) or silently
        # reuse the PMC id of the previous record.
        pmc = ''.join(medline_record['PMC']) if 'PMC' in medline_record else ''
        if pmc:
            pmc = remove_pmc_from_pmcid(pmc)

        yield Article(title, text, journal, pmc)
def test_remove_pmc_from_pmcid(self):
    # A PMC identifier given with its "PMC" prefix must come back as the
    # bare digits.
    pmcid = "PMC123456"
    expected_digits = "123456"
    self.assertEqual(remove_pmc_from_pmcid(pmcid), expected_digits)