def get_libgen_articles(libgen_folder):
    """Yield one article dict per file found in ``<libgen_folder>/txt``.

    Metadata is read from ``<libgen_folder>/pmc_metadata.tsv``, a
    tab-separated file with three columns per row: PMC id, title and journal.
    Each file in the ``txt`` sub-folder is matched to its metadata row by the
    part of its file name before the first ``.``.

    Args:
        libgen_folder: Directory containing ``pmc_metadata.tsv`` and a ``txt``
            sub-folder with the article text files.

    Yields:
        dict with the keys ``'text'`` (the file content collapsed to
        single-spaced text), ``'title'``, ``'journal'`` and ``'pmc'`` (the
        value returned by ``remove_pmc_from_pmcid`` for the file's PMC id).

    Raises:
        ValueError: If a non-blank metadata row does not have exactly three
            tab-separated fields.
        KeyError: If a text file has no matching metadata row.
    """
    metadata = {}
    with open(os.path.join(libgen_folder, 'pmc_metadata.tsv')) as f:
        for line_number, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing empty line) carry no record.
            if not line.strip():
                continue
            # Only the line terminator is removed before splitting: stripping
            # all whitespace would also eat trailing tabs and drop an empty
            # last column, breaking the three-way unpacking below.
            fields = [field.strip()
                      for field in line.rstrip('\r\n').split('\t')]
            if len(fields) != 3:
                raise ValueError(
                    'pmc_metadata.tsv line %d: expected 3 tab-separated '
                    'fields, got %d' % (line_number, len(fields)))
            pmc_id, title, journal = fields
            metadata[pmc_id] = (title, journal)

    txt_folder = os.path.join(libgen_folder, 'txt')
    for filename in os.listdir(txt_folder):
        pmc_id = filename.split('.')[0]
        pmc_numbers = remove_pmc_from_pmcid(pmc_id)
        title, journal = metadata[pmc_id]
        with open(os.path.join(txt_folder, filename)) as f:
            # Trim newlines, hyphens and spaces from both ends of every line.
            file_lines = [line.strip('\n- ') for line in f]
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        text = re.sub(r'\s+', ' ', ' '.join(file_lines))
        yield {'text': text, 'title': title, 'journal': journal, 'pmc': pmc_numbers}
 def get_articles(self):
     """Yield an ``Article`` for each record read from ``self.medline_file``.

     Records come from ``get_medline_records(self.medline_file)``. For every
     record the ``'AB'`` field becomes the article text, ``'TI'`` the title,
     ``'JT'`` the journal and ``'PMC'`` the PMC id. Each field value is
     joined without a separator, and a field missing from the record
     becomes the empty string. A non-empty PMC id is passed through
     ``remove_pmc_from_pmcid`` before use.

     Yields:
         ``Article(title, text, journal, pmc)`` for every record.
     """
     def joined(record, key):
         # Absent fields default to '' so every record is handled alike.
         return ''.join(record[key]) if key in record else ''

     for medline_record in get_medline_records(self.medline_file):
         text = joined(medline_record, 'AB')
         title = joined(medline_record, 'TI')
         journal = joined(medline_record, 'JT')
         # pmc is reset for every record; otherwise a record without a 'PMC'
         # field would either hit an unbound name or silently reuse the
         # PMC id of the previous record.
         pmc = joined(medline_record, 'PMC')
         if pmc:
             pmc = remove_pmc_from_pmcid(pmc)
         yield Article(title, text, journal, pmc)
 def test_remove_pmc_from_pmcid(self):
     """Check that ``remove_pmc_from_pmcid`` maps "PMC123456" to "123456"."""
     pmcid = "PMC123456"
     expected = "123456"
     # Keep the (actual, expected) argument order used by this test.
     self.assertEqual(remove_pmc_from_pmcid(pmcid), expected)