def output_pure_article(self, article, subject):
    """Load one article, derive its text fields, and write the result.

    The article is read from ``<self.inPath>/<subject>/<article>`` and the
    following attributes are (re)assigned on the instance, in this order:
    ``article`` (the input path), ``tree`` (result of
    ``Articles.parse_XML_no_Table``), ``body``, ``sentences`` and ``doi``
    (whatever ``Articles.extract_element`` returns for the article-id path).

    The directory ``<self.outPath>/<subject>`` is created when it does not
    exist yet (a notice goes to stderr) and is then handed to ``self.write``.
    """
    source_path = os.path.join(self.inPath, subject, article)
    self.article = source_path

    parsed = Articles.parse_XML_no_Table(source_path)
    self.tree = parsed
    self.body = self.get_article_body_text(parsed)
    self.sentences = self.tokenize_text(self.body)
    self.doi = Articles.extract_element(parsed, "front/article-meta/article-id")

    # Only the leaf directory is created; the parent is presumably expected
    # to exist already (os.mkdir does not create intermediate directories).
    subject_out_dir = os.path.join(self.outPath, subject)
    if not os.path.exists(subject_out_dir):
        sys.stderr.write("creating directory %s\n" % subject_out_dir)
        os.mkdir(subject_out_dir)

    self.write(subject_out_dir)
Exemple #2
0
def collect_files_by_DOI(DOI_dict, archive_dir, data_dir, raw_article_dir, updater=200):
    """
        Walk every archived article and pass its IDs to check_and_copy_DOI.

        For each entry produced by generate_files(archive_dir) the XML is
        parsed, the "front/article-meta/article-id" elements are collected
        into a list, and the pair (id_list, article) is handed to
        check_and_copy_DOI together with DOI_dict and the two target
        directories (presumably that helper does the actual copying into
        subject dirs -- confirm against its definition).

        A progress line is written to stderr whenever the zero-based article
        index is a multiple of `updater`.

        The per-subject mapping threaded through check_and_copy_DOI is not
        returned; the function returns None.
    """
    archived_articles = generate_files(archive_dir)

    # One independent dict per configured subject key.
    found_DOIs = {}
    for subject in settings.DOI_LISTS.keys():
        found_DOIs[subject] = {}

    id_path = "front/article-meta/article-id"
    for n, article in enumerate(archived_articles):
        # Parse and extract lazily, one article at a time.
        tree = Articles.parse_XML(article)
        ID = (list(Articles.extract_element(tree, id_path)), article)
        found_DOIs = check_and_copy_DOI(DOI_dict, ID, data_dir, raw_article_dir, found_DOIs)
        if n % updater == 0:
            sys.stderr.write("%d files processed...\n" % n)
Exemple #3
0
import sys

from pmcminer.ProcessFiles import Raw, Articles
from control import settings

def check_DOI_by_dict(DOI_dict, article_ID, data_dir, raw_article_dir):
    """Yield (subject, matched_id, filename) for every ID known to a subject.

    DOI_dict maps a subject to a container of identifiers; article_ID is a
    pair whose first element is an iterable of identifiers and whose second
    element is passed through unchanged as the third tuple member.
    data_dir and raw_article_dir are accepted but not used here.
    """
    for subject, known_ids in DOI_dict.items():
        for candidate in article_ID[0]:
            if candidate not in known_ids:
                continue
            yield subject, candidate, article_ID[1]
                

# Script: scan the whole archive and record, per subject, which archive
# entries carry each identifier listed for that subject.
#
# Requires `sys` for the progress output below.
DOIs = Raw.get_DOI_list(settings.DOI_LISTS)

# Everything up to the loop is lazy: articles are parsed and their
# article-id elements extracted one at a time as the loop advances.
archived_articles = Raw.generate_files(settings.ARCHIVE_DIR)
article_trees = ((Articles.parse_XML(article), article) for article in archived_articles)
article_IDs = ((list(Articles.extract_element(tree[0],
                "front/article-meta/article-id")), tree[1]) for tree in article_trees)

# check_dict[subject][identifier] -> list of archive entries containing it.
check_dict = dict(((key, {}) for key in settings.DOI_LISTS.keys()))

for n, ID in enumerate(article_IDs):
    matches = check_DOI_by_dict(DOIs, ID, settings.DATA_DIR, settings.RAW_ARTICLES_DIR)
    for subject, doi, filename in matches:
        # setdefault creates the list on first sight of an identifier; an
        # unknown subject still raises KeyError, as before.
        check_dict[subject].setdefault(doi, []).append(filename)
    # n is zero-based, so the first progress line appears after the first file.
    if n % 500 == 0:
        sys.stderr.write("%d files processed...\n" % n)