def scrub_articles_from_root(source_dir, stops, updater=500):
    """Scrub extraneous newlines from every article file under *source_dir*.

    Walks the files produced by Raw.generate_files, removes newlines
    matching *stops* via scrub_article, and writes each result back to
    the same path in place.

    Parameters:
        source_dir: root directory whose article files are scrubbed.
        stops: stop patterns forwarded to scrub_article.
        updater: emit a stderr progress message every *updater* articles.
    """
    sys.stderr.write("Scrubbing extraneous newlines...\n")
    for n, path in enumerate(Raw.generate_files(source_dir)):
        # Context managers close both handles even on error; the original
        # leaked the read handle entirely and leaked the write handle on
        # any exception between open() and close().
        with codecs.open(path, "r", "utf8") as f_in:
            text = f_in.read()
        with codecs.open(path, "w", "utf8") as f_out:
            f_out.write(scrub_article(text, stops))
        if n % updater == 0:
            # Trailing newline added so successive progress messages do
            # not run together on stderr.
            sys.stderr.write("%d articles scrubbed...\n" % n)
    sys.stderr.write("All scrubbed up.\n")
import os
from pmcminer.ProcessFiles import Raw, Articles
from control import settings


def check_DOI_by_dict(DOI_dict, article_ID, data_dir, raw_article_dir):
    """Yield (subject, DOI, filename) tuples for a PMC XML article.

    For each subject in DOI_dict, yields a tuple whenever one of the
    article's IDs (article_ID[0]) appears in that subject's DOI
    collection.  article_ID[1] is the article's file path.
    NOTE(review): data_dir and raw_article_dir are accepted but unused
    in this body -- confirm against callers before removing them.
    """
    for subject in DOI_dict:
        for item in article_ID[0]:
            if item in DOI_dict[subject]:
                yield (subject, item, article_ID[1])


# Build the DOI lookup, then a lazy pipeline over every archived article:
# parse each XML file, extract its article-id elements, and keep the file
# path paired with each result throughout.
DOIs = Raw.get_DOI_list(settings.DOI_LISTS)
archived_articles = Raw.generate_files(settings.ARCHIVE_DIR)
article_trees = ((Articles.parse_XML(article), article) for article in archived_articles)
article_IDs = ((list(Articles.extract_element(tree[0], "front/article-meta/article-id")), tree[1]) for tree in article_trees)
# One empty result mapping per subject key in DOI_LISTS.
check_dict = dict(((key, {}) for key in settings.DOI_LISTS.keys()))
#for ID in article_IDs:
for n, ID in enumerate(article_IDs):
    checker = check_DOI_by_dict(DOIs, ID, settings.DATA_DIR, settings.RAW_ARTICLES_DIR)
    for i in checker:
        # i == (subject, DOI, filename); group filenames by subject, DOI.
        # EAFP: first filename for a DOI creates the list, later ones append.
        try:
            check_dict[i[0]][i[1]].append(i[2])
        except KeyError:
            check_dict[i[0]][i[1]] = [i[2]]
    if n % 500 == 0:
        # NOTE(review): this chunk is truncated here -- the if-body is
        # missing (presumably a progress message); confirm against the
        # full file before relying on this script.
import os
import sys
import itertools
from control import settings
from pmcminer.ProcessFiles import Articles
from pmcminer.ProcessFiles import Raw

# Lazy stream of {"articles": <file generator>, "subject": <name>} dicts,
# one per subject listed in DOI_LISTS, each rooted at
# DATA_DIR/RAW_ARTICLES_DIR/<subject>.
articles = ({
    "articles": Raw.generate_files(os.path.join(settings.DATA_DIR, settings.RAW_ARTICLES_DIR, subject)),
    "subject": subject} for subject in settings.DOI_LISTS)


class Purify(Articles.ExtractArticleBody):
    """Build an article set for non-overlapping article subjects.

    Pass a tuple or list as dirSet to restrict which directories within
    inPath are treated as subjects; otherwise every directory directly
    under inPath is used.
    """
    def __init__(self, inPath, outPath, dirSet = False):
        # dirSet, when given, explicitly names the subject directories;
        # the fallback scans inPath for immediate subdirectories.
        if dirSet:
            self.subjects = dirSet
        else:
            self.subjects = [subject for subject in os.listdir(inPath) if os.path.isdir(os.path.join(inPath, subject))]
        # Per-subject file counts (number of entries in each subject dir).
        # NOTE(review): outPath is accepted but unused in the visible part
        # of __init__ -- presumably used by methods beyond this chunk;
        # confirm against the full class.
        self.subjectCount = [len(os.listdir(os.path.join(inPath, subject))) for subject in self.subjects]