def process_article(article, terms=None, entity_type=Idea, output_filename=None,
                    corpus_root='corpus/'):
    """
    Prepare apriori input lines for a single article.

    :param article: object providing ``get_filename(corpus_root)`` and a
        ``sep_dir`` attribute (the latter is only used in progress messages).
    :param terms: terms handed to ``dm.prepare_apriori_input``; when ``None``
        they are obtained from ``select_terms(entity_type)``.
    :param entity_type: passed to ``select_terms`` when ``terms`` is ``None``.
    :param output_filename: if truthy, the lines are written to this path and
        nothing is returned.
    :param corpus_root: passed to ``article.get_filename``.
    :returns: the result of ``dm.prepare_apriori_input`` (an empty list when
        the article file is missing or processing failed) if no
        ``output_filename`` was given, otherwise ``None``.
    """
    if terms is None:
        terms = select_terms(entity_type)

    lines = []
    filename = article.get_filename(corpus_root)

    if filename and os.path.isfile(filename):
        # Single-argument print(...) parses the same way under the print
        # statement and the print function, producing identical output.
        print("processing: %s %s" % (article.sep_dir, filename))
        try:
            doc = extract_article_body(filename)
            lines = dm.prepare_apriori_input(doc, terms, article)
        except Exception as exc:
            # Deliberately best-effort: a failing article must not abort the
            # caller. Catching Exception (rather than a bare except) lets
            # KeyboardInterrupt/SystemExit propagate, and the cause is
            # reported instead of being silently discarded.
            print("ERROR PROCESSING: %s %s (%r)"
                  % (article.sep_dir, filename, exc))
    else:
        print("BAD SEP_DIR: %s" % (article.sep_dir,))

    if output_filename:
        # NOTE: on failure ``lines`` is still empty, so an empty file is
        # written.
        with open(output_filename, 'w') as f:
            f.writelines(lines)
        return None
    return lines
def filter_apriori_input(occur_filename, output_filename, entity_type=Idea,
                         doc_terms=None):
    """
    Write the apriori input derived from an occurrence file to disk.

    :param occur_filename: first argument forwarded to
        ``dm.prepare_apriori_input``.
    :param output_filename: path that is opened for writing (truncating any
        existing content) and receives the prepared lines.
    :param entity_type: passed to ``select_terms`` to choose the terms.
    :param doc_terms: third argument forwarded to
        ``dm.prepare_apriori_input``.
    :returns: ``None``.
    """
    # Pick the terms first; the session is then emptied and closed before
    # the input is prepared.
    selected_terms = select_terms(entity_type)
    Session.expunge_all()
    Session.close()

    prepared = dm.prepare_apriori_input(occur_filename, selected_terms,
                                        doc_terms)

    with open(output_filename, 'w') as out:
        out.writelines(prepared)