def csv_to_iepy(filepath):
    """Import the documents listed in a CSV file into the IEPY database.

    The file is read as text, through ``gzip`` when ``filepath`` ends with
    ``.gz``. It must provide at least the ``document_id`` and
    ``document_text`` columns. Rows repeating a ``document_id`` already seen
    in this file are skipped; every other row is passed to
    ``DocumentManager.create_document`` with ``update_mode=True`` and the
    base name of the input file stored as ``input_filename`` metadata.

    Exits through ``sys.exit`` when the expected columns are missing, which
    includes the case of a completely empty file.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    opener = gzip.open if filepath.endswith(".gz") else open
    expected_fnames = ['document_id', 'document_text']
    name = os.path.basename(filepath)
    seen = set()
    # Initialised up front so the summary also works for a header-only file.
    rows_read = 0
    # The context manager guarantees the input file is closed on every path,
    # including the sys.exit below.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)
        # fieldnames is None for an empty file; treat that as "no columns".
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))
        # Imported locally (circular-import safety) and only once the header
        # is known to be valid, so malformed input fails before the DB layer
        # is touched.
        from iepy.data.db import DocumentManager
        docdb = DocumentManager()
        for rows_read, row in enumerate(reader, 1):
            doc_id = row["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=row["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
    print('Added {} documents'.format(rows_read))
def csv_to_iepy(filepath):
    """Import the documents listed in a CSV file into the IEPY database.

    The file is read as text, through ``gzip`` when ``filepath`` ends with
    ``.gz``. It must provide at least the ``document_id`` and
    ``document_text`` columns. Rows that the csv module fails to parse are
    logged as warnings and skipped. Rows repeating a ``document_id`` already
    seen in this file are skipped too; every other row is passed to
    ``DocumentManager.create_document`` with ``update_mode=True`` and the
    base name of the input file stored as ``input_filename`` metadata.

    Exits through ``sys.exit`` when the expected columns are missing, which
    includes the case of a completely empty file.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    opener = gzip.open if filepath.endswith(".gz") else open
    expected_fnames = ['document_id', 'document_text']
    name = os.path.basename(filepath)
    seen = set()
    rows_read = 0
    # The context manager guarantees the input file is closed on every path,
    # including the sys.exit below.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)
        # fieldnames is None for an empty file; treat that as "no columns".
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))
        # Imported locally (circular-import safety) and only once the header
        # is known to be valid.
        from iepy.data.db import DocumentManager
        docdb = DocumentManager()
        # Explicit next() instead of a for-loop: a csv.Error on one row must
        # not abort the whole import, only that row.
        while True:
            try:
                row = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # Logger.warn is a deprecated alias; warning() is the
                # supported spelling.
                logger.warning("Couldn't load document: {}".format(error))
                continue
            rows_read += 1
            doc_id = row["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=row["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
    print('Added {} documents'.format(rows_read))
def csv_to_iepy(filepath):
    """Load a CSV of documents into the IEPY document store.

    ``filepath`` is opened in text mode, via ``gzip`` when it ends with
    ``.gz``. The header must contain the ``document_id`` and
    ``document_text`` columns; otherwise the function terminates through
    ``sys.exit`` (an empty file counts as having no columns).

    Each parsed row is stored with ``DocumentManager.create_document`` using
    ``update_mode=True`` and ``{"input_filename": <base name>}`` as metadata.
    A ``document_id`` is stored only the first time it appears in the file,
    and rows raising ``csv.Error`` are logged as warnings and skipped.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    expected_fnames = ['document_id', 'document_text']
    name = os.path.basename(filepath)
    open_text = gzip.open if filepath.endswith(".gz") else open
    # "with" closes the input on normal completion, on errors and on the
    # sys.exit raised for a bad header.
    with open_text(filepath, "rt") as fin:
        reader = csv.DictReader(fin)
        # An empty file leaves fieldnames as None, which set() cannot take.
        header = set(reader.fieldnames or ())
        if not header.issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))
        # Local import (circular-import safety), deferred until the header
        # has been validated.
        from iepy.data.db import DocumentManager
        docdb = DocumentManager()
        seen = set()
        parsed_rows = 0
        # Rows are pulled one at a time so a csv.Error only costs that row.
        while True:
            try:
                record = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # warning() replaces the deprecated Logger.warn alias.
                logger.warning("Couldn't load document: {}".format(error))
                continue
            parsed_rows += 1
            doc_id = record["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(identifier=doc_id,
                                  text=record["document_text"],
                                  metadata={"input_filename": name},
                                  update_mode=True)
    print('Added {} documents'.format(parsed_rows))
def __init__(self, step_runners, documents_manager):
    """Build a pipeline from a list of step runners and a documents manager.

    Any callable can act as a step runner. A runner exposing a ``step``
    attribute is treated as the one responsible for accomplishing that
    PreProcessStep.

    ``documents_manager`` may already be a ``DocumentManager``; any other
    value is passed to ``DocumentManager(...)`` to build one.
    """
    # Imported here rather than at module level to avoid a circular import.
    from iepy.data.db import DocumentManager
    self.step_runners = step_runners
    if isinstance(documents_manager, DocumentManager):
        manager = documents_manager
    else:
        manager = DocumentManager(documents_manager)
    self.documents = manager
def csv_to_iepy(filepath):
    """Import the rows of a CSV file as IEPY documents.

    The file is read as text, through ``gzip`` when ``filepath`` ends with
    ``.gz``. Each row's ``freebase_mid`` column is used as the document
    identifier and its ``description`` column as the document text; the base
    name of the input file is stored as ``input_filename`` metadata. Only
    the first row for each ``freebase_mid`` is imported.

    Progress is reported through the module-level ``logger``.
    """
    logger.info('Importing Documents to IEPY from {}'.format(filepath))
    # Imported locally rather than at module level (circular-import safety).
    from iepy.data.db import DocumentManager
    opener = gzip.open if filepath.endswith(".gz") else open
    name = os.path.basename(filepath)
    seen = set()
    # Initialised up front: with no data rows the loop never binds it, and
    # the summary below must still work.
    rows_read = 0
    # The context manager closes the input file even if an insert fails.
    with opener(filepath, "rt") as fin:
        docdb = DocumentManager()
        for rows_read, row in enumerate(csv.DictReader(fin), 1):
            mid = row["freebase_mid"]
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=row["description"],
                                  metadata={"input_filename": name})
    logger.info('Added {} documents'.format(rows_read))
def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
    """A runner with a ``step`` attribute is called only on the documents
    reported by ``get_documents_lacking_preprocess``."""
    runner = mock.MagicMock(step=PreProcessSteps.tokenization,
                            override=False, increment=False)
    every_doc = [object() for _ in range(5)]
    # Iterating the manager yields all 5 documents, while
    # get_documents_lacking_preprocess reports only the first 2 of them.
    self.patch_object(DocumentManager, '__iter__', return_value=every_doc)
    lacking_patch = self.patch_object(
        DocumentManager, 'get_documents_lacking_preprocess',
        return_value=every_doc[:2])

    pipeline = PreProcessPipeline([runner], DocumentManager())
    pipeline.process_step_in_batch(runner)

    lacking_patch.assert_called_once_with(runner.step)
    self.assertNotEqual(runner.call_count, 5)
    self.assertEqual(runner.call_count, 2)
    expected_calls = [mock.call(doc) for doc in every_doc[:2]]
    self.assertEqual(runner.call_args_list, expected_calls)
class TestDocumentCreationThruManager(ManagerTestCase):
    """Checks on creating and updating documents with
    ``DocumentManager.create_document``."""

    sample_id = 'sample-id'
    sample_text = 'this is a sample text'
    sample_metadata = {'iepy': 'rocks'}
    docmanager = DocumentManager()

    def _create_sample(self):
        # Shared by the tests: stores the sample document and returns it.
        return self.docmanager.create_document(
            self.sample_id, self.sample_text, self.sample_metadata)

    def test_create_basic(self):
        created = self._create_sample()
        self.assertEqual(created.human_identifier, self.sample_id)
        self.assertEqual(created.text, self.sample_text)
        self.assertEqual(created.metadata, self.sample_metadata)
        self.assertEqual(IEDocument.objects.count(), 1)

    def test_create_existent_does_nothing(self):
        first = self._create_sample()
        second = self._create_sample()
        self.assertEqual(first, second)
        self.assertEqual(IEDocument.objects.count(), 1)

    def test_doc_text_and_metadata_are_updated_if_enabled(self):
        longer_text = self.sample_text + ' but longer'
        other_metadata = {'something': 'different'}
        self._create_sample()

        # Without update_mode the stored text and metadata stay as they were.
        untouched = self.docmanager.create_document(
            self.sample_id, longer_text, other_metadata)
        self.assertNotEqual(untouched.text, longer_text)
        self.assertEqual(untouched.text, self.sample_text)
        self.assertNotEqual(untouched.metadata, other_metadata)
        self.assertEqual(untouched.metadata, self.sample_metadata)

        # With update_mode=True both are replaced by the new values.
        updated = self.docmanager.create_document(
            self.sample_id, longer_text, other_metadata, update_mode=True)
        self.assertEqual(updated.text, longer_text)
        self.assertEqual(updated.metadata, other_metadata)
from docopt import docopt
from iepy.data.db import DocumentManager

if __name__ == "__main__":
    # Script entry point: imports the CSV named on the command line into
    # IEPY, one document per distinct "freebase_mid".
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    opts = docopt(__doc__, version=0.1)
    path = opts["<filename>"]
    opener = gzip.open if path.endswith(".gz") else open
    # Only the base name of the input file is recorded in the metadata.
    name = os.path.basename(path)
    docdb = DocumentManager()
    seen = set()
    # The context manager closes the input file even if an insert fails.
    with opener(path, "rt") as fin:
        for row in csv.DictReader(fin):
            mid = row["freebase_mid"]
            # Only the first row for each identifier is imported.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=row["description"],
                                  metadata={"input_filename": name})
preprocess.py -h | --help | --version Options: -h --help Show this screen --version Version number """ import logging from docopt import docopt import iepy iepy.setup(__file__) from iepy.data.db import DocumentManager from iepy.preprocess.stanford_preprocess import StanfordPreprocess from iepy.preprocess.pipeline import PreProcessPipeline from iepy.preprocess.segmenter import SyntacticSegmenterRunner if __name__ == '__main__': logger = logging.getLogger(u'preprocess') logger.setLevel(logging.INFO) logging.basicConfig( level=logging.INFO, format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s") opts = docopt(__doc__, version=0.1) docs = DocumentManager() pipeline = PreProcessPipeline( [StanfordPreprocess(), SyntacticSegmenterRunner(increment=True)], docs) pipeline.process_everything()
def get_episode(pages_dict, number_of_seasons, all_tag, season_tag_pattern): candidates = [pa for pa in pages_dict.values() if has_category_tag(pa, all_tag)] per_season = [] for i in range(1, number_of_seasons + 1): season_tag = season_tag_pattern % i season_ep = [pa for pa in candidates if has_category_tag(pa, season_tag)] per_season.append(season_ep) return per_season if __name__ == '__main__': logging.basicConfig() logger = logging.getLogger('wikia_to_iepy') logger.setLevel(logging.DEBUG) opts = docopt(__doc__, version=0.1) docs = DocumentManager() pages_dict = build_pages_dict(opts['<wikia_zipped_xml_dump_file>']) eps = get_episode(pages_dict, int(opts['<nr_of_seasons>']), opts['--all-episodes-tag'], opts['--season-tag-pattern']) for season_nr, season in enumerate(eps, 1): issues_counter = 0 for i, e in enumerate(season): try: docs.create_document( identifier=e['title'], text='', metadata={ 'raw_text': e['revision']['text']['#text'], 'season': season_nr, 'source': opts['<wikia_zipped_xml_dump_file>']
from docopt import docopt
from iepy.data.db import DocumentManager

if __name__ == "__main__":
    # Script entry point: reads the CSV given as <filename> and stores one
    # IEPY document per distinct "freebase_mid" value.
    logging.basicConfig(level=logging.INFO,
                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    opts = docopt(__doc__, version=0.1)
    input_path = opts["<filename>"]
    # Files ending in ".gz" are read through gzip; both are opened as text.
    open_text = gzip.open if input_path.endswith(".gz") else open
    # The metadata keeps just the file's base name, not the full path.
    name = os.path.basename(input_path)
    docdb = DocumentManager()
    seen = set()
    # "with" releases the file handle on success and on failure alike.
    with open_text(input_path, "rt") as fin:
        for record in csv.DictReader(fin):
            mid = record["freebase_mid"]
            # Later rows repeating an identifier are ignored.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=record["description"],
                                  metadata={"input_filename": name})