Example #1
0
File: utils.py Project: lowks/iepy
def csv_to_iepy(filepath):
    """Load the documents listed in a CSV file into the IEPY database.

    The file is opened in text mode; when its name ends in ``.gz`` it is
    read through ``gzip``. The CSV header must contain at least the columns
    ``document_id`` and ``document_text``; otherwise the process terminates
    through ``sys.exit`` with an explanatory message (this also covers an
    empty file, which has no header at all).

    Every distinct ``document_id`` is handed once to
    ``DocumentManager.create_document`` (with ``update_mode=True``) along
    with its text and the base name of the input file as metadata. Rows
    repeating an id already seen in this file are skipped.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    opener = gzip.open if filepath.endswith(".gz") else open
    # The context manager closes the file on every exit path, including the
    # sys.exit below and any error raised while documents are being created.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        expected_fnames = ['document_id', 'document_text']
        # fieldnames is None for an empty file; treat that as "no columns".
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        name = os.path.basename(filepath)

        docdb = DocumentManager()
        seen = set()
        for d in reader:
            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=d["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
            # Report the documents actually sent to the manager, not the
            # number of rows read (which would include skipped duplicates).
            print('Added {} documents'.format(len(seen)))
Example #2
0
File: utils.py Project: 52nlp/iepy
def csv_to_iepy(filepath):
    """Load the documents listed in a CSV file into the IEPY database.

    The file is opened in text mode; when its name ends in ``.gz`` it is
    read through ``gzip``. The CSV header must contain at least the columns
    ``document_id`` and ``document_text``; otherwise the process terminates
    through ``sys.exit`` with an explanatory message (this also covers an
    empty file, which has no header at all).

    Rows that the csv parser rejects are logged as warnings and skipped.
    Every distinct ``document_id`` is handed once to
    ``DocumentManager.create_document`` (with ``update_mode=True``) along
    with its text and the base name of the input file as metadata. Rows
    repeating an id already seen in this file are skipped.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    opener = gzip.open if filepath.endswith(".gz") else open
    # The context manager closes the file on every exit path, including the
    # sys.exit below and any error raised while documents are being created.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        expected_fnames = ['document_id', 'document_text']
        # fieldnames is None for an empty file; treat that as "no columns".
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        name = os.path.basename(filepath)

        docdb = DocumentManager()
        seen = set()

        # An explicit next() loop (instead of a for loop) lets a row that
        # raises csv.Error be skipped without aborting the whole import.
        while True:
            try:
                d = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # Logger.warn is a deprecated alias; use Logger.warning.
                logger.warning("Couldn't load document: {}".format(error))
                continue

            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=d["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
            # Report the documents actually sent to the manager, not the
            # number of rows read (which would include skipped duplicates).
            print('Added {} documents'.format(len(seen)))
Example #3
0
def csv_to_iepy(filepath):
    """Load the documents listed in a CSV file into the IEPY database.

    The file is opened in text mode; when its name ends in ``.gz`` it is
    read through ``gzip``. The CSV header must contain at least the columns
    ``document_id`` and ``document_text``; otherwise the process terminates
    through ``sys.exit`` with an explanatory message (this also covers an
    empty file, which has no header at all).

    Rows that the csv parser rejects are logged as warnings and skipped.
    Every distinct ``document_id`` is handed once to
    ``DocumentManager.create_document`` (with ``update_mode=True``) along
    with its text and the base name of the input file as metadata. Rows
    repeating an id already seen in this file are skipped.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    opener = gzip.open if filepath.endswith(".gz") else open
    # The context manager closes the file on every exit path, including the
    # sys.exit below and any error raised while documents are being created.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        expected_fnames = ['document_id', 'document_text']
        # fieldnames is None for an empty file; treat that as "no columns".
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        name = os.path.basename(filepath)

        docdb = DocumentManager()
        seen = set()

        # An explicit next() loop (instead of a for loop) lets a row that
        # raises csv.Error be skipped without aborting the whole import.
        while True:
            try:
                d = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # Logger.warn is a deprecated alias; use Logger.warning.
                logger.warning("Couldn't load document: {}".format(error))
                continue

            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(identifier=doc_id,
                                  text=d["document_text"],
                                  metadata={"input_filename": name},
                                  update_mode=True)
            # Report the documents actually sent to the manager, not the
            # number of rows read (which would include skipped duplicates).
            print('Added {} documents'.format(len(seen)))
Example #4
0
    def __init__(self, step_runners, documents_manager):
        """Store the step runners and the documents manager to work with.

            ``step_runners`` is a list of callables. If a runner has an
            attribute ``step``, that runner is treated as the one
            responsible for accomplishing such a PreProcessStep.

            ``documents_manager`` may already be a DocumentManager; any
            other value is wrapped by passing it to ``DocumentManager(...)``.
        """
        # Imported here rather than at module level to avoid circular imports.
        from iepy.data.db import DocumentManager
        self.step_runners = step_runners
        if isinstance(documents_manager, DocumentManager):
            manager = documents_manager
        else:
            manager = DocumentManager(documents_manager)
        self.documents = manager
Example #5
0
def csv_to_iepy(filepath):
    """Load the documents listed in a CSV file into the IEPY database.

    The file is opened in text mode; when its name ends in ``.gz`` it is
    read through ``gzip``. Each row must provide the columns
    ``freebase_mid`` (used as the document identifier) and ``description``
    (used as the document text); a row lacking either raises ``KeyError``.

    Every distinct ``freebase_mid`` is handed once to
    ``DocumentManager.create_document`` along with its description and the
    base name of the input file as metadata. Rows repeating an id already
    seen in this file are skipped.
    """
    logger.info('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    opener = gzip.open if filepath.endswith(".gz") else open
    # The context manager closes the file on every exit path, including
    # errors raised while documents are being created.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)
        name = os.path.basename(filepath)

        docdb = DocumentManager()
        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})
            # Report the documents actually sent to the manager, not the
            # number of rows read (which would include skipped duplicates).
            logger.info('Added {} documents'.format(len(seen)))
 def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
     # The manager iterates over 5 documents, but
     # get_documents_lacking_preprocess reports only the first 2 of them;
     # the runner must be called for exactly those 2.
     runner = mock.MagicMock(step=PreProcessSteps.tokenization,
                             override=False,
                             increment=False)
     docs = [object() for _ in range(5)]
     pending = docs[:2]
     self.patch_object(DocumentManager, '__iter__', return_value=docs)
     lacking_mock = self.patch_object(DocumentManager,
                                      'get_documents_lacking_preprocess',
                                      return_value=pending)
     pipeline = PreProcessPipeline([runner], DocumentManager())
     pipeline.process_step_in_batch(runner)
     # The pending documents were requested once, for the runner's step.
     lacking_mock.assert_called_once_with(runner.step)
     self.assertNotEqual(runner.call_count, 5)
     self.assertEqual(runner.call_count, 2)
     expected_calls = [mock.call(doc) for doc in pending]
     self.assertEqual(runner.call_args_list, expected_calls)
class TestDocumentCreationThruManager(ManagerTestCase):
    """Tests for creating documents through DocumentManager.create_document."""

    sample_id = 'sample-id'
    sample_text = 'this is a sample text'
    sample_metadata = {'iepy': 'rocks'}
    docmanager = DocumentManager()

    def _create(self, text, metadata, **extra):
        # Shared shortcut: always creates under the sample identifier.
        return self.docmanager.create_document(self.sample_id, text,
                                               metadata, **extra)

    def test_create_basic(self):
        # A freshly created document exposes the given values and is stored.
        created = self._create(self.sample_text, self.sample_metadata)
        self.assertEqual(created.human_identifier, self.sample_id)
        self.assertEqual(created.text, self.sample_text)
        self.assertEqual(created.metadata, self.sample_metadata)
        self.assertEqual(IEDocument.objects.count(), 1)

    def test_create_existent_does_nothing(self):
        # Creating twice with the same identifier yields the same document
        # and leaves a single stored document.
        first = self._create(self.sample_text, self.sample_metadata)
        second = self._create(self.sample_text, self.sample_metadata)
        self.assertEqual(first, second)
        self.assertEqual(IEDocument.objects.count(), 1)

    def test_doc_text_and_metadata_are_updated_if_enabled(self):
        longer_text = self.sample_text + ' but longer'
        other_metadata = {'something': 'different'}
        self._create(self.sample_text, self.sample_metadata)

        # Without update_mode the stored text and metadata stay untouched.
        unchanged = self._create(longer_text, other_metadata)
        self.assertNotEqual(unchanged.text, longer_text)
        self.assertEqual(unchanged.text, self.sample_text)
        self.assertNotEqual(unchanged.metadata, other_metadata)
        self.assertEqual(unchanged.metadata, self.sample_metadata)

        # With update_mode=True both fields take the new values.
        updated = self._create(longer_text, other_metadata, update_mode=True)
        self.assertEqual(updated.text, longer_text)
        self.assertEqual(updated.metadata, other_metadata)
Example #8
0
from docopt import docopt

from iepy.data.db import DocumentManager

if __name__ == "__main__":
    # Script entry point: load the CSV named on the command line, creating
    # one document per distinct "freebase_mid" value.
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    opts = docopt(__doc__, version=0.1)

    path = opts["<filename>"]
    # Files whose name ends in ".gz" are read through gzip.
    opener = gzip.open if path.endswith(".gz") else open
    # The context manager closes the input file even if the import fails.
    with opener(path, "rt") as fin:
        reader = csv.DictReader(fin)
        name = os.path.basename(path)

        docdb = DocumentManager()

        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            # Only the first row of each identifier is imported.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})
Example #9
0
    preprocess.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
import logging

from docopt import docopt

import iepy

iepy.setup(__file__)
from iepy.data.db import DocumentManager
from iepy.preprocess.stanford_preprocess import StanfordPreprocess
from iepy.preprocess.pipeline import PreProcessPipeline
from iepy.preprocess.segmenter import SyntacticSegmenterRunner

if __name__ == '__main__':
    # Script entry point: set up logging, parse the command line and run
    # every preprocess step over the documents.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    log_format = u"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    opts = docopt(__doc__, version=0.1)
    docs = DocumentManager()
    # Runners are listed in the order they are handed to the pipeline.
    step_runners = [
        StanfordPreprocess(),
        SyntacticSegmenterRunner(increment=True),
    ]
    pipeline = PreProcessPipeline(step_runners, docs)
    pipeline.process_everything()
Example #10
0
def get_episode(pages_dict, number_of_seasons, all_tag, season_tag_pattern):
    """Group the pages tagged as episodes by season.

    Pages from ``pages_dict.values()`` are kept when ``has_category_tag``
    accepts them for ``all_tag``. For each season number from 1 to
    ``number_of_seasons`` the season tag is built as
    ``season_tag_pattern % number`` and the kept pages carrying that tag are
    collected.

    Returns a list with one list of pages per season, in season order.
    """
    episodes = [page for page in pages_dict.values()
                if has_category_tag(page, all_tag)]
    season_tags = (season_tag_pattern % season_nr
                   for season_nr in range(1, number_of_seasons + 1))
    return [
        [page for page in episodes if has_category_tag(page, tag)]
        for tag in season_tags
    ]

if __name__ == '__main__':
    logging.basicConfig()
    logger = logging.getLogger('wikia_to_iepy')
    logger.setLevel(logging.DEBUG)
    opts = docopt(__doc__, version=0.1)
    docs = DocumentManager()
    pages_dict = build_pages_dict(opts['<wikia_zipped_xml_dump_file>'])
    eps = get_episode(pages_dict, int(opts['<nr_of_seasons>']),
                      opts['--all-episodes-tag'],
                      opts['--season-tag-pattern'])
    for season_nr, season in enumerate(eps, 1):
        issues_counter = 0
        for i, e in enumerate(season):
            try:
                docs.create_document(
                    identifier=e['title'],
                    text='',
                    metadata={
                        'raw_text': e['revision']['text']['#text'],
                        'season': season_nr,
                        'source': opts['<wikia_zipped_xml_dump_file>']
Example #11
0
from docopt import docopt

from iepy.data.db import DocumentManager


if __name__ == "__main__":
    # Script entry point: load the CSV named on the command line, creating
    # one document per distinct "freebase_mid" value.
    logging.basicConfig(level=logging.INFO,
                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    opts = docopt(__doc__, version=0.1)

    path = opts["<filename>"]
    # Files whose name ends in ".gz" are read through gzip.
    opener = gzip.open if path.endswith(".gz") else open
    # The context manager closes the input file even if the import fails.
    with opener(path, "rt") as fin:
        reader = csv.DictReader(fin)
        name = os.path.basename(path)

        docdb = DocumentManager()

        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            # Only the first row of each identifier is imported.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})