Esempio n. 1
0
File: utils.py Progetto: lowks/iepy
def csv_to_iepy(filepath):
    """Import the documents listed in a CSV file into the IEPY document store.

    The file is read as text, through ``gzip`` when its name ends in
    ``.gz``. Its header must contain at least the columns ``document_id``
    and ``document_text``. Every distinct ``document_id`` is passed once to
    ``DocumentManager.create_document``; later rows repeating an id are
    skipped.

    Args:
        filepath: Path of the (optionally gzip-compressed) CSV file.

    Raises:
        SystemExit: If the header lacks one of the expected columns, which
            includes the case of a completely empty file.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))

    expected_fnames = ['document_id', 'document_text']
    name = os.path.basename(filepath)
    opener = gzip.open if filepath.endswith(".gz") else open

    # The context manager closes the input on every exit path, including
    # the sys.exit() below and any error raised while creating documents.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        # DictReader.fieldnames is None when the file has no header row.
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        # Imported only after the header has been validated, so a malformed
        # CSV is rejected without loading the document store.
        from iepy.data.db import DocumentManager

        docdb = DocumentManager()
        seen = set()
        for d in reader:
            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=d["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
            # len(seen) counts the documents actually sent to the store;
            # skipped duplicate rows are not included.
            print('Added {} documents'.format(len(seen)))
Esempio n. 2
0
File: utils.py Progetto: 52nlp/iepy
def csv_to_iepy(filepath):
    """Import the documents listed in a CSV file into the IEPY document store.

    The file is read as text, through ``gzip`` when its name ends in
    ``.gz``. Its header must contain at least the columns ``document_id``
    and ``document_text``. Rows the ``csv`` module cannot parse are logged
    and skipped. Every distinct ``document_id`` is passed once to
    ``DocumentManager.create_document``; later rows repeating an id are
    skipped.

    Args:
        filepath: Path of the (optionally gzip-compressed) CSV file.

    Raises:
        SystemExit: If the header lacks one of the expected columns, which
            includes the case of a completely empty file.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))

    expected_fnames = ['document_id', 'document_text']
    name = os.path.basename(filepath)
    opener = gzip.open if filepath.endswith(".gz") else open

    # The context manager closes the input on every exit path, including
    # the sys.exit() below and any error raised while creating documents.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        # DictReader.fieldnames is None when the file has no header row.
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        # Imported only after the header has been validated, so a malformed
        # CSV is rejected without loading the document store.
        from iepy.data.db import DocumentManager

        docdb = DocumentManager()
        seen = set()

        # An explicit next() loop (rather than ``for``) lets a csv.Error on
        # one row be logged and skipped without ending the iteration.
        while True:
            try:
                d = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning("Couldn't load document: {}".format(error))
                continue

            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=d["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
            # len(seen) counts the documents actually sent to the store;
            # skipped duplicate rows are not included.
            print('Added {} documents'.format(len(seen)))
Esempio n. 3
0
def csv_to_iepy(filepath):
    """Load documents from a CSV file into the IEPY document store.

    Input is opened in text mode, via ``gzip`` if the name ends in ``.gz``.
    The header has to provide the ``document_id`` and ``document_text``
    columns. A row that makes the ``csv`` module raise is logged and
    skipped. Each distinct ``document_id`` results in one call to
    ``DocumentManager.create_document``; repeated ids are ignored.

    Args:
        filepath: Path of the (optionally gzip-compressed) CSV file.

    Raises:
        SystemExit: If an expected column is missing from the header; an
            empty file is reported the same way.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))

    expected_fnames = ['document_id', 'document_text']
    name = os.path.basename(filepath)
    opener = gzip.open if filepath.endswith(".gz") else open

    # ``with`` guarantees the input is closed on normal completion, on
    # sys.exit() and on any exception raised while creating documents.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        # An empty file leaves DictReader.fieldnames as None.
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        # Deferred until the header is known to be valid, so a malformed
        # CSV is rejected without loading the document store.
        from iepy.data.db import DocumentManager

        docdb = DocumentManager()
        seen = set()

        # Rows are pulled with next() so that a csv.Error on a single row
        # can be logged and skipped while the import carries on.
        while True:
            try:
                d = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning("Couldn't load document: {}".format(error))
                continue

            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(identifier=doc_id,
                                  text=d["document_text"],
                                  metadata={"input_filename": name},
                                  update_mode=True)
            # Report the number of documents sent to the store, not the
            # number of rows read (duplicates are skipped above).
            print('Added {} documents'.format(len(seen)))
Esempio n. 4
0
def csv_to_iepy(filepath):
    """Import the rows of a CSV file into the IEPY document store.

    The file is read as text, through ``gzip`` when its name ends in
    ``.gz``. Each row must provide the columns ``freebase_mid`` (used as
    the document identifier) and ``description`` (used as the document
    text). Every distinct ``freebase_mid`` is passed once to
    ``DocumentManager.create_document``; later rows repeating it are
    skipped.

    Args:
        filepath: Path of the (optionally gzip-compressed) CSV file.

    Raises:
        KeyError: If a row lacks the ``freebase_mid`` or ``description``
            column.
    """
    logger.info('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    opener = gzip.open if filepath.endswith(".gz") else open
    name = os.path.basename(filepath)

    # The context manager closes the input even if a row fails to import.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        docdb = DocumentManager()
        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})
            # len(seen) counts the documents actually sent to the store;
            # skipped duplicate rows are not included.
            logger.info('Added {} documents'.format(len(seen)))
Esempio n. 5
0
from docopt import docopt

from iepy.data.db import DocumentManager

if __name__ == "__main__":
    # Script entry point: read the CSV named on the command line and pass
    # each distinct ``freebase_mid`` row to DocumentManager.create_document.
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    opts = docopt(__doc__, version=0.1)

    path = opts["<filename>"]
    # Files ending in ".gz" are read through gzip; both are read as text.
    opener = gzip.open if path.endswith(".gz") else open

    # The context manager closes the input even if a row fails to import.
    with opener(path, "rt") as fin:
        reader = csv.DictReader(fin)
        # Only the base name is recorded in each document's metadata.
        name = os.path.basename(path)

        docdb = DocumentManager()

        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            # Later rows repeating an already imported id are skipped.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})
Esempio n. 6
0
        per_season.append(season_ep)
    return per_season

if __name__ == '__main__':
    # Script entry point: build the page dictionary from the dump named on
    # the command line, select the episodes per season, and create one
    # document per episode. Failures are logged and counted per season.
    logging.basicConfig()
    logger = logging.getLogger('wikia_to_iepy')
    logger.setLevel(logging.DEBUG)
    opts = docopt(__doc__, version=0.1)
    docs = DocumentManager()
    dump_file = opts['<wikia_zipped_xml_dump_file>']
    pages_dict = build_pages_dict(dump_file)
    eps = get_episode(
        pages_dict,
        int(opts['<nr_of_seasons>']),
        opts['--all-episodes-tag'],
        opts['--season-tag-pattern'],
    )
    # Seasons are numbered starting at 1.
    for season_nr, season in enumerate(eps, 1):
        failed = 0
        for episode in season:
            # The field lookups stay inside the try so that an episode
            # missing a key is logged and counted like any other failure.
            try:
                title = episode['title']
                raw_text = episode['revision']['text']['#text']
                docs.create_document(
                    identifier=title,
                    text='',
                    metadata={
                        'raw_text': raw_text,
                        'season': season_nr,
                        'source': dump_file,
                    },
                )
            except Exception as err:
                failed += 1
                logger.error('Document not created, %s', err)
        created = len(season) - failed
        logger.info('Dumped %i episodes from season %i', created, season_nr)
Esempio n. 7
0
from docopt import docopt

from iepy.data.db import DocumentManager


if __name__ == "__main__":
    # Script entry point: read the CSV named on the command line and pass
    # each distinct ``freebase_mid`` row to DocumentManager.create_document.
    logging.basicConfig(level=logging.INFO,
                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    opts = docopt(__doc__, version=0.1)

    path = opts["<filename>"]
    # Files ending in ".gz" are read through gzip; both are read as text.
    opener = gzip.open if path.endswith(".gz") else open

    # The context manager closes the input even if a row fails to import.
    with opener(path, "rt") as fin:
        reader = csv.DictReader(fin)
        # Only the base name is recorded in each document's metadata.
        name = os.path.basename(path)

        docdb = DocumentManager()

        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            # Later rows repeating an already imported id are skipped.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})