pgpass_file=args.pgpass,
                          schema=schema,
                          role=args.role)

condition = SQL('')
if args.chunk_column:
    condition = SQL('where {}={}').format(Identifier(args.chunk_column),
                                          Literal(args.chunk_value))

with storage.conn.cursor() as c:
    c.execute(
        SQL('SELECT count({}) FROM {}.{}').format(Identifier(source_id),
                                                  Identifier(source_schema),
                                                  Identifier(source_table)))
    total = c.fetchone()[0]
    logger.debug('total number of rows in the source table: {}'.format(total))
    c.execute(
        SQL('SELECT count(DISTINCT {}) FROM {}.{}').format(
            Identifier(source_id), Identifier(source_schema),
            Identifier(source_table)))
    distinct = c.fetchone()[0]
    if total != distinct:
        logger.error(
            'values in the source table column {!r} are not unique, {} distinct values in total'
            .format(source_id, distinct))
        exit(1)

    if args.chunk_column is not None:
        c.execute(
            SQL('SELECT count({}) FROM {}.{} {}').format(
                Identifier(source_id), Identifier(source_schema),
                Identifier(source_table), condition))
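
The chunk filter in this snippet is built with psycopg2's SQL composition utilities instead of string interpolation. A minimal standalone sketch of the same pattern, using made-up table and column names:

# Minimal sketch of the psycopg2.sql composition pattern used above;
# 'id', 'public', 'texts' and 'chunk' are illustrative placeholders.
from psycopg2.sql import SQL, Identifier, Literal

condition = SQL('where {}={}').format(Identifier('chunk'), Literal(42))
query = SQL('SELECT count({}) FROM {}.{} {}').format(
    Identifier('id'), Identifier('public'), Identifier('texts'), condition)
# query.as_string(conn) would render the statement with quoted identifiers
# and a safely adapted literal:
#   SELECT count("id") FROM "public"."texts" where "chunk"=42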
Example #2
def process_files(rootdir, doc_iterator, collection, focus_input_files=None,\
                  encoding='utf-8', create_empty_docs=False, logger=None, \
                  tokenization=None, use_sentence_sep_newlines=False, \
                  orig_tokenization_layer_name_prefix='', \
                  splittype='no_splitting', metadata_extent='complete', \
                  insert_query_size = 5000000, \
                  skippable_documents=None ):
    """ Uses given doc_iterator (iter_packed_xml or iter_unpacked_xml) to
        extract texts from the files in the folder root_dir.
        Optionally, adds tokenization layers to created Text objects.
        Extracted Text objects will be stored in given PostgreSQL 
        collection.
    
        Parameters
        ----------
        rootdir: str
            The root directory which contains XML TEI files that 
            doc_iterator can extract;
        doc_iterator: iter_packed_xml or iter_unpacked_xml
            Iterator function that can extract Text objects from 
            (packed or unpacked) files in the root_dir;
        collection:  estnltk.storage.postgres.collection.PgCollection
            EstNLTK's PgCollection where extracted Texts should be 
            stored;
        focus_input_files: set of str
            Set of input XML files that should be exclusively
            processed from root_dir. If provided, then only files
            from the set will be processed, and all other files 
            will be skipped.
            If None, then all files returned by doc_iterator will
            be processed.
        encoding: str
            Encoding of the XML files. (default: 'utf-8')
        create_empty_docs: boolean
            If True, then documents are also created if there is no 
            textual content, but only metadata content.
            (default: False)
        logger: logging.Logger
            Logger used for debugging messages;
        tokenization: ['none', 'preserve', 'estnltk']
            specifies if tokenization will be added to Texts, and if 
            so, then how it will be added. 
            * 'none'     -- text will be created without any
                            tokenization layers;
            * 'preserve' -- original tokenization from XML files will 
                            be preserved in layers of the text; 
            * 'estnltk'  -- text's original tokenization will be 
                            overwritten by estnltk's tokenization;
        orig_tokenization_layer_name_prefix: str
            Prefix that will be added to names of layers of original 
            tokenization, if tokenization=='preserve'. 
            (Default: '')
        use_sentence_sep_newlines: boolean
            If set, then during the reconstruction of a text string, 
            sentences from the original XML mark-up will always be
            separated from each other by a newline, regardless of the
            tokenization option used.  As a result, sentence endings
            can also be detected in the reconstructed text string.
            Otherwise, a single space character will be used as a 
            sentence separator.
            (default: False)
        splittype: ['no_splitting', 'sentences', 'paragraphs']
            specifies if and how texts should be split before inserting
            into the database:
            * 'no_splitting' -- insert full texts, do not split;
            * 'sentences'    -- split into sentences (a Text object 
                                for each sentence), and insert 
                                sentences into database;
            * 'paragraphs'   -- split into paragraphs (a Text object 
                                for each paragraph), and insert 
                                paragraphs into database;
        metadata_extent: ['minimal', 'complete']
            specifies to what extent the created Text objects should
            be populated with metadata.
            (default: 'complete')
        insert_query_size: int (default: 5000000)
            maximum insert query size used during the database insert;
        skippable_documents: set of str (default: None)
            A set of XML document names corresponding to the documents 
            that have already been processed and inserted into the 
            database. All documents inside this set will be skipped.
            An XML document name is a string in the format:
                    XML_file_name + ':' + 
                    subdocument_number + ':' + 
                    paragraph_number + ':' + 
                    sentence_number
            Paragraph_number and sentence_number can be missing, if the
            database does not contain the corresponding fields.
            If skippable_documents is None or empty, all processed files 
            will be inserted into the database.
            Note: skippable_documents is a more fine-grained set than
            focus_input_files, and thus overrides the skipping directed
            by the latter set.
    """

    global special_tokens_tagger
    global special_compound_tokens_tagger
    global special_sentence_tokenizer
    assert doc_iterator in [iter_unpacked_xml, iter_packed_xml]
    assert tokenization in [None, 'none', 'preserve', 'estnltk']
    assert splittype in ['no_splitting', 'sentences', 'paragraphs']
    assert metadata_extent in ['minimal', 'complete']
    add_tokenization = False
    preserve_tokenization = False
    paragraph_separator = '\n\n'
    if skippable_documents is None:
        skippable_documents = set()
    if tokenization:
        if tokenization == 'none':
            tokenization = None
        if tokenization == 'preserve':
            add_tokenization = True
            preserve_tokenization = True
        elif tokenization == 'estnltk':
            add_tokenization = True
            preserve_tokenization = False
    sentence_separator = ' '
    if use_sentence_sep_newlines:
        sentence_separator = '\n'
    # Choose how the loaded document will be
    # split before the insertion
    split = to_text
    if splittype == 'no_splitting':
        split = partial(to_text,
                        layer_prefix=orig_tokenization_layer_name_prefix)
    elif splittype == 'sentences':
        split = partial(to_sentences,
                        layer_prefix=orig_tokenization_layer_name_prefix)
    elif splittype == 'paragraphs':
        split = partial(to_paragraphs,
                        layer_prefix=orig_tokenization_layer_name_prefix)
    last_xml_file = ''
    doc_nr = 1
    total_insertions = 0
    xml_files_processed = 0
    with collection.insert(
            query_length_limit=insert_query_size) as buffered_insert:
        for doc in doc_iterator(rootdir, focus_input_files=focus_input_files, encoding=encoding, \
                                create_empty_docs=create_empty_docs, \
                                orig_tokenization_layer_name_prefix=orig_tokenization_layer_name_prefix, \
                                add_tokenization=add_tokenization, preserve_tokenization=preserve_tokenization,\
                                sentence_separator=sentence_separator, paragraph_separator=paragraph_separator):
            # Get subcorpus name
            subcorpus = ''
            if '_xml_file' in doc.meta:
                subcorpus = get_text_subcorpus_name(None,
                                                    doc.meta['_xml_file'],
                                                    doc,
                                                    expand_names=False)
            # Reset the document counter if we have a new file coming up
            xml_file = doc.meta.get('_xml_file', '')
            if last_xml_file != xml_file:
                doc_nr = 1
            # Split the loaded document into smaller units if required
            for doc_fragment, para_nr, sent_nr in split(doc):
                meta = {}
                # Gather metadata
                # 1) minimal metadata:
                meta['file'] = xml_file
                doc_fragment.meta['file'] = meta['file']
                # Remove redundant attribute '_xml_file'
                if doc_fragment.meta.get('_xml_file', '') == meta['file']:
                    del doc_fragment.meta['_xml_file']
                doc_fragment.meta['subcorpus'] = subcorpus
                meta['subcorpus'] = subcorpus
                if para_nr is not None:
                    meta['document_nr'] = doc_nr
                    doc_fragment.meta['doc_nr'] = doc_nr
                    meta['paragraph_nr'] = para_nr
                    doc_fragment.meta['para_nr'] = para_nr
                if sent_nr is not None:
                    meta['sentence_nr'] = sent_nr
                    doc_fragment.meta['sent_nr'] = sent_nr
                # 2) complete metadata:
                if metadata_extent == 'complete':
                    for key, value in doc.meta.items():
                        doc_fragment.meta[key] = value
                    # Collect remaining metadata
                    for key in ['title', 'type']:
                        meta[key] = doc_fragment.meta.get(key, '')
                # Create an identifier of the insertable chunk:
                #  XML file + subdocument nr + paragraph nr + sentence nr
                file_chunk_lst = [meta['file']]
                file_chunk_lst.append(':')
                file_chunk_lst.append(str(doc_nr))
                if 'paragraph_nr' in meta:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(meta['paragraph_nr']))
                if 'sentence_nr' in meta:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(meta['sentence_nr']))
                file_chunk_str = ''.join(file_chunk_lst)
                # Finally, insert document (if not skippable)
                if file_chunk_str not in skippable_documents:
                    row_id = buffered_insert(text=doc_fragment, meta_data=meta)
                    total_insertions += 1
                if logger:
                    # Debugging stuff
                    # Listing of annotation layers added to Text
                    with_layers = list(doc_fragment.layers)
                    if with_layers:
                        with_layers = ' with layers ' + str(with_layers)
                    else:
                        with_layers = ''
                    if file_chunk_str not in skippable_documents:
                        logger.debug((' {} inserted as Text{}.').format(
                            file_chunk_str, with_layers))
                    else:
                        logger.debug((' {} skipped (already in the database).'
                                      ).format(file_chunk_str))
                    #logger.debug('  Metadata: {}'.format(doc_fragment.meta))
            doc_nr += 1
            if last_xml_file != xml_file:
                xml_files_processed += 1
            last_xml_file = xml_file
            #print('.', end = '')
            #sys.stdout.flush()
    if logger:
        logger.info(
            'Total {} XML files processed.'.format(xml_files_processed))
        logger.info(
            'Total {} estnltk texts inserted into the database.'.format(
                total_insertions))
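
A hedged usage sketch for process_files() above. The PostgresStorage / get_collection calls assume EstNLTK's postgres storage API; connection parameters, paths, the collection name and the logger setup are illustrative only.

import logging

# Hypothetical driver; everything below that is not defined in this listing
# (connection parameters, paths, collection name) is a placeholder.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('koondkorpus_import')

storage = PostgresStorage(dbname='corpus_db', user='corpus_user',
                          host='localhost', schema='public')
collection = storage.get_collection('koondkorpus_texts')
if not collection.exists():
    # In the real workflow the collection is created with meta columns
    # matching the meta dict built in process_files().
    collection.create()
process_files('corpora/koondkorpus_unpacked', iter_unpacked_xml, collection,
              tokenization='preserve', splittype='sentences',
              metadata_extent='complete', logger=logger)
storage.close()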
Example #3
def fetch_skippable_documents(storage, schema, collection, meta_fields,
                              logger):
    """ Fetches names of existing / skippable documents from the PostgreSQL storage.
        Returns a set of existing document names.
        A document name is represented as a string in the format:
               XML_file_name + ':' + 
               subdocument_number + ':' + 
               paragraph_number + ':' + 
               sentence_number
        Paragraph_number and sentence_number are skipped, if they are not in 
        meta_fields.
        
        Parameters
        ----------
        storage: PostgresStorage
            PostgresStorage to be queried for column names of the collection;
        schema: str
            Name of the schema;
        collection: str
            Name of the collection / db table;
        meta_fields: OrderedDict
            Current fields of the collection / database table. 
        logger: logging.Logger
            Logger used for debugging messages;
        
        Returns
        -------
        set of str
            Set of document names corresponding to documents already existing in 
            the collection;
    """
    # Filter fields: keep only fields that correspond to the fields of
    # the current table
    query_fields = ['file', 'id', 'document_nr', 'paragraph_nr', 'sentence_nr']
    query_fields = [
        f for f in query_fields if f == 'id' or f in meta_fields.keys()
    ]
    prev_fname = None
    fname_doc_nr = 1
    file_chunks_in_db = set()
    # Construct the query
    sql_str = ('SELECT ' + ','.join(query_fields) +
               ' FROM {}.{} ORDER BY ' + ','.join(query_fields))
    with storage.conn as conn:
        # Named cursors: http://initd.org/psycopg/docs/usage.html#server-side-cursors
        with conn.cursor('read_fname_chunks', withhold=True) as read_cursor:
            try:
                read_cursor.execute(
                    SQL(sql_str).format(Identifier(schema),
                                        Identifier(collection)))
            except Exception as e:
                logger.error(e)
                raise
            finally:
                logger.debug(read_cursor.query.decode())
            for items in read_cursor:
                fname = items[0]
                doc_id = items[1]
                if prev_fname and prev_fname != fname:
                    # Reset document number (in case of a new file)
                    fname_doc_nr = 1
                doc_nr = (items[2]
                          if 'document_nr' in query_fields else fname_doc_nr)
                paragraph_nr = (items[3]
                                if 'paragraph_nr' in query_fields else None)
                sentence_nr = (items[4]
                               if 'sentence_nr' in query_fields else None)
                # Reconstruct file name chunk
                file_chunk_lst = [fname]
                file_chunk_lst.append(':')
                file_chunk_lst.append(str(doc_nr))
                if paragraph_nr:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(paragraph_nr))
                if sentence_nr:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(sentence_nr))
                file_chunk_str = ''.join(file_chunk_lst)
                # Sanity check: file_chunk_str should be unique
                # if not, then we cannot expect skipping to be
                # consistent ...
                assert file_chunk_str not in file_chunks_in_db, \
                    ' (!) Document chunk {!r} appears more than once in database.'.format(file_chunk_str)
                file_chunks_in_db.add(file_chunk_str)
                prev_fname = fname
                fname_doc_nr += 1
    return file_chunks_in_db
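
A sketch of how the returned set typically feeds the skippable_documents parameter of process_files() above, making the import resumable. storage, collection and meta_fields are assumed to be set up as in the surrounding script; the schema and collection names are placeholders.

# Sketch: resume an interrupted import by skipping already-inserted chunks.
# 'storage', 'collection', 'meta_fields' and 'logger' come from the
# surrounding script; 'public' and 'koondkorpus_texts' are placeholders.
skippable = fetch_skippable_documents(storage, 'public', 'koondkorpus_texts',
                                      meta_fields, logger)
logger.info('{} document chunks already in the database'.format(len(skippable)))
process_files('corpora/koondkorpus_unpacked', iter_unpacked_xml, collection,
              tokenization='preserve', splittype='sentences',
              skippable_documents=skippable, logger=logger)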
Example #4
def process_files(in_file, collection, focus_doc_ids=None,\
                  encoding='utf-8', discard_empty_paragraphs=True, logger=None, \
                  tokenization=None, insert_query_size = 5000000, \
                  skippable_documents=None, doc_id_to_texttype=None ):
    """ Reads etTenTen 2013 corpus from in_file, extracts documents and 
        reconstructs corresponding Text objects, and stores the results 
        in given database collection.
        Optionally, adds tokenization layers to created Text objects.
    
        Parameters
        ----------
        in_file: str
           Full name of etTenTen corpus file (name with path);
        collection:  estnltk.storage.postgres.collection.PgCollection
            EstNLTK's PgCollection where extracted Texts should be 
            stored;
        focus_doc_ids: set of str
            Set of document id-s corresponding to the documents which 
            need to be extracted from the in_file.
            If provided, then only documents with given id-s will be 
            processed, and all other documents will be skipped.
            If None or empty, then all documents in the file will be 
            processed;
        encoding: str
            Encoding of in_file. Defaults to 'utf-8';
        discard_empty_paragraphs: boolean
            If set, then empty paragraphs will be discarded.
            (default: True)
        logger: logging.Logger
            Logger used for debugging messages;
        tokenization: ['none', 'preserve', 'estnltk']
            specifies if tokenization will be added to Texts, and if 
            so, then how it will be added. 
            * 'none'     -- text will be created without any
                            tokenization layers;
            * 'preserve' -- original tokenization from XML files will
                            be preserved in layers of the text. Note
                            that etTenTen only has original tokenization
                            for paragraphs, and thus Texts will only have
                            the original_paragraphs layer, nothing more.
            * 'estnltk'  -- text's original tokenization will be 
                            overwritten by estnltk's tokenization;
        insert_query_size: int (default: 5000000)
            maximum insert query size used during the database insert;
        skippable_documents: set of str (default: None)
            A set of web document ids corresponding to the documents 
            that have already been processed and inserted into the 
            database. All documents inside this set will be skipped.
            A web document id is a string in the format:
               original_doc_id + ':' + 
               subdocument_number + ':' + 
               paragraph_number + ':' + 
               sentence_number
            Subdocument_number, paragraph_number and sentence_number are 
            skipped, if the database does not contain the corresponding 
            fields.
            If skippable_documents is None or empty, all processed files 
            will be inserted into the database.
            Note: skippable_documents is a more fine-grained set than
            focus_doc_ids, and thus overrides the skipping directed
            by the latter set.
        doc_id_to_texttype: dict (default: None)
            A mapping from document ids (strings) to their texttypes.
            Should cover all documents listed in focus_doc_ids, or
            if focus_doc_ids==None, all documents in in_file;
    """
    assert tokenization in [None, 'none', 'preserve', 'estnltk']
    add_tokenization = False
    preserve_tokenization = False
    if skippable_documents is None:
        skippable_documents = set()
    if tokenization:
        if tokenization == 'none':
            add_tokenization = False
            preserve_tokenization = False
        if tokenization == 'preserve':
            add_tokenization = False
            preserve_tokenization = True
        elif tokenization == 'estnltk':
            add_tokenization = True
            preserve_tokenization = False

    doc_nr = 1
    last_original_doc_id = None
    total_insertions = 0
    docs_processed = 0
    with collection.insert(
            query_length_limit=insert_query_size) as buffered_insert:
        for web_doc in parse_ettenten_corpus_file_iterator( in_file, encoding=encoding, \
                                              focus_doc_ids=focus_doc_ids, \
                                              discard_empty_paragraphs=discard_empty_paragraphs, \
                                              add_tokenization=add_tokenization, \
                                              store_paragraph_attributes=True, \
                                              paragraph_separator='\n\n' ):
            # Rename id to original_doc_id (to avoid confusion with DB id-s)
            original_doc_id = web_doc.meta.get('id')
            web_doc.meta['original_doc_id'] = original_doc_id
            del web_doc.meta['id']

            # Reset subdocument counter (if required)
            if last_original_doc_id != original_doc_id:
                doc_nr = 1

            # Delete original_paragraphs layer (if tokenization == None)
            if not add_tokenization and not preserve_tokenization:
                delattr(web_doc,
                        'original_paragraphs')  # Remove layer from the text

            # Add texttype (if mapping is available)
            if doc_id_to_texttype and original_doc_id in doc_id_to_texttype:
                web_doc.meta['texttype'] = doc_id_to_texttype[original_doc_id]

            # Gather metadata
            meta = {}
            for key, value in web_doc.meta.items():
                meta[key] = value

            # Create an identifier of the insertable chunk:
            #  original_doc_id + ':' + subdocument_number (+ ':' + paragraph_number + ':' + sentence_number)
            file_chunk_lst = [web_doc.meta['original_doc_id']]
            file_chunk_lst.append(':')
            file_chunk_lst.append(str(doc_nr))
            file_chunk_str = ''.join(file_chunk_lst)

            # Finally, insert document (if not skippable)
            if file_chunk_str not in skippable_documents:
                row_id = buffered_insert(text=web_doc, meta_data=meta)
                total_insertions += 1
            if logger:
                # Debugging stuff
                # Listing of annotation layers added to Text
                with_layers = list(web_doc.layers)
                if with_layers:
                    with_layers = ' with layers ' + str(with_layers)
                else:
                    with_layers = ''
                if file_chunk_str not in skippable_documents:
                    logger.debug((' {}:{} inserted as Text{}.').format(
                        meta['web_domain'], file_chunk_str, with_layers))
                else:
                    logger.debug(
                        (' {}:{} skipped (already in the database).').format(
                            meta['web_domain'], file_chunk_str))
            doc_nr += 1
            last_original_doc_id = original_doc_id
            docs_processed += 1
            #print('.', end = '')
            #sys.stdout.flush()
    if logger:
        logger.info(
            'Total {} input documents processed.'.format(docs_processed))
        logger.info(
            'Total {} estnltk texts inserted into the database.'.format(
                total_insertions))
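
For reference, a small sketch of the chunk identifiers that both process_files() variants build and test against skippable_documents. The helper and the concrete values are purely illustrative; the real scripts assemble the strings inline, as shown above.

# Hypothetical helper mirroring how the chunk identifiers are assembled;
# the example values below are made up.
def make_chunk_id(doc_name, doc_nr, para_nr=None, sent_nr=None):
    parts = [doc_name, str(doc_nr)]
    if para_nr is not None:
        parts.append(str(para_nr))
    if sent_nr is not None:
        parts.append(str(sent_nr))
    return ':'.join(parts)

# Koondkorpus XML split into sentences vs. an etTenTen document without splitting:
assert make_chunk_id('aja_epl_1998.xml', 3, 2, 5) == 'aja_epl_1998.xml:3:2:5'
assert make_chunk_id('782133', 1) == '782133:1'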