Esempio n. 1
0
def tag_collection(tagger, args):
    """Create a layer in a Postgres collection by running ``tagger`` over it.

    Opens a ``PostgresStorage`` from the connection settings in ``args``,
    fetches the collection named ``args.collection`` and calls its
    ``create_layer`` method with the given tagger. Once the storage has been
    opened it is always closed again, whether the function returns normally
    or an error occurs.

    Args:
        tagger: Tagger object passed straight through to
            ``collection.create_layer``.
        args: Parsed command-line namespace. The attributes read are
            ``logging``, ``dbname``, ``user``, ``host``, ``port``, ``pgpass``,
            ``schema``, ``role``, ``collection``, ``mode`` and
            ``progressbar``.

    Raises:
        SystemExit: With status 1 if ``create_layer`` raises an
            ``Exception``; the error is logged first.
    """
    logger.setLevel(args.logging)
    logger.info('start script')

    storage = PostgresStorage(dbname=args.dbname,
                              user=args.user,
                              host=args.host,
                              port=args.port,
                              pgpass_file=args.pgpass,
                              schema=args.schema,
                              role=args.role)
    try:
        # Fetching the collection happens inside the try/finally so that the
        # storage is closed even when the lookup itself fails.
        collection = storage.get_collection(args.collection)

        # Any mode other than 'overwrite' leaves overwriting disabled.
        overwrite = (args.mode == 'overwrite')

        try:
            collection.create_layer(tagger=tagger,
                                    overwrite=overwrite,
                                    progressbar=args.progressbar)
        except Exception as e:
            logger.error(e)
            # Raise SystemExit directly instead of calling the `exit` builtin:
            # `exit` is injected by the `site` module and is not available
            # under `python -S` or in frozen/embedded interpreters.
            raise SystemExit(1)
    finally:
        storage.close()

    logger.info('end script')
                             "and then all documents will be processed as whole. "+
                             "(default: False)", \
                        )
    # Optional document-subset selector: process only a random sample of the
    # given size instead of the whole corpus (semantics as stated in the help
    # text below).
    parser.add_argument('-r', '--rand_pick', dest='rand_pick', action='store', type=int, \
                        help="integer value specifying the amount of documents to be randomly chosen for "+\
                             "difference evaluation. if specified, then the given amount of documents will be "+\
                             "processed (instead of processing the whole corpus). if the amount exceeds the "+\
                             "corpus size, then the whole corpus is processed. (default: None)" )
    # Optional document-subset selector: process only the documents whose ids
    # are listed in the named file, one id per line (semantics as stated in
    # the help text below).
    parser.add_argument('-f', '--file_pick', dest='file_pick', action='store', type=str, \
                        help="name of the file containing indexes of the documents that need to be processed "+\
                             "in the difference evaluation. if specified, then only documents listed in the "+\
                             "file will be processed (instead of processing the whole corpus). note: each "+\
                             "document id must be on a separate line in the index file. (default: None)" )
    # Parse the command line into a namespace.
    args = parser.parse_args()

    # The level name from the command line is upper-cased before it is handed
    # to the logger.
    logger.setLevel( (args.logging).upper() )
    # Short local alias for the shared logger.
    log = logger
    
    # Chunking of large texts is on unless the no_chunking option was set.
    chunk_large_texts = not args.no_chunking
    if not chunk_large_texts:
        log.info(' Chunking of large documents disabled.' )
    
    # Only the pgpass file, schema and role are passed explicitly here.
    # NOTE(review): presumably the remaining connection settings come from the
    # pgpass file or PostgresStorage defaults -- confirm.
    storage = PostgresStorage(pgpass_file=args.pgpass,
                              schema=args.schema,
                              role=args.role)
    try:

        # Check layer names
        if args.morph_layer == args.new_morph_layer:
            log.error("(!) Invalid layer names: morph_layer cannot be identical to new_morph_layer: {!r}".format(args.morph_layer))
            exit(1)
    nargs='?',
    help='only rows with this value in the chunk column are selected')

args = parser.parse_args()

from collections import OrderedDict
from psycopg2.sql import SQL, Identifier, Literal
import tqdm
from estnltk import Text
from estnltk import logger
from estnltk.converters import dict_to_text
from estnltk.storage.postgres import PostgresStorage
from estnltk.layer_operations import split_by
from estnltk.storage.postgres import table_exists

logger.setLevel(args.logging)

logger.info('start script')

# Copy the parsed command-line settings into module-level names.
schema = args.schema
source_schema = args.source_schema
source_table = args.source_table
source_id = args.source_id
source_text_column = args.source_text
# Strip surrounding whitespace from every column name; when the option is
# missing or empty the result is an empty list.
source_columns = [c.strip() for c in args.source_columns or []]
source_data = args.source_data

# chunk_column and chunk_value must be given together or not at all. This is
# validation of user input, so it is checked explicitly rather than with
# `assert`, which is removed when Python runs with -O.
if (args.chunk_column is None) != (args.chunk_value is None):
    parser.error('chunk_column and chunk_value must be given together, got: '
                 '{!r}'.format((args.chunk_column, args.chunk_value)))

collection_columns = [