# -*- coding: utf-8 -*-

from SeaCOW import Query, Nonprocessor

# Build the query; its options are assigned as attributes on the object.
query = Query()
query.corpus = 'decow16a-nano'         # Lower-case name of the corpus to use.
query.string = '[word="Gartenzwerg"]'  # A normal CQL string as used in NoSketchEngine.
query.max_hits = -1                    # Maximal number of hits; ignored for Nonprocessor.

# Counting needs no exported annotation at all.
query.attributes = []                  # No word attributes.
query.structures = []                  # No structural attributes.
query.references = []                  # No reference attributes.
query.container = 's'                  # Container structure to use.

# Using the deduplicator would NOT change the outcome, so switch it off.
query.set_deduplication(off=True)

# A processor has to be attached to the query. Nonprocessor does nothing:
# you can work with the results yourself in the finalise method, or just
# read the hits value from the query object, which is the concordance as
# reported by Manatee.
processor = Nonprocessor()
query.processor = processor
query.run()

print('Query was: %s' % query.string)
print('Corpus used: %s' % query.corpus)
print('Query returned %d hits.' % query.hits)
# -*- coding: utf-8 -*-

import json  # Just for pretty-printing.

from SeaCOW import Query, ConcordanceLoader

# See sample.py for annotations of these attributes.
q = Query()
q.corpus = 'decow16a-nano'
q.string = '[word="Gartenzwerg"]'
q.max_hits = 10
q.attributes = ['word', 'tag']
q.structures = ['s']
q.references = [
    'doc.url',
    'doc.bdc',
    'doc.tld',
    'doc.id',
    'div.bpc',
    's.idx',
    's.type',
]
q.container = 's'
q.set_deduplication()

# The concordance loader has just one settable attribute.
p = ConcordanceLoader()
p.full_structure = True  # Convert token attributes to dicts as well, otherwise |-separated.
q.processor = p
q.run()

# Now there is a structured Python object in p.concordance. The json
# library just provides a convenient way of displaying the first two
# entries of it.
# print is called with one parenthesised argument so that this line is
# valid on Python 3 and still prints the same text on Python 2 (the
# original used the Python 2-only print statement).
print(json.dumps(p.concordance[0:2], sort_keys=False, indent=2))
# -*- coding: utf-8 -*- from SeaCOW import Query, ConcordanceWriter, DependencyBuilder # Create a Query object and set whatever needs to be set. q = Query() q.corpus = 'decow16a-nano' # Lower-case name of the corpusto use. q.string = '[word="Motorsäge"]' # A normal CQL string as used in NoSketchEngine. q.max_hits = 10 # Maximal number of hits to return. Use when testing queries! q.structures = ['s'] # Structure markup to export from corpus. q.references = [ 'doc.url', 'doc.bdc', 'doc.tld', 'doc.id', 'div.bpc', 's.idx', 's.type' ] # Which reference attributes (of structures) to export. q.container = 's' # Which container strutcure should be exported? q.set_deduplication() # Use deduplication. # The dependency attributes are needed for DependencyBuilder() to work. q.attributes = ['word', 'depind', 'dephd', 'deprel', 'tag', 'lemma'] # The dependency builder reconstructs (and outputs) dependency trees. # If you want to filter structures, create a class which inherits from # DependencyBuilder and override the filtre() method (NO TYPO there). p = DependencyBuilder() # The following five are 0-based indeces into q.attributes as defined above. p.column_token = 0 # Which column contains the token? p.column_index = 1 # Which column contains the dependency index? p.column_head = 2 # Which column contains the dependency head index? p.column_relation = 3 # Which column contains the dependency relation? p.attribs = [4,
# -*- coding: utf-8 -*- from SeaCOW import Query, ConcordanceWriter, DependencyBuilder from iterated import IterativelyFiltredDependencyBuilder # FIRST QUERY. # Create a Query object and set whatever needs to be set. q = Query() q.corpus = 'encow16a-nano' q.string = '[word="give" & tag="VB[ZPD]"]' q.max_hits = 100 q.attributes = ['word', 'depind', 'dephd', 'deprel', 'tag', 'lemma'] q.structures = ['s'] q.references = [ 'doc.url', 'doc.bdc', 'doc.tld', 'doc.id', 'div.bpc', 's.idx', 's.type' ] q.container = 's' # This enables an efficient duplicate remover using a scaling Bloom filter. q.set_deduplication() p = IterativelyFiltredDependencyBuilder() p.column_token = 0 p.column_index = 1 p.column_head = 2 p.column_relation = 3 p.attribs = [4, 5] p.fileprefix = 'give_iterated' p.savejson = True p.saveimage = 'png'
# -*- coding: utf-8 -*-

# Dumps a concordance in a very raw format.
# It's very efficient, though.
# See sample.py for options.

from SeaCOW import Query, ConcordanceDumper

# Query setup.
query = Query()
query.corpus = 'decow16b'
query.string = '[word="Holzweg"]'
query.max_hits = 10
query.attributes = ['word']
query.structures = ['s']
query.references = [
    'doc.url',
    'doc.id',
    's.idx',
]
query.container = 's'
query.set_deduplication()

# Attach a dumper pointed at the output file, then run the query.
dumper = ConcordanceDumper()
dumper.filename = 'output/holzweg.txt'
query.processor = dumper
query.run()
# -*- coding: utf-8 -*-

import random

from SeaCOW import Query, ConcordanceWriter, DependencyBuilder

# Seed Python's random module with a fixed value.
# NOTE(review): presumably this makes the random subset below reproducible;
# confirm that SeaCOW draws from the random module.
random.seed(2914)

# Query setup: match document start tags and keep a random subset of hits.
query = Query()
query.corpus = 'precox20lda25'
query.string = '<doc id="[0-9a-f].+">'
query.random_subset = 0.09
query.attributes = ['word']
query.structures = [
    's.idx',
    'div.bpc',
    'doc.bdc',
    'doc.url',
    'doc.id',
    'doc.pregister',
    'doc.pregbrob',
]
query.references = [
    'doc.url',
    'doc.id',
]
query.container = 'doc'

# Attach a concordance writer pointed at the output file, then run the query.
writer = ConcordanceWriter()
writer.filename = 'sample.csv'
query.processor = writer
query.run()
# -*- coding: utf-8 -*-

# Dumps a concordance in a very raw format.
# It's very efficient, though.
# See sample.py for options.

from SeaCOW import Query, ConcordanceDumper

# Query setup.
query = Query()
query.corpus = 'encow16a-nano'
query.string = '[tag="N."][word="attention"]'
query.max_hits = -1
query.attributes = ['word']
query.structures = ['s']
query.references = [
    'doc.url',
    'doc.id',
    's.idx',
]
query.container = 's'
query.set_deduplication()

# Attach a dumper pointed at the output file, then run the query.
dumper = ConcordanceDumper()
dumper.filename = 'dump.txt'
query.processor = dumper
query.run()