예제 #1
0
# -*- coding: utf-8 -*-

from SeaCOW import Query, ConcordanceLoader
import json  # Just for pretty-printing.

# Build the query. See sample.py for annotations of these attributes.
q = Query()
q.corpus = 'decow16a-nano'
q.string = '[word="Gartenzwerg"]'
q.max_hits = 10
q.attributes = ['word', 'tag']
q.structures = ['s']
q.references = [
    'doc.url', 'doc.bdc', 'doc.tld', 'doc.id', 'div.bpc', 's.idx', 's.type'
]
q.container = 's'
q.set_deduplication()

# The concordance loader has just one settable attribute.
p = ConcordanceLoader()
# Convert token attributes to dicts as well; otherwise they stay |-separated.
p.full_structure = True
q.processor = p
q.run()

# Now you have a nice structured Python object in p.concordance.

# The json library just provides a convenient way of displaying the
# resulting structures; only the first two entries are shown here.
# print() is called with a single parenthesised argument so that this line
# is valid under both Python 2 and Python 3.
print(json.dumps(p.concordance[0:2], sort_keys=False, indent=2))
예제 #2
0
# -*- coding: utf-8 -*-

from SeaCOW import Query, ConcordanceWriter, DependencyBuilder

# Create a Query object; each setting is documented on the line above it.
q = Query()

# Lower-case name of the corpus to use.
q.corpus = 'decow16a-nano'

# A normal CQL string as used in NoSketchEngine.
q.string = '[word="Motorsäge"]'

# Maximal number of hits to return. Use when testing queries!
q.max_hits = 10

# Structure markup to export from the corpus.
q.structures = ['s']

# Which reference attributes (of structures) to export.
q.references = [
    'doc.url',
    'doc.bdc',
    'doc.tld',
    'doc.id',
    'div.bpc',
    's.idx',
    's.type',
]

# Which container structure should be exported?
q.container = 's'

# Use deduplication.
q.set_deduplication()

# Token attributes to export. The dependency attributes are needed for
# DependencyBuilder() to work.
q.attributes = [
    'word',
    'depind',
    'dephd',
    'deprel',
    'tag',
    'lemma',
]

# The dependency builder reconstructs (and outputs) dependency trees.
# To filter structures, create a class which inherits from
# DependencyBuilder and override the filtre() method (NO TYPO there).
p = DependencyBuilder()

# The following five are 0-based indices into q.attributes as defined above.
# Column holding the token itself.
p.column_token = 0
# Column holding the dependency index.
p.column_index = 1
# Column holding the dependency head index.
p.column_head = 2
# Column holding the dependency relation.
p.column_relation = 3
p.attribs = [4,
예제 #3
0
# -*- coding: utf-8 -*-

from SeaCOW import Query, Nonprocessor

# Create a Query object; each setting is documented on the line above it.
q = Query()

# Lower-case name of the corpus to use.
q.corpus = 'decow16a-nano'

# A normal CQL string as used in NoSketchEngine.
q.string = '[word="Gartenzwerg"]'

# Maximal number of hits to return. Ignored for Nonprocessor.
q.max_hits = -1

# For counting, no word attributes, structural attributes or reference
# attributes are needed, so all three export lists stay empty.
q.attributes = []
q.structures = []
q.references = []

# Which container structure should be used?
q.container = 's'

# Using the deduplicator would NOT change the outcome, so switch it off.
q.set_deduplication(off=True)

# Create a Processor object of the appropriate type and attach it to the
# query. The Nonprocessor processor does nothing: you can work with the
# results yourself in the finalise method, or just read the hits value from
# the query object (the concordance size as reported by Manatee).
p = Nonprocessor()
q.processor = p

# Run the query.
q.run()

# Report what was asked, where, and how many hits came back.
print('Query was: %s' % q.string)
print('Corpus used: %s' % q.corpus)
print('Query returned %d hits.' % q.hits)
예제 #4
0
# -*- coding: utf-8 -*-

# This dumps a very raw concordance format.
# It's very efficient, though.
# See sample.py for options.

from SeaCOW import Query, ConcordanceDumper

# Configure the query against the English nano corpus.
q = Query()
q.corpus = 'encow16a-nano'

# CQL: a token whose tag matches "N." directly followed by the word
# "attention".
q.string = '[tag="N."][word="attention"]'

# NOTE(review): -1 presumably means "no limit on hits" -- confirm in the
# library documentation.
q.max_hits = -1

# Export only the word form, with sentence markup and a few references.
q.attributes = ['word']
q.structures = ['s']
q.references = [
    'doc.url',
    'doc.id',
    's.idx',
]
q.container = 's'
q.set_deduplication()

# Attach a ConcordanceDumper as the processor and run the query.
# NOTE(review): 'dump.txt' is presumably the path the dump is written to --
# confirm against the ConcordanceDumper implementation.
p = ConcordanceDumper()
p.filename = 'dump.txt'
q.processor = p
q.run()