Esempio n. 1
0
# -*- coding: utf-8 -*-

from SeaCOW import Query, ConcordanceLoader
import json  # Just for pretty-printing.

# See sample.py for annotations of these attributes.
# Build the query: at most 10 hits for the word form "Gartenzwerg", with
# 's' as the container structure and deduplication switched on.
q = Query()
q.corpus = 'decow16a-nano'
q.string = '[word="Gartenzwerg"]'
q.max_hits = 10
q.attributes = ['word', 'tag']
q.structures = ['s']
q.references = [
    'doc.url', 'doc.bdc', 'doc.tld', 'doc.id', 'div.bpc', 's.idx', 's.type'
]
q.container = 's'
q.set_deduplication()

# The concordance loader has just one settable attribute.
p = ConcordanceLoader()
p.full_structure = True  # Convert token attributes to dicts as well, otherwise |-separated.
q.processor = p
q.run()

# Now you have a nice structured Python object in p.concordance.

# The json library just provides a convenient way of displaying the
# resulting structures. print is called as a function with a single
# argument so that this line is valid under both Python 2 and Python 3.
print(json.dumps(p.concordance[0:2], sort_keys=False, indent=2))
Esempio n. 2
0
# -*- coding: utf-8 -*-

# This dumps a very raw concordance format.
# It's very efficient, though.
# See sample.py for options.

from SeaCOW import Query, ConcordanceDumper

# Configure the query: at most 10 hits for the word form "Holzweg", with
# 's' as the container structure and deduplication switched on.
query = Query()
query.corpus = 'decow16b'
query.string = '[word="Holzweg"]'
query.max_hits = 10
query.attributes = ['word']
query.structures = ['s']
query.references = ['doc.url', 'doc.id', 's.idx']
query.container = 's'
query.set_deduplication()

# Attach a ConcordanceDumper pointed at the output file, then run the query.
dumper = ConcordanceDumper()
dumper.filename = 'output/holzweg.txt'
query.processor = dumper
query.run()
Esempio n. 3
0
# -*- coding: utf-8 -*-

from SeaCOW import Query, Nonprocessor

# Create a Query object and set whatever needs to be set.
# Create a Query object and set whatever needs to be set.
query = Query()
query.corpus = 'decow16a-nano'  # Lower-case name of the corpus to use.
query.string = '[word="Gartenzwerg"]'  # A normal CQL string as used in NoSketchEngine.
query.max_hits = -1  # Maximal number of hits to return. Ignored for Nonprocessor.
# For counting, no word attributes, structural attributes or reference
# attributes are needed, so all three lists stay empty.
query.attributes = []
query.structures = []
query.references = []
query.container = 's'  # Which container structure should be used?

# Using the deduplicator would NOT change the outcome. Switch off.
query.set_deduplication(off=True)

# Create a Processor object and attach it to the Query object.
# The Nonprocessor processor does nothing. You can work with the results
# yourself in the finalise method or just get the hits value from the
# query object. It is the concordance as reported by Manatee.
counter = Nonprocessor()  # Create a processor object of appropriate type.
query.processor = counter  # Attach the processor to the query.
query.run()  # Run the query.

# Report what was searched, where, and how many hits came back.
print('Query was: %s' % query.string)
print('Corpus used: %s' % query.corpus)
print('Query returned %d hits.' % query.hits)
Esempio n. 4
0
# -*- coding: utf-8 -*-

import random
from SeaCOW import Query, ConcordanceWriter, DependencyBuilder

# Seed Python's random number generator before the query is built.
# NOTE(review): presumably this makes the random_subset selection below
# reproducible -- confirm that SeaCOW samples via the `random` module.
random.seed(2914)

# Configure the query: match <doc> structures by their id attribute, use
# 'doc' as the container, and request a random subset (value 0.09).
query = Query()
query.corpus = 'precox20lda25'
query.string = '<doc id="[0-9a-f].+">'
query.random_subset = 0.09
query.attributes = ['word']
# NOTE(review): 'doc.pregbrob' may be a typo -- verify the name against the
# attributes actually defined in the corpus.
query.structures = [
    's.idx', 'div.bpc', 'doc.bdc', 'doc.url', 'doc.id', 'doc.pregister',
    'doc.pregbrob',
]
query.references = ['doc.url', 'doc.id']
query.container = 'doc'

# Attach a ConcordanceWriter pointed at 'sample.csv', then run the query.
writer = ConcordanceWriter()
writer.filename = 'sample.csv'
query.processor = writer
query.run()